001 package org.nlp2rdf.implementation.opennlp;
002
003 import com.hp.hpl.jena.ontology.OntClass;
004 import com.hp.hpl.jena.ontology.OntModel;
005 import com.hp.hpl.jena.ontology.OntModelSpec;
006 import com.hp.hpl.jena.rdf.model.ModelFactory;
007 import com.hp.hpl.jena.util.iterator.ExtendedIterator;
008 import eu.lod2.nlp2rdf.schema.Thing;
009 import eu.lod2.nlp2rdf.schema.sso.Word;
010 import eu.lod2.nlp2rdf.schema.str.Document;
011 import eu.lod2.nlp2rdf.schema.tools.Factory;
012 import opennlp.tools.postag.POSModel;
013 import opennlp.tools.postag.POSTaggerME;
014 import opennlp.tools.sentdetect.SentenceDetectorME;
015 import opennlp.tools.sentdetect.SentenceModel;
016 import opennlp.tools.tokenize.TokenizerME;
017 import opennlp.tools.tokenize.TokenizerModel;
018 import opennlp.tools.util.Span;
019 import org.nlp2rdf.core.Text2RDF;
020 import org.nlp2rdf.core.URIGenerator;
021 import org.nlp2rdf.core.util.URIGeneratorHelper;
022 import org.nlp2rdf.ontology.olia.OLiAManager;
023 import org.nlp2rdf.ontology.olia.OLiAOntology;
024 import org.slf4j.Logger;
025 import org.slf4j.LoggerFactory;
026
027 import java.io.*;
028 import java.util.HashSet;
029 import java.util.List;
030 import java.util.Set;
031
032 public class OpenNLPWrapper {
033
034 private final OLiAOntology penn;
035 private static Logger log = LoggerFactory.getLogger(OpenNLPWrapper.class);
036
037 //the model is threadsafe according to the javadoc
038 private static TokenizerModel tokenizerModel = null;
039 private static SentenceModel sentenceModel = null;
040 private static POSModel posmodel = null;
041
042 private TokenizerME tokenizerME = null;
043 private SentenceDetectorME sentenceDetectorME = null;
044 private POSTaggerME posTaggerME = null;
045
046 static {
047 /***************************
048 * Important requirement...
049 */
050 Factory.registerCustomClasses();
051 }
052
053 public OpenNLPWrapper(OLiAManager oLiAManager) {
054 penn = oLiAManager.getOLiAOntology("http://purl.org/olia/penn-link.rdf");
055 tokenizerME = new TokenizerME(getTokenizerModel());
056 sentenceDetectorME = new SentenceDetectorME(getSentenceModel());
057 posTaggerME = new POSTaggerME(getPOSModel());
058 }
059 public static void main(String[] args) throws Exception {
060 try {
061 File input = null;
062 String outfile = null;
063 String format = null;
064 String urirecipe = null;
065 String prefix = null;
066
067 // Read and validate command line arguments
068 boolean validArgs = false;
069 if (args.length == 3) {
070 input = new File(args[1]);
071 urirecipe = args[2];
072 prefix = args[3];
073 validArgs = input.exists() && !input.isDirectory();
074 validArgs = validArgs && (urirecipe.equalsIgnoreCase("offset") || urirecipe.equalsIgnoreCase("context-hash"));
075 }
076 if (!validArgs) {
077 printUsageMessage();
078 } else {
079 if (!input.isDirectory()) {
080 OntModel m = ModelFactory.createOntologyModel(OntModelSpec.OWL_DL_MEM, ModelFactory.createDefaultModel());
081 OpenNLPWrapper wrap = new OpenNLPWrapper(new OLiAManager());
082 log.info("Processing file " + input);
083 String document = wrap.readFileAsString(input);
084 document = document.trim();
085 wrap.processText(prefix, document, urirecipe, m);
086 }
087 }
088 } catch (Exception e) {
089 e.printStackTrace();
090 }
091 }
092
093
094 /**
095 * Prints usage message.
096 */
097 private static void printUsageMessage() {
098 System.err.println("Usage: java org.nlp2rdf.wrapper.opennlp.OpenNLPWrapper "
099 + "<input file> <output filename> <urirecipe> <prefix>");
100 }
101
102 private String readFileAsString(File source) throws IOException {
103 final DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(source)));
104 final byte[] buffer = new byte[(int) source.length()];
105 dis.readFully(buffer);
106 dis.close();
107 return new String(buffer);
108 }
109
110 public void processText(String prefix, String document, String urirecipe, OntModel model) throws IOException {
111
112 Span sentences[] = sentenceDetectorME.sentPosDetect(document);
113 Span tokens[] = null;
114 URIGenerator urigenerator = URIGeneratorHelper.determineGenerator(urirecipe);
115 Set<Span> spans = new HashSet<Span>();
116 //document span
117 spans.add(new Span(0, document.length()));
118 for (Span senspan : sentences) {
119 // // add the sentence annotation
120 spans.add(senspan);
121 tokens = tokenizerME.tokenizePos(senspan.getCoveredText(document).toString());
122 for (Span token : tokens) {
123 spans.add(token);
124 }
125 }
126 urigenerator.init(document, spans);
127 model.setNsPrefix("sso", Text2RDF.structuredSentenceOntologyUrl);
128 model.setNsPrefix("string", Text2RDF.stringOntologyUrl);
129
130 Document doc = null;
131 if (Document.list(model).isEmpty()) {
132 doc = new Text2RDF().createDocumentAnnotation(prefix, document,
133 urigenerator, model);
134 } else {
135 List<Document> ld = Document.list(model);
136 if (ld.size() == 1) {
137 doc = ld.get(0);
138 } else {
139 //TODO handle more than one
140 log.error("no document found or too many!");
141 }
142
143 }
144 for (Span senspan : sentences) {
145 // // add the sentence annotation
146 spans.add(senspan);
147 eu.lod2.nlp2rdf.schema.sso.Sentence s = eu.lod2.nlp2rdf.schema.sso.Sentence.create(urigenerator.makeUri(prefix, document, senspan), model);
148 s.setAnchorOf(senspan.getCoveredText(document).toString());
149 //add the class for the URI recipe
150 urigenerator.assignRecipeClass(s.getURI(), model);
151
152 //add substring annotation
153 if (doc != null && !(doc.getURI().equals(s.getURI()))) {
154 doc.addSubString(s);
155 }
156 tokens = tokenizerME.tokenizePos(senspan.getCoveredText(document).toString());
157 String postags[] = posTaggerME.tag(tokenizerME.tokenize(senspan.getCoveredText(document).toString()));
158 for (int i = 0; i < tokens.length; i++) {
159 Word w = Word.create(urigenerator.makeUri(prefix, document, tokens[i]), model);
160 //add the class for the URI recipe
161 urigenerator.assignRecipeClass(w.getURI(), model);
162 w.setAnchorOf(tokens[i].getCoveredText(document).toString());
163 w.addPosTag(postags[i]);
164 String oliaIndividual = null;
165 if ((oliaIndividual = penn.getIndividualURIForTag(postags[i])) != null) {
166 w.addOliaLink(Thing.create(oliaIndividual, model));
167 }
168
169 //adding pos classes from olia and olia-top
170 Set<String> classes = penn.getClassURIsForTag(postags[i]);
171 for (String classUri : classes) {
172 log.info("found: " + classUri + " for: " + postags[i]);
173 OntModel hierarchy = penn.getHierarchy(classUri);
174 for (ExtendedIterator<OntClass> it = hierarchy.listClasses(); it.hasNext(); ) {
175 OntClass oc = it.next();
176 //use all classes
177 //if (oc.getURI().startsWith("http://purl.org/olia/olia-top.owl") || oc.getURI().startsWith("http://purl.org/olia/olia.owl")) {
178 w.addOntClass(model.createResource(oc.getURI()));
179 //}
180 }
181 //Copy the hierarchy
182 model.add(hierarchy);
183 }
184 }
185 }
186 }
187
188
189 private SentenceModel getSentenceModel() {
190 if (sentenceModel == null) {
191 try {
192 InputStream modelIn = this.getClass().getClassLoader().getResourceAsStream("org/nlp2rdf/implementation/opennlp/en-sent.bin");
193 try {
194 sentenceModel = new SentenceModel(modelIn);
195 } finally {
196 modelIn.close();
197 }
198 } catch (IOException e) {
199 log.error("", e);
200 }
201 }
202 return sentenceModel;
203 }
204
205
206 private TokenizerModel getTokenizerModel
207 () {
208 if (tokenizerModel == null) {
209 try {
210 InputStream modelIn = this.getClass().getClassLoader().getResourceAsStream("org/nlp2rdf/implementation/opennlp/en-token.bin");
211 try {
212 tokenizerModel = new TokenizerModel(modelIn);
213 } finally {
214 modelIn.close();
215 }
216 } catch (IOException e) {
217 log.error("", e);
218 }
219 }
220
221 return tokenizerModel;
222 }
223
224 private POSModel getPOSModel
225 () {
226 if (posmodel == null) {
227 try {
228 InputStream modelIn = this.getClass().getClassLoader().getResourceAsStream("org/nlp2rdf/implementation/opennlp/en-pos-maxent.bin");
229 try {
230 posmodel = new POSModel(modelIn);
231 } finally {
232 modelIn.close();
233 }
234 } catch (IOException e) {
235 log.error("", e);
236 }
237 }
238
239 return posmodel;
240 }
241
242
243 }