001    package org.nlp2rdf.implementation.opennlp;
002    
003    import com.hp.hpl.jena.ontology.OntClass;
004    import com.hp.hpl.jena.ontology.OntModel;
005    import com.hp.hpl.jena.ontology.OntModelSpec;
006    import com.hp.hpl.jena.rdf.model.ModelFactory;
007    import com.hp.hpl.jena.util.iterator.ExtendedIterator;
008    import eu.lod2.nlp2rdf.schema.Thing;
009    import eu.lod2.nlp2rdf.schema.sso.Word;
010    import eu.lod2.nlp2rdf.schema.str.Document;
011    import eu.lod2.nlp2rdf.schema.tools.Factory;
012    import opennlp.tools.postag.POSModel;
013    import opennlp.tools.postag.POSTaggerME;
014    import opennlp.tools.sentdetect.SentenceDetectorME;
015    import opennlp.tools.sentdetect.SentenceModel;
016    import opennlp.tools.tokenize.TokenizerME;
017    import opennlp.tools.tokenize.TokenizerModel;
018    import opennlp.tools.util.Span;
019    import org.nlp2rdf.core.Text2RDF;
020    import org.nlp2rdf.core.URIGenerator;
021    import org.nlp2rdf.core.util.URIGeneratorHelper;
022    import org.nlp2rdf.ontology.olia.OLiAManager;
023    import org.nlp2rdf.ontology.olia.OLiAOntology;
024    import org.slf4j.Logger;
025    import org.slf4j.LoggerFactory;
026    
027    import java.io.*;
028    import java.util.HashSet;
029    import java.util.List;
030    import java.util.Set;
031    
032    public class OpenNLPWrapper {
033    
034        private final OLiAOntology penn;
035        private static Logger log = LoggerFactory.getLogger(OpenNLPWrapper.class);
036    
037        //the model is threadsafe according to the javadoc
038        private static TokenizerModel tokenizerModel = null;
039        private static SentenceModel sentenceModel = null;
040        private static POSModel posmodel = null;
041    
042        private TokenizerME tokenizerME = null;
043        private SentenceDetectorME sentenceDetectorME = null;
044        private POSTaggerME posTaggerME = null;
045    
046        static {
047            /***************************
048             * Important requirement...
049             */
050            Factory.registerCustomClasses();
051        }
052    
053        public OpenNLPWrapper(OLiAManager oLiAManager) {
054            penn = oLiAManager.getOLiAOntology("http://purl.org/olia/penn-link.rdf");
055            tokenizerME = new TokenizerME(getTokenizerModel());
056            sentenceDetectorME = new SentenceDetectorME(getSentenceModel());
057            posTaggerME = new POSTaggerME(getPOSModel());
058        }
059        public static void main(String[] args) throws Exception {
060            try {
061                File input = null;
062                String outfile = null;
063                String format = null;
064                String urirecipe = null;
065                String prefix = null;
066    
067                // Read and validate command line arguments
068                boolean validArgs = false;
069                if (args.length == 3) {
070                    input = new File(args[1]);
071                    urirecipe = args[2];
072                    prefix = args[3];
073                    validArgs = input.exists() && !input.isDirectory();
074                    validArgs = validArgs && (urirecipe.equalsIgnoreCase("offset") || urirecipe.equalsIgnoreCase("context-hash"));
075                }
076                if (!validArgs) {
077                    printUsageMessage();
078                } else {
079                    if (!input.isDirectory()) {
080                        OntModel m = ModelFactory.createOntologyModel(OntModelSpec.OWL_DL_MEM, ModelFactory.createDefaultModel());
081                        OpenNLPWrapper wrap = new OpenNLPWrapper(new OLiAManager());
082                        log.info("Processing file " + input);
083                        String document = wrap.readFileAsString(input);
084                        document = document.trim();
085                        wrap.processText(prefix, document, urirecipe, m);
086                    }
087                }
088            } catch (Exception e) {
089                e.printStackTrace();
090            }
091        }
092    
093    
094        /**
095         * Prints usage message.
096         */
097        private static void printUsageMessage() {
098            System.err.println("Usage: java org.nlp2rdf.wrapper.opennlp.OpenNLPWrapper "
099                    + "<input file> <output filename> <urirecipe> <prefix>");
100        }
101    
102        private String readFileAsString(File source) throws IOException {
103            final DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(source)));
104            final byte[] buffer = new byte[(int) source.length()];
105            dis.readFully(buffer);
106            dis.close();
107            return new String(buffer);
108        }
109    
110        public void processText(String prefix, String document, String urirecipe, OntModel model) throws IOException {
111    
112            Span sentences[] = sentenceDetectorME.sentPosDetect(document);
113            Span tokens[] = null;
114            URIGenerator urigenerator = URIGeneratorHelper.determineGenerator(urirecipe);
115            Set<Span> spans = new HashSet<Span>();
116            //document span
117            spans.add(new Span(0, document.length()));
118            for (Span senspan : sentences) {
119    //                      // add the sentence annotation
120                spans.add(senspan);
121                tokens = tokenizerME.tokenizePos(senspan.getCoveredText(document).toString());
122                for (Span token : tokens) {
123                    spans.add(token);
124                }
125            }
126            urigenerator.init(document, spans);
127            model.setNsPrefix("sso", Text2RDF.structuredSentenceOntologyUrl);
128            model.setNsPrefix("string", Text2RDF.stringOntologyUrl);
129    
130            Document doc = null;
131            if (Document.list(model).isEmpty()) {
132                doc = new Text2RDF().createDocumentAnnotation(prefix, document,
133                        urigenerator, model);
134            } else {
135                List<Document> ld = Document.list(model);
136                if (ld.size() == 1) {
137                    doc = ld.get(0);
138                } else {
139                    //TODO handle more than one
140                    log.error("no document found or too many!");
141                }
142    
143            }
144            for (Span senspan : sentences) {
145    //                      // add the sentence annotation
146                spans.add(senspan);
147                eu.lod2.nlp2rdf.schema.sso.Sentence s = eu.lod2.nlp2rdf.schema.sso.Sentence.create(urigenerator.makeUri(prefix, document, senspan), model);
148                s.setAnchorOf(senspan.getCoveredText(document).toString());
149                //add the class for the URI recipe
150                urigenerator.assignRecipeClass(s.getURI(), model);
151    
152                //add substring annotation
153                if (doc != null && !(doc.getURI().equals(s.getURI()))) {
154                    doc.addSubString(s);
155                }
156                tokens = tokenizerME.tokenizePos(senspan.getCoveredText(document).toString());
157                String postags[] = posTaggerME.tag(tokenizerME.tokenize(senspan.getCoveredText(document).toString()));
158                for (int i = 0; i < tokens.length; i++) {
159                    Word w = Word.create(urigenerator.makeUri(prefix, document, tokens[i]), model);
160                    //add the class for the URI recipe
161                    urigenerator.assignRecipeClass(w.getURI(), model);
162                    w.setAnchorOf(tokens[i].getCoveredText(document).toString());
163                    w.addPosTag(postags[i]);
164                    String oliaIndividual = null;
165                    if ((oliaIndividual = penn.getIndividualURIForTag(postags[i])) != null) {
166                        w.addOliaLink(Thing.create(oliaIndividual, model));
167                    }
168    
169                    //adding pos classes from olia and olia-top
170                    Set<String> classes = penn.getClassURIsForTag(postags[i]);
171                    for (String classUri : classes) {
172                        log.info("found: " + classUri + " for: " + postags[i]);
173                        OntModel hierarchy = penn.getHierarchy(classUri);
174                        for (ExtendedIterator<OntClass> it = hierarchy.listClasses(); it.hasNext(); ) {
175                            OntClass oc = it.next();
176                            //use all classes
177                            //if (oc.getURI().startsWith("http://purl.org/olia/olia-top.owl") || oc.getURI().startsWith("http://purl.org/olia/olia.owl")) {
178                            w.addOntClass(model.createResource(oc.getURI()));
179                            //}
180                        }
181                        //Copy the hierarchy
182                        model.add(hierarchy);
183                    }
184                }
185            }
186        }
187    
188    
189        private SentenceModel getSentenceModel() {
190            if (sentenceModel == null) {
191                try {
192                    InputStream modelIn = this.getClass().getClassLoader().getResourceAsStream("org/nlp2rdf/implementation/opennlp/en-sent.bin");
193                    try {
194                        sentenceModel = new SentenceModel(modelIn);
195                    } finally {
196                        modelIn.close();
197                    }
198                } catch (IOException e) {
199                    log.error("", e);
200                }
201            }
202            return sentenceModel;
203        }
204    
205    
206        private TokenizerModel getTokenizerModel
207                () {
208            if (tokenizerModel == null) {
209                try {
210                    InputStream modelIn = this.getClass().getClassLoader().getResourceAsStream("org/nlp2rdf/implementation/opennlp/en-token.bin");
211                    try {
212                        tokenizerModel = new TokenizerModel(modelIn);
213                    } finally {
214                        modelIn.close();
215                    }
216                } catch (IOException e) {
217                    log.error("", e);
218                }
219            }
220    
221            return tokenizerModel;
222        }
223    
224        private POSModel getPOSModel
225                () {
226            if (posmodel == null) {
227                try {
228                    InputStream modelIn = this.getClass().getClassLoader().getResourceAsStream("org/nlp2rdf/implementation/opennlp/en-pos-maxent.bin");
229                    try {
230                        posmodel = new POSModel(modelIn);
231                    } finally {
232                        modelIn.close();
233                    }
234                } catch (IOException e) {
235                    log.error("", e);
236                }
237            }
238    
239            return posmodel;
240        }
241    
242    
243    }