001    /***************************************************************************/
002    /*  Copyright (C) 2010-2011, Sebastian Hellmann                            */
003    /*  Note: If you need parts of NLP2RDF in another licence due to licence   */
004    /*  incompatibility, please mail hellmann@informatik.uni-leipzig.de        */
005    /*                                                                         */
006    /*  This file is part of NLP2RDF.                                          */
007    /*                                                                         */
008    /*  NLP2RDF is free software; you can redistribute it and/or modify        */
009    /*  it under the terms of the GNU General Public License as published by   */
010    /*  the Free Software Foundation; either version 3 of the License, or      */
011    /*  (at your option) any later version.                                    */
012    /*                                                                         */
013    /*  NLP2RDF is distributed in the hope that it will be useful,             */
014    /*  but WITHOUT ANY WARRANTY; without even the implied warranty of         */
015    /*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the           */
016    /*  GNU General Public License for more details.                           */
017    /*                                                                         */
018    /*  You should have received a copy of the GNU General Public License      */
019    /*  along with this program. If not, see <http://www.gnu.org/licenses/>.   */
020    /***************************************************************************/
021    
022    package org.nlp2rdf.implementation.opennlp;
023    
024    import com.hp.hpl.jena.ontology.OntModel;
025    import com.hp.hpl.jena.ontology.OntModelSpec;
026    import com.hp.hpl.jena.rdf.model.ModelFactory;
027    import eu.lod2.nlp2rdf.schema.Thing;
028    import eu.lod2.nlp2rdf.schema.sso.Sentence;
029    import eu.lod2.nlp2rdf.schema.sso.Word;
030    import eu.lod2.nlp2rdf.schema.str.Document;
031    import eu.lod2.nlp2rdf.schema.tools.Factory;
032    import opennlp.tools.postag.POSModel;
033    import opennlp.tools.postag.POSTaggerME;
034    import org.nlp2rdf.core.Span;
035    import org.nlp2rdf.core.Text2RDF;
036    import org.nlp2rdf.core.URIGenerator;
037    import org.nlp2rdf.core.util.URIComparator;
038    import org.nlp2rdf.core.util.URIGeneratorHelper;
039    import org.nlp2rdf.ontology.olia.OLiAManager;
040    import org.nlp2rdf.ontology.olia.OLiAOntology;
041    import org.slf4j.Logger;
042    import org.slf4j.LoggerFactory;
043    
044    import java.io.*;
045    import java.util.Collections;
046    import java.util.List;
047    import java.util.TreeMap;
048    
049    public class OpenNLPWrapper {
050    
051        private static Logger log = LoggerFactory.getLogger(OpenNLPWrapper.class);
052    
053    
054        private static POSModel posmodel = null;
055    
056        private final POSTaggerME posTaggerME;
057        private final OLiAOntology penn;
058        private final OpenNLPTokenizer openNLPTokenizer;
059    
060        static {
061            /***************************
062             * Important requirement...
063             */
064            Factory.registerCustomClasses();
065        }
066    
067        public OpenNLPWrapper(OLiAManager oLiAManager) {
068            penn = oLiAManager.getOLiAOntology("http://purl.org/olia/penn-link.rdf");
069            posTaggerME = new POSTaggerME(getPOSModel());
070            openNLPTokenizer = new OpenNLPTokenizer();
071        }
072    
073        public static void main(String[] args) throws Exception {
074            try {
075                File input = null;
076                String outfile = null;
077                String format = null;
078                String urirecipe = null;
079                String prefix = null;
080    
081                // Read and validate command line arguments
082                boolean validArgs = false;
083                if (args.length == 3) {
084                    input = new File(args[1]);
085                    urirecipe = args[2];
086                    prefix = args[3];
087                    validArgs = input.exists() && !input.isDirectory();
088                    validArgs = validArgs && (urirecipe.equalsIgnoreCase("offset") || urirecipe.equalsIgnoreCase("context-hash"));
089                }
090                if (!validArgs) {
091                    printUsageMessage();
092                } else {
093                    if (!input.isDirectory()) {
094                        OntModel ontModel = ModelFactory.createOntologyModel(OntModelSpec.OWL_DL_MEM, ModelFactory.createDefaultModel());
095                        OpenNLPWrapper wrap = new OpenNLPWrapper(new OLiAManager());
096                        log.info("Processing file " + input);
097                        String document = wrap.readFileAsString(input);
098                        document = document.trim();
099                        //TODO parameter contextLength instead of 10
100                        URIGenerator uriGenerator = URIGeneratorHelper.determineGenerator(urirecipe, 10);
101                        wrap.processText(prefix, document, uriGenerator, ontModel);
102                    }
103                }
104            } catch (Exception e) {
105                e.printStackTrace();
106            }
107        }
108    
109    
110        /**
111         * Prints usage message.
112         */
113        private static void printUsageMessage() {
114            System.err.println("Usage: java org.nlp2rdf.wrapper.opennlp.OpenNLPWrapper "
115                    + "<input file> <output filename> <urirecipe> <prefix>");
116        }
117    
118        private String readFileAsString(File source) throws IOException {
119            final DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(source)));
120            final byte[] buffer = new byte[(int) source.length()];
121            dis.readFully(buffer);
122            dis.close();
123            return new String(buffer);
124        }
125    
126        public void processText(String prefix, String text, URIGenerator urigenerator, OntModel model) {
127    
128            TreeMap<Span, List<Span>> tokenizedText = openNLPTokenizer.tokenizeText(text);
129    
130            //URIGenerator urigenerator = URIGeneratorHelper.initURIGenerator(text, tokenizedText, urirecipe);
131    
132    
133            Text2RDF text2RDF = new Text2RDF();
134            Document document = text2RDF.createDocumentAnnotation(prefix, text, urigenerator, model);
135            text2RDF.generateNIFModel(prefix, text, tokenizedText, urigenerator, document, model);
136            assignPosTags(prefix, text, urigenerator, model);
137    
138            //add additional data
139            text2RDF.addNextAndPreviousProperties(prefix, text, urigenerator, model);
140            //here OLiA classes are added
141            text2RDF.addCopyOfOLiAClassesAndHierarchy(penn, model);
142        }
143    
144    
145        public void processNIFModel(String prefix, String text, String urirecipe, OntModel model) {
146    
147            //Text2RDF text2RDF = new Text2RDF();
148            //URIGenerator uriGenerator = URIGeneratorHelper.determineGenerator(urirecipe);
149            //text2RDF.getTokenization(prefix,text, )
150            //URIGenerator urigenerator = URIGeneratorHelper.initURIGenerator(text, tokenizedText, urirecipe);
151    
152        }
153    
154        public void assignPosTags(String prefix, String text, URIGenerator uriGenerator, OntModel model) {
155            List<Sentence> sentences = Sentence.list(model);
156            for (int x = 0; x < sentences.size(); x++) {
157                Sentence sentence = sentences.get(x);
158                List<Word> words = sentence.listWord();
159                Collections.sort(sentences, new URIComparator(prefix, text, uriGenerator));
160                String[] tokens = new String[words.size()];
161                for (int i = 0; i < words.size(); i++) {
162                    tokens[i] = words.get(i).getAnchorOf();
163                }
164                String postags[] = posTaggerME.tag(tokens);
165                for (int i = 0; i < words.size(); i++) {
166                    words.get(i).addPosTag(postags[i]);
167                    String oliaIndividual = null;
168                    if ((oliaIndividual = penn.getIndividualURIForTag(postags[i])) != null) {
169                        words.get(i).addOliaLink(Thing.create(oliaIndividual, model));
170                    }
171                }
172            }
173        }
174    
175        private POSModel getPOSModel() {
176            if (posmodel == null) {
177                try {
178                    InputStream modelIn = this.getClass().getClassLoader().getResourceAsStream(OpenNLPTokenizer.RESOURCEPATH + "en-pos-maxent.bin");
179                    try {
180                        posmodel = new POSModel(modelIn);
181                    } finally {
182                        modelIn.close();
183                    }
184                } catch (IOException e) {
185                    log.error("", e);
186                }
187            }
188    
189            return posmodel;
190        }
191    
192    
193    }