001 /***************************************************************************/
002 /* Copyright (C) 2010-2011, Sebastian Hellmann */
003 /* Note: If you need parts of NLP2RDF in another licence due to licence */
004 /* incompatibility, please mail hellmann@informatik.uni-leipzig.de */
005 /* */
006 /* This file is part of NLP2RDF. */
007 /* */
008 /* NLP2RDF is free software; you can redistribute it and/or modify */
009 /* it under the terms of the GNU General Public License as published by */
010 /* the Free Software Foundation; either version 3 of the License, or */
011 /* (at your option) any later version. */
012 /* */
013 /* NLP2RDF is distributed in the hope that it will be useful, */
014 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
015 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
016 /* GNU General Public License for more details. */
017 /* */
018 /* You should have received a copy of the GNU General Public License */
019 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
020 /***************************************************************************/
021
022 package org.nlp2rdf.implementation.opennlp;
023
024 import com.hp.hpl.jena.ontology.OntModel;
025 import com.hp.hpl.jena.ontology.OntModelSpec;
026 import com.hp.hpl.jena.rdf.model.ModelFactory;
027 import eu.lod2.nlp2rdf.schema.Thing;
028 import eu.lod2.nlp2rdf.schema.sso.Sentence;
029 import eu.lod2.nlp2rdf.schema.sso.Word;
030 import eu.lod2.nlp2rdf.schema.str.Document;
031 import eu.lod2.nlp2rdf.schema.tools.Factory;
032 import opennlp.tools.postag.POSModel;
033 import opennlp.tools.postag.POSTaggerME;
034 import org.nlp2rdf.core.Span;
035 import org.nlp2rdf.core.Text2RDF;
036 import org.nlp2rdf.core.URIGenerator;
037 import org.nlp2rdf.core.util.URIComparator;
038 import org.nlp2rdf.core.util.URIGeneratorHelper;
039 import org.nlp2rdf.ontology.olia.OLiAManager;
040 import org.nlp2rdf.ontology.olia.OLiAOntology;
041 import org.slf4j.Logger;
042 import org.slf4j.LoggerFactory;
043
044 import java.io.*;
045 import java.util.Collections;
046 import java.util.List;
047 import java.util.TreeMap;
048
049 public class OpenNLPWrapper {
050
051 private static Logger log = LoggerFactory.getLogger(OpenNLPWrapper.class);
052
053
054 private static POSModel posmodel = null;
055
056 private final POSTaggerME posTaggerME;
057 private final OLiAOntology penn;
058 private final OpenNLPTokenizer openNLPTokenizer;
059
060 static {
061 /***************************
062 * Important requirement...
063 */
064 Factory.registerCustomClasses();
065 }
066
067 public OpenNLPWrapper(OLiAManager oLiAManager) {
068 penn = oLiAManager.getOLiAOntology("http://purl.org/olia/penn-link.rdf");
069 posTaggerME = new POSTaggerME(getPOSModel());
070 openNLPTokenizer = new OpenNLPTokenizer();
071 }
072
073 public static void main(String[] args) throws Exception {
074 try {
075 File input = null;
076 String outfile = null;
077 String format = null;
078 String urirecipe = null;
079 String prefix = null;
080
081 // Read and validate command line arguments
082 boolean validArgs = false;
083 if (args.length == 3) {
084 input = new File(args[1]);
085 urirecipe = args[2];
086 prefix = args[3];
087 validArgs = input.exists() && !input.isDirectory();
088 validArgs = validArgs && (urirecipe.equalsIgnoreCase("offset") || urirecipe.equalsIgnoreCase("context-hash"));
089 }
090 if (!validArgs) {
091 printUsageMessage();
092 } else {
093 if (!input.isDirectory()) {
094 OntModel ontModel = ModelFactory.createOntologyModel(OntModelSpec.OWL_DL_MEM, ModelFactory.createDefaultModel());
095 OpenNLPWrapper wrap = new OpenNLPWrapper(new OLiAManager());
096 log.info("Processing file " + input);
097 String document = wrap.readFileAsString(input);
098 document = document.trim();
099 //TODO parameter contextLength instead of 10
100 URIGenerator uriGenerator = URIGeneratorHelper.determineGenerator(urirecipe, 10);
101 wrap.processText(prefix, document, uriGenerator, ontModel);
102 }
103 }
104 } catch (Exception e) {
105 e.printStackTrace();
106 }
107 }
108
109
110 /**
111 * Prints usage message.
112 */
113 private static void printUsageMessage() {
114 System.err.println("Usage: java org.nlp2rdf.wrapper.opennlp.OpenNLPWrapper "
115 + "<input file> <output filename> <urirecipe> <prefix>");
116 }
117
118 private String readFileAsString(File source) throws IOException {
119 final DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(source)));
120 final byte[] buffer = new byte[(int) source.length()];
121 dis.readFully(buffer);
122 dis.close();
123 return new String(buffer);
124 }
125
126 public void processText(String prefix, String text, URIGenerator urigenerator, OntModel model) {
127
128 TreeMap<Span, List<Span>> tokenizedText = openNLPTokenizer.tokenizeText(text);
129
130 //URIGenerator urigenerator = URIGeneratorHelper.initURIGenerator(text, tokenizedText, urirecipe);
131
132
133 Text2RDF text2RDF = new Text2RDF();
134 Document document = text2RDF.createDocumentAnnotation(prefix, text, urigenerator, model);
135 text2RDF.generateNIFModel(prefix, text, tokenizedText, urigenerator, document, model);
136 assignPosTags(prefix, text, urigenerator, model);
137
138 //add additional data
139 text2RDF.addNextAndPreviousProperties(prefix, text, urigenerator, model);
140 //here OLiA classes are added
141 text2RDF.addCopyOfOLiAClassesAndHierarchy(penn, model);
142 }
143
144
145 public void processNIFModel(String prefix, String text, String urirecipe, OntModel model) {
146
147 //Text2RDF text2RDF = new Text2RDF();
148 //URIGenerator uriGenerator = URIGeneratorHelper.determineGenerator(urirecipe);
149 //text2RDF.getTokenization(prefix,text, )
150 //URIGenerator urigenerator = URIGeneratorHelper.initURIGenerator(text, tokenizedText, urirecipe);
151
152 }
153
154 public void assignPosTags(String prefix, String text, URIGenerator uriGenerator, OntModel model) {
155 List<Sentence> sentences = Sentence.list(model);
156 for (int x = 0; x < sentences.size(); x++) {
157 Sentence sentence = sentences.get(x);
158 List<Word> words = sentence.listWord();
159 Collections.sort(sentences, new URIComparator(prefix, text, uriGenerator));
160 String[] tokens = new String[words.size()];
161 for (int i = 0; i < words.size(); i++) {
162 tokens[i] = words.get(i).getAnchorOf();
163 }
164 String postags[] = posTaggerME.tag(tokens);
165 for (int i = 0; i < words.size(); i++) {
166 words.get(i).addPosTag(postags[i]);
167 String oliaIndividual = null;
168 if ((oliaIndividual = penn.getIndividualURIForTag(postags[i])) != null) {
169 words.get(i).addOliaLink(Thing.create(oliaIndividual, model));
170 }
171 }
172 }
173 }
174
175 private POSModel getPOSModel() {
176 if (posmodel == null) {
177 try {
178 InputStream modelIn = this.getClass().getClassLoader().getResourceAsStream(OpenNLPTokenizer.RESOURCEPATH + "en-pos-maxent.bin");
179 try {
180 posmodel = new POSModel(modelIn);
181 } finally {
182 modelIn.close();
183 }
184 } catch (IOException e) {
185 log.error("", e);
186 }
187 }
188
189 return posmodel;
190 }
191
192
193 }