001 /***************************************************************************/
002 /* Copyright (C) 2010-2011, Sebastian Hellmann */
003 /* Note: If you need parts of NLP2RDF in another licence due to licence */
004 /* incompatibility, please mail hellmann@informatik.uni-leipzig.de */
005 /* */
006 /* This file is part of NLP2RDF. */
007 /* */
008 /* NLP2RDF is free software; you can redistribute it and/or modify */
009 /* it under the terms of the GNU General Public License as published by */
010 /* the Free Software Foundation; either version 3 of the License, or */
011 /* (at your option) any later version. */
012 /* */
013 /* NLP2RDF is distributed in the hope that it will be useful, */
014 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
015 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
016 /* GNU General Public License for more details. */
017 /* */
018 /* You should have received a copy of the GNU General Public License */
019 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
020 /***************************************************************************/
021
022 package org.nlp2rdf.core;
023
024 import com.hp.hpl.jena.ontology.OntClass;
025 import com.hp.hpl.jena.ontology.OntModel;
026 import com.hp.hpl.jena.ontology.OntModelSpec;
027 import com.hp.hpl.jena.rdf.model.ModelFactory;
028 import com.hp.hpl.jena.util.iterator.ExtendedIterator;
029 import com.jamonapi.Monitor;
030 import com.jamonapi.MonitorFactory;
031 import eu.lod2.nlp2rdf.schema.sso.Phrase;
032 import eu.lod2.nlp2rdf.schema.sso.Sentence;
033 import eu.lod2.nlp2rdf.schema.sso.Word;
034 import eu.lod2.nlp2rdf.schema.str.ContextHashBasedString;
035 import eu.lod2.nlp2rdf.schema.str.Document;
036 import eu.lod2.nlp2rdf.schema.str.IString;
037 import eu.lod2.nlp2rdf.schema.str.OffsetBasedString;
038 import eu.lod2.nlp2rdf.schema.tools.Factory;
039 import org.mindswap.pellet.Individual;
040 import org.nlp2rdf.core.impl.MD5Based;
041 import org.nlp2rdf.core.util.URIComparator;
042 import org.nlp2rdf.core.util.URIGeneratorHelper;
043 import org.nlp2rdf.ontology.olia.OLiAOntology;
044 import org.slf4j.Logger;
045 import org.slf4j.LoggerFactory;
046
047 import java.lang.reflect.Method;
048 import java.security.InvalidParameterException;
049 import java.util.*;
050
051 /**
052 * @author Sebastian Hellmann
053 */
054 public class Text2RDF {
/** Logger of this class; final so that the shared instance can never be reassigned. */
private static final Logger log = LoggerFactory.getLogger(Text2RDF.class);

/** Namespace of the string ontology; generateNIFModel registers it under the prefix "str". */
public static final String stringOntologyUrl = "http://nlp2rdf.lod2.eu/schema/string/";
/** Namespace of the structured sentence ontology; generateNIFModel registers it under the prefix "sso". */
public static final String structuredSentenceOntologyUrl = "http://nlp2rdf.lod2.eu/schema/sso/";

static {
    // executed once when this class is initialised
    Factory.registerCustomClasses();
}
063
064 /**
065 * @param prefix
066 * @param text
067 * @param uriGenerator
068 * @param model
069 * @return
070 */
071 public Document createDocumentAnnotation(String prefix, String text, URIGenerator uriGenerator, OntModel model) {
072 //make the uri and add the class for the URI recipe
073 String uri = uriGenerator.makeUri(prefix, text, new Span(0, text.length()), model);
074 uriGenerator.assignRecipeClass(uri, model);
075 //wrap it in a document
076 Document d = Document.create(uri, model);
077 d.setSourceString(text);
078 return d;
079 }
080
081 /**
082 * This is a convenience function, which does quite a lot:
083 * 1. generates the uri
084 * 2. add the recipe class, i.e. OffsetBased or ContexthashBased
085 * 3. adds the class which is given in class (must mbe from owl2java)
086 * 4. adds the anchorOf annotation
087 * Note: all changes are also reflected in model
088 * Note: if something goes wrong this method catches all exceptions, logs it and then re throws it as a runtime exception
089 *
090 * @param cl
091 * @param prefix
092 * @param text the whole (reference) text
093 * @param span the span for the annotation
094 * @param uriGenerator
095 * @param model
096 * @param <S>
097 * @return the Jena Individual with the Type give in cl
098 */
099 public <S> S createStringAnnotationForClass(Class<S> cl, String prefix, String text, Span span, URIGenerator uriGenerator, OntModel model) {
100 Monitor mon = MonitorFactory.getTimeMonitor("createStringAnnotationForClass");
101 mon.start();
102 try {
103
104 //1. make the uri and add the class for the URI recipe
105 String uri = uriGenerator.makeUri(prefix, text, span, model);
106 //2.assign class
107 uriGenerator.assignRecipeClass(uri, model);
108 Class[] argTypes = new Class[]{String.class, OntModel.class};
109 Method create = cl.getDeclaredMethod("create", argTypes);
110
111 Object s = create.invoke(null, uri, model);
112
113 String addressedString = (span.getCoveredText(text).toString());
114 if (s instanceof Sentence) {
115 ((Sentence) s).setAnchorOf(addressedString);
116 } else if (s instanceof Phrase) {
117 ((Phrase) s).setAnchorOf(addressedString);
118 } else if (s instanceof Word) {
119 ((Word) s).setAnchorOf(addressedString);
120 } else if (s instanceof OffsetBasedString) {
121 ((OffsetBasedString) s).setAnchorOf(addressedString);
122 } else if (s instanceof ContextHashBasedString) {
123 ((ContextHashBasedString) s).setAnchorOf(addressedString);
124 } else {
125 String message = "Class was not Word, Phrase or Sentence";
126 log.error(message);
127 throw new InvalidParameterException(message);
128 }
129 if (log.isTraceEnabled()) {
130 log.trace("Added " + cl.getSimpleName() + " for " + uri);
131 }
132 return (S) s;
133 } catch (Exception e) {
134 log.error(e.getMessage(), e);
135 throw new RuntimeException(e.getMessage(), e);
136 } finally {
137 mon.stop();
138 }
139 }
140
141
142 /**
143 * adds sso:Sentence and sso:Word to it
144 * adds the word property and additionally, though optional, the firstWord and lastWord property
145 * <p/>
146 * Note that the expected tokenizer matches the opennlp tokenizer well,
147 * so instead of forcing a tokenizer to match the interface it might be smarter to rewrite this method.
148 *
149 * @param prefix
150 * @param text
151 * @param tokenizedText
152 * @param uriGenerator
153 * @param document use null if you want to opt out
154 * @param model
155 * @return the model filled with NIF
156 */
157 public OntModel generateNIFModel(String prefix, String text, TreeMap<Span, List<Span>> tokenizedText, URIGenerator uriGenerator, Document document, OntModel model) {
158 assert tokenizedText != null && text != null && uriGenerator != null && prefix != null;
159 //some stats
160 Monitor mon = MonitorFactory.getTimeMonitor("generateBasicNIFModel").start();
161 int wordCount = 0;
162 try {
163 //set basic prefixes
164 model.setNsPrefix("sso", structuredSentenceOntologyUrl);
165 model.setNsPrefix("str", stringOntologyUrl);
166 for (Span sentenceSpan : tokenizedText.descendingKeySet()) {
167 Sentence sentence = createStringAnnotationForClass(Sentence.class, prefix, text, sentenceSpan, uriGenerator, model);
168 //assign str:substring to document
169 if (document != null) {
170 document.addSubString(sentence);
171 }
172
173
174 //detect words
175 List<Span> wordSpans = new ArrayList<Span>(tokenizedText.get(sentenceSpan));
176 wordCount += wordSpans.size();
177 for (int i = 0; i < wordSpans.size(); i++) {
178 Span wordSpan = wordSpans.get(i);
179 // Span absoluteWordSpan = new Span(wordSpan, sentenceSpan.getStart());
180 Word word = createStringAnnotationForClass(Word.class, prefix, text, wordSpan, uriGenerator, model);
181 //add the firstWord property (optional and redundant to sso:word)
182 if (i == 0) {
183 sentence.setFirstWord(word);
184 }
185 //add the lasttWord property (optional and redundant to sso:word)
186 if (i == (wordSpans.size() - 1)) {
187 sentence.setLastWord(word);
188 }
189 //this is important for the str:subStringTrans inference
190 sentence.addWord(word);
191
192 if (log.isTraceEnabled()) {
193 StringBuilder logging = new StringBuilder();
194 logging.append("\nword: " + wordSpan.getCoveredText(text));
195 logging.append("\nabsolute sentence position [start|end]: " + sentenceSpan.getStart() + "|" + sentenceSpan.getEnd());
196 logging.append("\nabsolute word position [start|end]: " + wordSpan.getStart() + "|" + wordSpan.getEnd());
197 log.trace(logging.toString());
198 }
199 }
200 }
201 return model;
202 } finally {
203 mon.stop();
204 log.debug("Finished creating " + tokenizedText.size() + " sentence with " + wordCount + " words, " + mon.getLastValue() + " ms.) ");
205 }
206 }
207
208
209 /*public void addAdditionalProperties(String prefix, String text, TreeMap<Span, Span[]> sentencesAndWords, URIGenerator uriGenerator, OntModel m) {
210
211 List<IString> sentences = new ArrayList<IString>(Sentence.list(m));
212 List<Span> spans = URIGeneratorHelper.getSpans(sentences, prefix, text, uriGenerator);
213 Collections.sort(spans, new Comparator<Span>() {
214 @Override
215 public int compare(Span span, Span span1) {
216 return span.compareTo(span1);
217 }
218 });
219 } */
220
221
222 public TreeMap<Span, List<Span>> getTokenization(String prefix, String text, URIGenerator uriGenerator, OntModel model) {
223 TreeMap<Span, List<Span>> tokenizedText = new TreeMap<Span, List<Span>>();
224
225 for (Sentence sentence : Sentence.list(model)) {
226 Span sentenceSpan = uriGenerator.getSpanFor(prefix, sentence.getURI(), text);
227 List<Span> wordSpans = new ArrayList<Span>();
228 for (Word word : sentence.listWord()) {
229 Span wordSpan = uriGenerator.getSpanFor(prefix, word.getURI(), text);
230 wordSpans.add(wordSpan);
231 }
232 tokenizedText.put(sentenceSpan, wordSpans);
233 }
234 return tokenizedText;
235 }
236
237
238 public void addNextAndPreviousProperties(String prefix, String text, URIGenerator uriGenerator, OntModel model) {
239 Monitor mon = MonitorFactory.getTimeMonitor("addNextAndPreviousProperties").start();
240 long previous = model.size();
241 List<Sentence> sentences = Sentence.list(model);
242 Collections.sort(sentences, new URIComparator(prefix, text, uriGenerator));
243 for (int x = 0; x < sentences.size(); x++) {
244 Sentence sentence = sentences.get(x);
245 List<Word> words = sentence.listWord();
246 Collections.sort(sentences, new URIComparator(prefix, text, uriGenerator));
247 if (x < sentences.size() - 1) {
248 //not the last one
249 sentence.setNextSentence(sentences.get(x + 1));
250 }
251
252 for (int y = 0; y < words.size(); y++) {
253 Word word = words.get(y);
254 //not the last one
255 if (y < words.size() - 1) {
256 word.setNextWord(words.get(y + 1));
257 }
258 }
259 }
260
261 mon.stop();
262 log.debug("Finished addition of next/previous properties " + (model.size() - previous) + " triples added, " + mon.getLastValue() + " ms.)");
263 }
264
265
266 public void addCopyOfOLiAClassesAndHierarchy(OLiAOntology olia, OntModel model) {
267 Monitor mon = MonitorFactory.getTimeMonitor("addCopyOfOLiAClassesAndHierarchy").start();
268 long previous = model.size();
269 List<Word> words = Word.list(model);
270 for (Word w : words) {
271 List<String> posTags = w.listPosTag();
272 if (posTags.size() >= 1) {
273 //get first ignore the others
274 String posTag = posTags.get(0);
275 //adding pos classes from olia and olia-top
276 Set<String> classes = olia.getClassURIsForTag(posTag);
277 for (String classUri : classes) {
278 log.info("found: " + classUri + " for: " + posTag);
279 OntModel hierarchy = olia.getHierarchy(classUri);
280 for (ExtendedIterator<OntClass> it = hierarchy.listClasses(); it.hasNext(); ) {
281 OntClass oc = it.next();
282 //add the type
283 w.addOntClass(model.createResource(oc.getURI()));
284 //use all classes
285 //if (oc.getURI().startsWith("http://purl.org/olia/olia-top.owl") || oc.getURI().startsWith("http://purl.org/olia/olia.owl")) {
286 //}
287 }
288 //Copy the hierarchy
289 model.add(hierarchy);
290 }
291 }
292 if (posTags.size() > 1) {
293 log.warn("several posTags " + posTags + " found for " + w.getURI());
294 }
295 }
296 mon.stop();
297 log.debug("Finished addition of OLiA Classes and Hierarchy " + (model.size() - previous) + " triples added, " + mon.getLastValue() + " ms.)");
298
299 log.info("added ");
300 }
301 }