001    /***************************************************************************/
002    /*  Copyright (C) 2010-2011, Sebastian Hellmann                            */
003    /*  Note: If you need parts of NLP2RDF in another licence due to licence   */
004    /*  incompatibility, please mail hellmann@informatik.uni-leipzig.de        */
005    /*                                                                         */
006    /*  This file is part of NLP2RDF.                                          */
007    /*                                                                         */
008    /*  NLP2RDF is free software; you can redistribute it and/or modify        */
009    /*  it under the terms of the GNU General Public License as published by   */
010    /*  the Free Software Foundation; either version 3 of the License, or      */
011    /*  (at your option) any later version.                                    */
012    /*                                                                         */
013    /*  NLP2RDF is distributed in the hope that it will be useful,             */
014    /*  but WITHOUT ANY WARRANTY; without even the implied warranty of         */
015    /*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the           */
016    /*  GNU General Public License for more details.                           */
017    /*                                                                         */
018    /*  You should have received a copy of the GNU General Public License      */
019    /*  along with this program. If not, see <http://www.gnu.org/licenses/>.   */
020    /***************************************************************************/
021    
022    package org.nlp2rdf.core;
023    
024    import com.hp.hpl.jena.ontology.OntClass;
025    import com.hp.hpl.jena.ontology.OntModel;
026    import com.hp.hpl.jena.ontology.OntModelSpec;
027    import com.hp.hpl.jena.rdf.model.ModelFactory;
028    import com.hp.hpl.jena.util.iterator.ExtendedIterator;
029    import com.jamonapi.Monitor;
030    import com.jamonapi.MonitorFactory;
031    import eu.lod2.nlp2rdf.schema.sso.Phrase;
032    import eu.lod2.nlp2rdf.schema.sso.Sentence;
033    import eu.lod2.nlp2rdf.schema.sso.Word;
034    import eu.lod2.nlp2rdf.schema.str.ContextHashBasedString;
035    import eu.lod2.nlp2rdf.schema.str.Document;
036    import eu.lod2.nlp2rdf.schema.str.IString;
037    import eu.lod2.nlp2rdf.schema.str.OffsetBasedString;
038    import eu.lod2.nlp2rdf.schema.tools.Factory;
039    import org.mindswap.pellet.Individual;
040    import org.nlp2rdf.core.impl.MD5Based;
041    import org.nlp2rdf.core.util.URIComparator;
042    import org.nlp2rdf.core.util.URIGeneratorHelper;
043    import org.nlp2rdf.ontology.olia.OLiAOntology;
044    import org.slf4j.Logger;
045    import org.slf4j.LoggerFactory;
046    
047    import java.lang.reflect.Method;
048    import java.security.InvalidParameterException;
049    import java.util.*;
050    
051    /**
052     * @author Sebastian Hellmann
053     */
054    public class Text2RDF {
055        private static Logger log = LoggerFactory.getLogger(Text2RDF.class);
056    
057        public static final String stringOntologyUrl = "http://nlp2rdf.lod2.eu/schema/string/";
058        public static final String structuredSentenceOntologyUrl = "http://nlp2rdf.lod2.eu/schema/sso/";
059    
060        static {
061            Factory.registerCustomClasses();
062        }
063    
064        /**
065         * @param prefix
066         * @param text
067         * @param uriGenerator
068         * @param model
069         * @return
070         */
071        public Document createDocumentAnnotation(String prefix, String text, URIGenerator uriGenerator, OntModel model) {
072            //make the uri and add the class for the URI recipe
073            String uri = uriGenerator.makeUri(prefix, text, new Span(0, text.length()), model);
074            uriGenerator.assignRecipeClass(uri, model);
075            //wrap it in a document
076            Document d = Document.create(uri, model);
077            d.setSourceString(text);
078            return d;
079        }
080    
081        /**
082         * This is a convenience function, which does quite a lot:
083         * 1. generates the uri
084         * 2. add the recipe class, i.e. OffsetBased or ContexthashBased
085         * 3. adds the class which is given in class (must mbe from owl2java)
086         * 4. adds the anchorOf annotation
087         * Note: all changes are also reflected in model
088         * Note: if something goes wrong this method catches all exceptions, logs it and then re throws it as a runtime exception
089         *
090         * @param cl
091         * @param prefix
092         * @param text         the whole (reference) text
093         * @param span         the span for the annotation
094         * @param uriGenerator
095         * @param model
096         * @param <S>
097         * @return the Jena Individual with the Type give in cl
098         */
099        public <S> S createStringAnnotationForClass(Class<S> cl, String prefix, String text, Span span, URIGenerator uriGenerator, OntModel model) {
100            Monitor mon = MonitorFactory.getTimeMonitor("createStringAnnotationForClass");
101            mon.start();
102            try {
103    
104                //1. make the uri and add the class for the URI recipe
105                String uri = uriGenerator.makeUri(prefix, text, span, model);
106                //2.assign class
107                uriGenerator.assignRecipeClass(uri, model);
108                Class[] argTypes = new Class[]{String.class, OntModel.class};
109                Method create = cl.getDeclaredMethod("create", argTypes);
110    
111                Object s = create.invoke(null, uri, model);
112    
113                String addressedString = (span.getCoveredText(text).toString());
114                if (s instanceof Sentence) {
115                    ((Sentence) s).setAnchorOf(addressedString);
116                } else if (s instanceof Phrase) {
117                    ((Phrase) s).setAnchorOf(addressedString);
118                } else if (s instanceof Word) {
119                    ((Word) s).setAnchorOf(addressedString);
120                } else if (s instanceof OffsetBasedString) {
121                    ((OffsetBasedString) s).setAnchorOf(addressedString);
122                } else if (s instanceof ContextHashBasedString) {
123                    ((ContextHashBasedString) s).setAnchorOf(addressedString);
124                } else {
125                    String message = "Class was not Word, Phrase or Sentence";
126                    log.error(message);
127                    throw new InvalidParameterException(message);
128                }
129                if (log.isTraceEnabled()) {
130                    log.trace("Added " + cl.getSimpleName() + " for " + uri);
131                }
132                return (S) s;
133            } catch (Exception e) {
134                log.error(e.getMessage(), e);
135                throw new RuntimeException(e.getMessage(), e);
136            } finally {
137                mon.stop();
138            }
139        }
140    
141    
142        /**
143         * adds sso:Sentence and sso:Word to it
144         * adds the word property and additionally, though optional, the firstWord and lastWord property
145         * <p/>
146         * Note that the expected tokenizer matches the opennlp tokenizer well,
147         * so instead of forcing a tokenizer to match the interface it might be smarter to rewrite this method.
148         *
149         * @param prefix
150         * @param text
151         * @param tokenizedText
152         * @param uriGenerator
153         * @param document      use null if you want to opt out
154         * @param model
155         * @return the model filled with NIF
156         */
157        public OntModel generateNIFModel(String prefix, String text, TreeMap<Span, List<Span>> tokenizedText, URIGenerator uriGenerator, Document document, OntModel model) {
158            assert tokenizedText != null && text != null && uriGenerator != null && prefix != null;
159            //some stats
160            Monitor mon = MonitorFactory.getTimeMonitor("generateBasicNIFModel").start();
161            int wordCount = 0;
162            try {
163                //set basic prefixes
164                model.setNsPrefix("sso", structuredSentenceOntologyUrl);
165                model.setNsPrefix("str", stringOntologyUrl);
166                for (Span sentenceSpan : tokenizedText.descendingKeySet()) {
167                    Sentence sentence = createStringAnnotationForClass(Sentence.class, prefix, text, sentenceSpan, uriGenerator, model);
168                    //assign str:substring to document
169                    if (document != null) {
170                        document.addSubString(sentence);
171                    }
172    
173    
174                    //detect words
175                    List<Span> wordSpans = new ArrayList<Span>(tokenizedText.get(sentenceSpan));
176                    wordCount += wordSpans.size();
177                    for (int i = 0; i < wordSpans.size(); i++) {
178                        Span wordSpan = wordSpans.get(i);
179                        // Span absoluteWordSpan = new Span(wordSpan, sentenceSpan.getStart());
180                        Word word = createStringAnnotationForClass(Word.class, prefix, text, wordSpan, uriGenerator, model);
181                        //add the firstWord property (optional and redundant to sso:word)
182                        if (i == 0) {
183                            sentence.setFirstWord(word);
184                        }
185                        //add the lasttWord property (optional and redundant to sso:word)
186                        if (i == (wordSpans.size() - 1)) {
187                            sentence.setLastWord(word);
188                        }
189                        //this is important for the str:subStringTrans inference
190                        sentence.addWord(word);
191    
192                        if (log.isTraceEnabled()) {
193                            StringBuilder logging = new StringBuilder();
194                            logging.append("\nword: " + wordSpan.getCoveredText(text));
195                            logging.append("\nabsolute sentence position [start|end]: " + sentenceSpan.getStart() + "|" + sentenceSpan.getEnd());
196                            logging.append("\nabsolute word position [start|end]: " + wordSpan.getStart() + "|" + wordSpan.getEnd());
197                            log.trace(logging.toString());
198                        }
199                    }
200                }
201                return model;
202            } finally {
203                mon.stop();
204                log.debug("Finished creating " + tokenizedText.size() + " sentence with " + wordCount + " words, " + mon.getLastValue() + " ms.) ");
205            }
206        }
207    
208    
209        /*public void addAdditionalProperties(String prefix, String text, TreeMap<Span, Span[]> sentencesAndWords, URIGenerator uriGenerator, OntModel m) {
210    
211            List<IString> sentences = new ArrayList<IString>(Sentence.list(m));
212            List<Span> spans = URIGeneratorHelper.getSpans(sentences, prefix, text, uriGenerator);
213            Collections.sort(spans, new Comparator<Span>() {
214                @Override
215                public int compare(Span span, Span span1) {
216                    return span.compareTo(span1);
217                }
218            });
219        } */
220    
221    
222        public TreeMap<Span, List<Span>> getTokenization(String prefix, String text, URIGenerator uriGenerator, OntModel model) {
223            TreeMap<Span, List<Span>> tokenizedText = new TreeMap<Span, List<Span>>();
224    
225            for (Sentence sentence : Sentence.list(model)) {
226                Span sentenceSpan = uriGenerator.getSpanFor(prefix, sentence.getURI(), text);
227                List<Span> wordSpans = new ArrayList<Span>();
228                for (Word word : sentence.listWord()) {
229                    Span wordSpan = uriGenerator.getSpanFor(prefix, word.getURI(), text);
230                    wordSpans.add(wordSpan);
231                }
232                tokenizedText.put(sentenceSpan, wordSpans);
233            }
234            return tokenizedText;
235        }
236    
237    
238        public void addNextAndPreviousProperties(String prefix, String text, URIGenerator uriGenerator, OntModel model) {
239            Monitor mon = MonitorFactory.getTimeMonitor("addNextAndPreviousProperties").start();
240            long previous = model.size();
241            List<Sentence> sentences = Sentence.list(model);
242            Collections.sort(sentences, new URIComparator(prefix, text, uriGenerator));
243            for (int x = 0; x < sentences.size(); x++) {
244                Sentence sentence = sentences.get(x);
245                List<Word> words = sentence.listWord();
246                Collections.sort(sentences, new URIComparator(prefix, text, uriGenerator));
247                if (x < sentences.size() - 1) {
248                    //not the last one
249                    sentence.setNextSentence(sentences.get(x + 1));
250                }
251    
252                for (int y = 0; y < words.size(); y++) {
253                    Word word = words.get(y);
254                    //not the last one
255                    if (y < words.size() - 1) {
256                        word.setNextWord(words.get(y + 1));
257                    }
258                }
259            }
260    
261            mon.stop();
262            log.debug("Finished addition of next/previous properties " + (model.size() - previous) + " triples added, " + mon.getLastValue() + " ms.)");
263        }
264    
265    
266        public void addCopyOfOLiAClassesAndHierarchy(OLiAOntology olia, OntModel model) {
267            Monitor mon = MonitorFactory.getTimeMonitor("addCopyOfOLiAClassesAndHierarchy").start();
268            long previous = model.size();
269            List<Word> words = Word.list(model);
270            for (Word w : words) {
271                List<String> posTags = w.listPosTag();
272                if (posTags.size() >= 1) {
273                    //get first ignore the others
274                    String posTag = posTags.get(0);
275                    //adding pos classes from olia and olia-top
276                    Set<String> classes = olia.getClassURIsForTag(posTag);
277                    for (String classUri : classes) {
278                        log.info("found: " + classUri + " for: " + posTag);
279                        OntModel hierarchy = olia.getHierarchy(classUri);
280                        for (ExtendedIterator<OntClass> it = hierarchy.listClasses(); it.hasNext(); ) {
281                            OntClass oc = it.next();
282                            //add the type
283                            w.addOntClass(model.createResource(oc.getURI()));
284                            //use all classes
285                            //if (oc.getURI().startsWith("http://purl.org/olia/olia-top.owl") || oc.getURI().startsWith("http://purl.org/olia/olia.owl")) {
286                            //}
287                        }
288                        //Copy the hierarchy
289                        model.add(hierarchy);
290                    }
291                }
292                if (posTags.size() > 1) {
293                    log.warn("several posTags " + posTags + " found for " + w.getURI());
294                }
295            }
296            mon.stop();
297            log.debug("Finished addition of OLiA Classes and Hierarchy " + (model.size() - previous) + " triples added, " + mon.getLastValue() + " ms.)");
298    
299            log.info("added ");
300        }
301    }