001    package org.nlp2rdf.annotator;
002    
003    import com.hp.hpl.jena.ontology.OntClass;
004    import com.hp.hpl.jena.ontology.OntModel;
005    import com.hp.hpl.jena.ontology.OntModelSpec;
006    import com.hp.hpl.jena.rdf.model.ModelFactory;
007    import com.hp.hpl.jena.util.iterator.ExtendedIterator;
008    import edu.stanford.nlp.ling.CoreAnnotations.*;
009    import edu.stanford.nlp.ling.CoreLabel;
010    import edu.stanford.nlp.pipeline.Annotation;
011    import edu.stanford.nlp.pipeline.StanfordCoreNLP;
012    import edu.stanford.nlp.util.CoreMap;
013    import opennlp.tools.util.Span;
014    import org.nlp2rdf.core.SentencePOJO;
015    import org.nlp2rdf.core.Text2RDF;
016    import org.nlp2rdf.core.URIGenerator;
017    import org.nlp2rdf.core.WordPOJO;
018    import org.nlp2rdf.core.impl.OffsetBased;
019    import org.nlp2rdf.ontology.olia.OLiAManager;
020    import org.nlp2rdf.ontology.olia.OLiAOntology;
021    import org.slf4j.Logger;
022    import org.slf4j.LoggerFactory;
023    
024    import java.util.ArrayList;
025    import java.util.List;
026    import java.util.Properties;
027    import java.util.Set;
028    
029    /**
030     * The basic code was taken from the ClearTK Project
031     * http://code.google.com/p/cleartk
032     * who have written a UIMA wrapper.
033     * The original file by Steven Bethard can be found here:
034     * http://code.google.com/p/cleartk/source/browse/trunk/cleartk-stanford-corenlp/src/main/java/org/cleartk/stanford/StanfordCoreNLPAnnotator.java
035     * Licence http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
036     *
037     * @author Sebastian Hellmann - http://bis.informatik.uni-leipzig.de/SebastianHellmann
038     */
039    
040    public class StanfordCoreNLPAnnotator {
041        private static Logger log = LoggerFactory.getLogger(StanfordCoreNLPAnnotator.class);
042    
043        private final OLiAManager oLiAManager;
044        private final OLiAOntology penn;
045    
046        public StanfordCoreNLPAnnotator(OLiAManager oLiAManager) {
047            this.oLiAManager = oLiAManager;
048            penn = oLiAManager.getOLiAOntology("http://purl.oclc.org/olia/penn-link.rdf");
049        }
050    
051        public static void main(String[] args) {
052            OntModel m = ModelFactory.createOntologyModel(OntModelSpec.OWL_DL_MEM, ModelFactory.createDefaultModel());
053            new StanfordCoreNLPAnnotator(new OLiAManager()).process("http://test/test/", "This is a sentence. ", m);
054            System.out.println(m);
055        }
056    
057        public void process(String prefix, String text, OntModel diff) {
058            // creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution
059            Properties props = new Properties();
060            //props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
061            props.put("annotators", "tokenize, ssplit, pos, lemma");//, lemma, ner, parse, dcoref");
062            StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
063    
064            // create an empty Annotation just with the given text
065            Annotation document = new Annotation(text);
066    
067            // run all Annotators on this text
068            pipeline.annotate(document);
069    
070            // these are all the sentences in this document
071            // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
072            List<CoreMap> sentences = document.get(SentencesAnnotation.class);
073    
074            List<StanfordSentencePOJO> sentencePOJOs = new ArrayList<StanfordSentencePOJO>();
075    
076            for (CoreMap sentence : sentences) {
077    
078                // add the sentence annotation
079                StanfordSentencePOJO sentencePOJO = new StanfordSentencePOJO();
080                Span sspan = new Span(sentence.get(CharacterOffsetBeginAnnotation.class), sentence.get(CharacterOffsetEndAnnotation.class));
081                sentencePOJO.setText(sspan.getCoveredText(text).toString());
082                log.info(sspan.getCoveredText(text).toString());
083                sentencePOJO.setSpan(sspan);
084                List<WordPOJO> wordPojos = new ArrayList<WordPOJO>();
085                sentencePOJO.setWordPOJOs(wordPojos);
086                sentencePOJOs.add(sentencePOJO);
087    
088                // traversing the words in the current sentence
089                // a CoreLabel is a CoreMap with additional token-specific methods
090                for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
091                    StanfordWordPOJO wordPOJO = new StanfordWordPOJO();
092                    wordPojos.add(wordPOJO);
093                    Span tmp = new Span(token.get(CharacterOffsetBeginAnnotation.class), token.get(CharacterOffsetEndAnnotation.class));
094                    //Span wspan = new Span(tmp, sspan.getStart());
095                    Span wspan = tmp;
096                    wordPOJO.setSpan(wspan);
097                    wordPOJO.setText(wspan.getCoveredText(text).toString());
098                    log.info(wordPOJO.getText());
099                    // this is the POS tag of the token
100                    wordPOJO.setPos(token.get(PartOfSpeechAnnotation.class));
101                    // this is the NER label of the token
102                    wordPOJO.setNe(token.get(NamedEntityTagAnnotation.class));
103                    wordPOJO.setLemma(token.get(LemmaAnnotation.class));
104    
105                }
106    
107                //TODO this is the parse tree of the current sentence
108                //Tree tree = sentence.get(TreeAnnotation.class);
109    
110                //TODO this is the Stanford dependency graph of the current sentence
111                //SemanticGraph dependencies = sentence.get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class);
112            }
113    
114            ///
115            /// Make NIF-OWL
116            ///
117            diff.setNsPrefix("sso", Text2RDF.structuredSentenceOntologyUrl);
118            diff.setNsPrefix("string", Text2RDF.stringOntologyUrl);
119            URIGenerator offsetBased = new OffsetBased();
120            offsetBased.init(text, SentencePOJO.getSpans(sentencePOJOs));
121    
122            //the values uri in SentencePOJO or WordPOJO are set now
123            new Text2RDF().sentencePOJOs2OWL(prefix, text, sentencePOJOs, diff, offsetBased);
124    
125            long size = diff.size();
126            for (StanfordSentencePOJO s : sentencePOJOs) {
127                for (WordPOJO w : s.getWordPOJOs()) {
128                    StanfordWordPOJO sw = (StanfordWordPOJO) w;
129                    //adding lemmas
130                    sw.getWord().addLemma(sw.getLemma());
131    
132                    String posTag = sw.getPos();
133                    //adding the plain old string annotation
134                    sw.getWord().addPosTag(posTag);
135    
136                    //adding pos classes from olia and olia-top
137                    Set<String> classes = penn.getClassURIsForTag(posTag);
138                    for (String classUri : classes) {
139                        log.info("found: " + classUri + " for: " + posTag);
140                        OntModel hierarchy = penn.getHierarchy(classUri);
141                        for (ExtendedIterator<OntClass> it = hierarchy.listClasses(); it.hasNext(); ) {
142                            OntClass oc = it.next();
143                            if (oc.getURI().startsWith("http://purl.oclc.org/olia/olia-top.owl") || oc.getURI().startsWith("http://purl.oclc.org/olia/olia.owl")) {
144                                sw.getWord().addOntClass(diff.createResource(oc.getURI()));
145                            }
146                        }
147                    }
148                }
149            }
150            log.info("Added lemma, pos, olia having " + (diff.size() - size) + " more triples.");
151            // this is the coreference link graph
152            // each link stores an arc in the graph; the first element in the Pair is the source, the second is the target
153            // each node is stored as <sentence id, token id>. Both offsets start at 1!
154            //List<Pair<IntTuple, IntTuple>> graph = document.get(CorefCoreAnnotations.CorefGraphAnnotation.class);
155        }
156    
157        //subclassing the POJOs
158    
159        public class StanfordSentencePOJO extends SentencePOJO {
160    
161        }
162    
163        public class StanfordWordPOJO extends WordPOJO {
164            private String pos;
165            private String ne;
166            private String lemma;
167    
168            public String getLemma() {
169                return lemma;
170            }
171    
172            public void setLemma(String lemma) {
173                this.lemma = lemma;
174            }
175    
176            public String getPos() {
177                return pos;
178            }
179    
180            public void setPos(String pos) {
181                this.pos = pos;
182            }
183    
184            public String getNe() {
185                return ne;
186            }
187    
188            public void setNe(String ne) {
189                this.ne = ne;
190            }
191        }
192    
193    }
194    /* @Override
195       public void process(String text) {
196           Annotation document = this.processor.process(text);
197    
198           String lastNETag = "O";
199           int lastNEBegin = -1;
200           int lastNEEnd = -1;
201           for (CoreMap tokenAnn : document.get(TokensAnnotation.class)) {
202    
203               // create the token annotation
204               int begin = tokenAnn.get(CharacterOffsetBeginAnnotation.class);
205               int end = tokenAnn.get(CharacterOffsetEndAnnotation.class);
206               String pos = tokenAnn.get(PartOfSpeechAnnotation.class);
207               String lemma = tokenAnn.get(LemmaAnnotation.class);
208    
209                Token token = new Token(jCas, begin, end);
210                 token.setPos(pos);
211                 token.setLemma(lemma);
212                 token.addToIndexes();
213               // hackery to convert token-level named entity tag into phrase-level tag
214               String neTag = tokenAnn.get(NamedEntityTagAnnotation.class);
215               if (neTag.equals("O") && !lastNETag.equals("O")) {
216                  NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
217                  ne.setMentionType(lastNETag);
218                  ne.addToIndexes();
219    
220               } else {
221                   if (lastNETag.equals("O")) {
222                       lastNEBegin = begin;
223                   } else if (lastNETag.equals(neTag)) {
224                       // do nothing - begin was already set
225                   } else {
226                       NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
227                       ne.setMentionType(lastNETag);
228                       ne.addToIndexes();
229                       lastNEBegin = begin;
230                   }
231                   lastNEEnd = end;
232               }
233               lastNETag = neTag;
234           }
235           if (!lastNETag.equals("O")) {
236               NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
237               ne.setMentionType(lastNETag);
238               ne.addToIndexes();
239           }
240    
241           // add sentences and trees
242           for (CoreMap sentenceAnn : document.get(SentencesAnnotation.class)) {
243    
244               // add the sentence annotation
245               int sentBegin = sentenceAnn.get(CharacterOffsetBeginAnnotation.class);
246               int sentEnd = sentenceAnn.get(CharacterOffsetEndAnnotation.class);
247               Sentence sentence = new Sentence(jCas, sentBegin, sentEnd);
248               sentence.addToIndexes();
249    
250               // add the syntactic tree annotation
251               List<CoreLabel> tokenAnns = sentenceAnn.get(TokensAnnotation.class);
252               Tree tree = sentenceAnn.get(TreeAnnotation.class);
253               if (tree.children().length != 1) {
254                   throw new RuntimeException("Expected single root node, found " + tree);
255               }
256               tree = tree.firstChild();
257               tree.indexSpans(0);
258               TopTreebankNode root = new TopTreebankNode(jCas);
259               root.setTreebankParse(tree.toString());
260               // TODO: root.setTerminals(v)
261               this.addTreebankNodeToIndexes(root, jCas, tree, tokenAnns);
262    
263               // get the dependencies
264               SemanticGraph dependencies = sentenceAnn.get(CollapsedCCProcessedDependenciesAnnotation.class);
265    
266               // convert Stanford nodes to UIMA annotations
267               List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence);
268               Map<IndexedWord, DependencyNode> stanfordToUima = new HashMap<IndexedWord, DependencyNode>();
269               for (IndexedWord stanfordNode : dependencies.vertexSet()) {
270                   int indexBegin = stanfordNode.get(BeginIndexAnnotation.class);
271                   int indexEnd = stanfordNode.get(EndIndexAnnotation.class);
272                   int tokenBegin = tokens.get(indexBegin).getBegin();
273                   int tokenEnd = tokens.get(indexEnd - 1).getEnd();
274                   DependencyNode node;
275                   if (dependencies.getRoots().contains(stanfordNode)) {
276                       node = new TopDependencyNode(jCas, tokenBegin, tokenEnd);
277                   } else {
278                       node = new DependencyNode(jCas, tokenBegin, tokenEnd);
279                   }
280                   stanfordToUima.put(stanfordNode, node);
281               }
282    
283               // create relation annotations for each Stanford dependency
284               ArrayListMultimap<DependencyNode, DependencyRelation> headRelations = ArrayListMultimap.create();
285               ArrayListMultimap<DependencyNode, DependencyRelation> childRelations = ArrayListMultimap.create();
286               for (SemanticGraphEdge stanfordEdge : dependencies.edgeList()) {
287                   DependencyRelation relation = new DependencyRelation(jCas);
288                   DependencyNode head = stanfordToUima.get(stanfordEdge.getGovernor());
289                   DependencyNode child = stanfordToUima.get(stanfordEdge.getDependent());
290                   String relationType = stanfordEdge.getRelation().toString();
291                   if (head == null || child == null || relationType == null) {
292                       throw new RuntimeException(String.format("null elements not allowed in relation:\nrelation=%s\nchild=%s\nhead=%s\n", relation, child, head));
293                   }
294                   relation.setHead(head);
295                   relation.setChild(child);
296                   relation.setRelation(relationType);
297                   relation.addToIndexes();
298                   headRelations.put(child, relation);
299                   childRelations.put(head, relation);
300               }
301    
302               // set the relations for each node annotation
303               for (DependencyNode node : stanfordToUima.values()) {
304                   node.setHeadRelations(UIMAUtil.toFSArray(jCas, headRelations.get(node)));
305                   node.setChildRelations(UIMAUtil.toFSArray(jCas, childRelations.get(node)));
306                   node.addToIndexes();
307               }
308           }
309    
310           // map from tokens to their smallest containing named entity mentions
311           Map<Span, NamedEntityMention> tokenMentionMap = new HashMap<Span, NamedEntityMention>();
312           for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
313               for (Token token : JCasUtil.selectCovered(jCas, Token.class, mention)) {
314                   Span span = new Span(token.getBegin(), token.getEnd());
315                   NamedEntityMention oldMention = tokenMentionMap.get(span);
316                   if (oldMention == null || AnnotationUtil.size(mention) < AnnotationUtil.size(oldMention)) {
317                       tokenMentionMap.put(span, mention);
318                   }
319               }
320           }
321    
322           // add mentions for all entities identified by the coreference system
323           CorefGraph corefGraph = new CorefGraph(document.get(CorefGraphAnnotation.class));
324           Map<CoreMap, NamedEntityMention> stanfordToUimaNE = new HashMap<CoreMap, NamedEntityMention>();
325           for (CoreMap tokenMap : corefGraph.getMentions(document)) {
326               NamedEntityMention mention = null;
327    
328               // figure out the character span of the token
329               int begin = tokenMap.get(CharacterOffsetBeginAnnotation.class);
330               int end = tokenMap.get(CharacterOffsetEndAnnotation.class);
331    
332               // if a named entity already contains the token, use that
333               mention = tokenMentionMap.get(new Span(begin, end));
334    
335               // otherwise, create a new named entity mention
336               if (mention == null) {
337                   Token token = new Token(jCas, begin, end);
338                   for (TreebankNode node : JCasUtil.selectCovered(jCas, TreebankNode.class, token)) {
339                       // if the token is a PRP, use that
340                       if (node.getNodeType().startsWith("PRP")) {
341                           begin = node.getBegin();
342                           end = node.getEnd();
343                           break;
344                       }
345                       // if the token's parent is an NP, use that
346                       TreebankNode parent = node.getParent();
347                       if (node.getLeaf() && parent != null && parent.getNodeType().equals("NP")) {
348                           begin = parent.getBegin();
349                           end = parent.getEnd();
350                           break;
351                       }
352                   }
353                   // create the named entity mention (defaulting to the same span as the token)
354                   mention = new NamedEntityMention(jCas, begin, end);
355                   mention.addToIndexes();
356               }
357    
358               // update the token -> mention mapping
359               stanfordToUimaNE.put(tokenMap, mention);
360           }
361    
362           // link mentions into their entities
363           List<NamedEntity> entities = new ArrayList<NamedEntity>();
364           for (Set<CoreMap> tokenMaps : corefGraph.getEntities(document)) {
365    
366               // sort mentions by document order
367               List<CoreMap> tokenMapsList = new ArrayList<CoreMap>(tokenMaps);
368               Collections.sort(tokenMapsList, new Comparator<CoreMap>() {
369                   @Override
370                   public int compare(CoreMap o1, CoreMap o2) {
371                       int begin1 = o1.get(CharacterOffsetBeginAnnotation.class);
372                       int begin2 = o2.get(CharacterOffsetBeginAnnotation.class);
373                       return begin1 - begin2;
374                   }
375               });
376    
377               // create mentions and add them to entity
378               NamedEntity entity = new NamedEntity(jCas);
379               entity.setMentions(new FSArray(jCas, tokenMapsList.size()));
380               int index = 0;
381               for (CoreMap tokenMap : tokenMapsList) {
382                   NamedEntityMention mention = stanfordToUimaNE.get(tokenMap);
383                   mention.setMentionedEntity(entity);
384                   entity.setMentions(index, mention);
385                   index += 1;
386               }
387               entities.add(entity);
388           }
389    
390           // add singleton entities for any named entities not picked up by coreference system
391           for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
392               if (mention.getMentionedEntity() == null) {
393                   NamedEntity entity = new NamedEntity(jCas);
394                   entity.setMentions(new FSArray(jCas, 1));
395                   entity.setMentions(0, mention);
396                   mention.setMentionedEntity(entity);
397                   entity.getMentions();
398                   entities.add(entity);
399               }
400           }
401    
402           // sort entities by document order
403           Collections.sort(entities, new Comparator<NamedEntity>() {
404               @Override
405               public int compare(NamedEntity o1, NamedEntity o2) {
406                   return getFirstBegin(o1) - getFirstBegin(o2);
407               }
408    
409               private int getFirstBegin(NamedEntity entity) {
410                   int min = Integer.MAX_VALUE;
411                   for (NamedEntityMention mention : JCasUtil.select(entity.getMentions(), NamedEntityMention.class)) {
412                       if (mention.getBegin() < min) {
413                           min = mention.getBegin();
414                       }
415                   }
416                   return min;
417               }
418           });
419    
420           // add entities to document
421           for (NamedEntity entity : entities) {
422               entity.addToIndexes();
423           }
424    
425       }
426    
427       private FSArray addTreebankNodeChildrenToIndexes(TreebankNode parent, JCas jCas, List<CoreLabel> tokenAnns, Tree tree) {
428           Tree[] childTrees = tree.children();
429    
430           // collect all children (except leaves, which are just the words - POS tags are pre-terminals in
431           // a Stanford tree)
432           List<TreebankNode> childNodes = new ArrayList<TreebankNode>();
433           for (Tree child : childTrees) {
434               if (!child.isLeaf()) {
435    
436                   // set node attributes and add children (mutual recursion)
437                   TreebankNode node = new TreebankNode(jCas);
438                   node.setParent(parent);
439                   this.addTreebankNodeToIndexes(node, jCas, child, tokenAnns);
440                   childNodes.add(node);
441               }
442           }
443    
444           // convert the child list into an FSArray
445           FSArray childNodeArray = new FSArray(jCas, childNodes.size());
446           for (int i = 0; i < childNodes.size(); ++i) {
447               childNodeArray.set(i, childNodes.get(i));
448           }
449           return childNodeArray;
450       }
451    
452       private void addTreebankNodeToIndexes(TreebankNode node, JCas jCas, Tree tree, List<CoreLabel> tokenAnns) {
453           // figure out begin and end character offsets
454           CoreMap label = (CoreMap) tree.label();
455           CoreMap beginToken = tokenAnns.get(label.get(BeginIndexAnnotation.class));
456           CoreMap endToken = tokenAnns.get(label.get(EndIndexAnnotation.class) - 1);
457           int nodeBegin = beginToken.get(CharacterOffsetBeginAnnotation.class);
458           int nodeEnd = endToken.get(CharacterOffsetEndAnnotation.class);
459    
460           // set span, node type, children (mutual recursion), and add it to the JCas
461           node.setBegin(nodeBegin);
462           node.setEnd(nodeEnd);
463           node.setNodeType(tree.value());
464           node.setChildren(this.addTreebankNodeChildrenToIndexes(node, jCas, tokenAnns, tree));
465           node.setLeaf(node.getChildren().size() == 0);
466           node.addToIndexes();
467       }
468    
469       private static class Span {
470           public int begin;
471    
472           public int end;
473    
474           public Span(int begin, int end) {
475               this.begin = begin;
476               this.end = end;
477           }
478    
479           public boolean equals(Object object) {
480               if (object instanceof Span) {
481                   Span that = (Span) object;
482                   return this.begin == that.begin && this.end == that.end;
483               } else {
484                   return false;
485               }
486           }
487    
488           public int hashCode() {
489               return Arrays.hashCode(new int[]{this.begin, this.end});
490           }
491       }
492    }
493    */
494    
495