001 package org.nlp2rdf.annotator;
002
003 import com.hp.hpl.jena.ontology.OntClass;
004 import com.hp.hpl.jena.ontology.OntModel;
005 import com.hp.hpl.jena.ontology.OntModelSpec;
006 import com.hp.hpl.jena.rdf.model.ModelFactory;
007 import com.hp.hpl.jena.util.iterator.ExtendedIterator;
008 import edu.stanford.nlp.ling.CoreAnnotations.*;
009 import edu.stanford.nlp.ling.CoreLabel;
010 import edu.stanford.nlp.pipeline.Annotation;
011 import edu.stanford.nlp.pipeline.StanfordCoreNLP;
012 import edu.stanford.nlp.util.CoreMap;
013 import opennlp.tools.util.Span;
014 import org.nlp2rdf.core.SentencePOJO;
015 import org.nlp2rdf.core.Text2RDF;
016 import org.nlp2rdf.core.URIGenerator;
017 import org.nlp2rdf.core.WordPOJO;
018 import org.nlp2rdf.core.impl.OffsetBased;
019 import org.nlp2rdf.ontology.olia.OLiAManager;
020 import org.nlp2rdf.ontology.olia.OLiAOntology;
021 import org.slf4j.Logger;
022 import org.slf4j.LoggerFactory;
023
024 import java.util.ArrayList;
025 import java.util.List;
026 import java.util.Properties;
027 import java.util.Set;
028
029 /**
030 * The basic code was taken from the ClearTK Project
031 * http://code.google.com/p/cleartk
032 * who have written a UIMA wrapper.
033 * The original file by Steven Bethard can be found here:
034 * http://code.google.com/p/cleartk/source/browse/trunk/cleartk-stanford-corenlp/src/main/java/org/cleartk/stanford/StanfordCoreNLPAnnotator.java
035 * Licence http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
036 *
037 * @author Sebastian Hellmann - http://bis.informatik.uni-leipzig.de/SebastianHellmann
038 */
039
040 public class StanfordCoreNLPAnnotator {
041 private static Logger log = LoggerFactory.getLogger(StanfordCoreNLPAnnotator.class);
042
043 private final OLiAManager oLiAManager;
044 private final OLiAOntology penn;
045
046 public StanfordCoreNLPAnnotator(OLiAManager oLiAManager) {
047 this.oLiAManager = oLiAManager;
048 penn = oLiAManager.getOLiAOntology("http://purl.oclc.org/olia/penn-link.rdf");
049 }
050
051 public static void main(String[] args) {
052 OntModel m = ModelFactory.createOntologyModel(OntModelSpec.OWL_DL_MEM, ModelFactory.createDefaultModel());
053 new StanfordCoreNLPAnnotator(new OLiAManager()).process("http://test/test/", "This is a sentence. ", m);
054 System.out.println(m);
055 }
056
057 public void process(String prefix, String text, OntModel diff) {
058 // creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution
059 Properties props = new Properties();
060 //props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
061 props.put("annotators", "tokenize, ssplit, pos, lemma");//, lemma, ner, parse, dcoref");
062 StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
063
064 // create an empty Annotation just with the given text
065 Annotation document = new Annotation(text);
066
067 // run all Annotators on this text
068 pipeline.annotate(document);
069
070 // these are all the sentences in this document
071 // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
072 List<CoreMap> sentences = document.get(SentencesAnnotation.class);
073
074 List<StanfordSentencePOJO> sentencePOJOs = new ArrayList<StanfordSentencePOJO>();
075
076 for (CoreMap sentence : sentences) {
077
078 // add the sentence annotation
079 StanfordSentencePOJO sentencePOJO = new StanfordSentencePOJO();
080 Span sspan = new Span(sentence.get(CharacterOffsetBeginAnnotation.class), sentence.get(CharacterOffsetEndAnnotation.class));
081 sentencePOJO.setText(sspan.getCoveredText(text).toString());
082 log.info(sspan.getCoveredText(text).toString());
083 sentencePOJO.setSpan(sspan);
084 List<WordPOJO> wordPojos = new ArrayList<WordPOJO>();
085 sentencePOJO.setWordPOJOs(wordPojos);
086 sentencePOJOs.add(sentencePOJO);
087
088 // traversing the words in the current sentence
089 // a CoreLabel is a CoreMap with additional token-specific methods
090 for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
091 StanfordWordPOJO wordPOJO = new StanfordWordPOJO();
092 wordPojos.add(wordPOJO);
093 Span tmp = new Span(token.get(CharacterOffsetBeginAnnotation.class), token.get(CharacterOffsetEndAnnotation.class));
094 //Span wspan = new Span(tmp, sspan.getStart());
095 Span wspan = tmp;
096 wordPOJO.setSpan(wspan);
097 wordPOJO.setText(wspan.getCoveredText(text).toString());
098 log.info(wordPOJO.getText());
099 // this is the POS tag of the token
100 wordPOJO.setPos(token.get(PartOfSpeechAnnotation.class));
101 // this is the NER label of the token
102 wordPOJO.setNe(token.get(NamedEntityTagAnnotation.class));
103 wordPOJO.setLemma(token.get(LemmaAnnotation.class));
104
105 }
106
107 //TODO this is the parse tree of the current sentence
108 //Tree tree = sentence.get(TreeAnnotation.class);
109
110 //TODO this is the Stanford dependency graph of the current sentence
111 //SemanticGraph dependencies = sentence.get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class);
112 }
113
114 ///
115 /// Make NIF-OWL
116 ///
117 diff.setNsPrefix("sso", Text2RDF.structuredSentenceOntologyUrl);
118 diff.setNsPrefix("string", Text2RDF.stringOntologyUrl);
119 URIGenerator offsetBased = new OffsetBased();
120 offsetBased.init(text, SentencePOJO.getSpans(sentencePOJOs));
121
122 //the values uri in SentencePOJO or WordPOJO are set now
123 new Text2RDF().sentencePOJOs2OWL(prefix, text, sentencePOJOs, diff, offsetBased);
124
125 long size = diff.size();
126 for (StanfordSentencePOJO s : sentencePOJOs) {
127 for (WordPOJO w : s.getWordPOJOs()) {
128 StanfordWordPOJO sw = (StanfordWordPOJO) w;
129 //adding lemmas
130 sw.getWord().addLemma(sw.getLemma());
131
132 String posTag = sw.getPos();
133 //adding the plain old string annotation
134 sw.getWord().addPosTag(posTag);
135
136 //adding pos classes from olia and olia-top
137 Set<String> classes = penn.getClassURIsForTag(posTag);
138 for (String classUri : classes) {
139 log.info("found: " + classUri + " for: " + posTag);
140 OntModel hierarchy = penn.getHierarchy(classUri);
141 for (ExtendedIterator<OntClass> it = hierarchy.listClasses(); it.hasNext(); ) {
142 OntClass oc = it.next();
143 if (oc.getURI().startsWith("http://purl.oclc.org/olia/olia-top.owl") || oc.getURI().startsWith("http://purl.oclc.org/olia/olia.owl")) {
144 sw.getWord().addOntClass(diff.createResource(oc.getURI()));
145 }
146 }
147 }
148 }
149 }
150 log.info("Added lemma, pos, olia having " + (diff.size() - size) + " more triples.");
151 // this is the coreference link graph
152 // each link stores an arc in the graph; the first element in the Pair is the source, the second is the target
153 // each node is stored as <sentence id, token id>. Both offsets start at 1!
154 //List<Pair<IntTuple, IntTuple>> graph = document.get(CorefCoreAnnotations.CorefGraphAnnotation.class);
155 }
156
157 //subclassing the POJOs
158
159 public class StanfordSentencePOJO extends SentencePOJO {
160
161 }
162
163 public class StanfordWordPOJO extends WordPOJO {
164 private String pos;
165 private String ne;
166 private String lemma;
167
168 public String getLemma() {
169 return lemma;
170 }
171
172 public void setLemma(String lemma) {
173 this.lemma = lemma;
174 }
175
176 public String getPos() {
177 return pos;
178 }
179
180 public void setPos(String pos) {
181 this.pos = pos;
182 }
183
184 public String getNe() {
185 return ne;
186 }
187
188 public void setNe(String ne) {
189 this.ne = ne;
190 }
191 }
192
193 }
194 /* @Override
195 public void process(String text) {
196 Annotation document = this.processor.process(text);
197
198 String lastNETag = "O";
199 int lastNEBegin = -1;
200 int lastNEEnd = -1;
201 for (CoreMap tokenAnn : document.get(TokensAnnotation.class)) {
202
203 // create the token annotation
204 int begin = tokenAnn.get(CharacterOffsetBeginAnnotation.class);
205 int end = tokenAnn.get(CharacterOffsetEndAnnotation.class);
206 String pos = tokenAnn.get(PartOfSpeechAnnotation.class);
207 String lemma = tokenAnn.get(LemmaAnnotation.class);
208
209 Token token = new Token(jCas, begin, end);
210 token.setPos(pos);
211 token.setLemma(lemma);
212 token.addToIndexes();
213 // hackery to convert token-level named entity tag into phrase-level tag
214 String neTag = tokenAnn.get(NamedEntityTagAnnotation.class);
215 if (neTag.equals("O") && !lastNETag.equals("O")) {
216 NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
217 ne.setMentionType(lastNETag);
218 ne.addToIndexes();
219
220 } else {
221 if (lastNETag.equals("O")) {
222 lastNEBegin = begin;
223 } else if (lastNETag.equals(neTag)) {
224 // do nothing - begin was already set
225 } else {
226 NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
227 ne.setMentionType(lastNETag);
228 ne.addToIndexes();
229 lastNEBegin = begin;
230 }
231 lastNEEnd = end;
232 }
233 lastNETag = neTag;
234 }
235 if (!lastNETag.equals("O")) {
236 NamedEntityMention ne = new NamedEntityMention(jCas, lastNEBegin, lastNEEnd);
237 ne.setMentionType(lastNETag);
238 ne.addToIndexes();
239 }
240
241 // add sentences and trees
242 for (CoreMap sentenceAnn : document.get(SentencesAnnotation.class)) {
243
244 // add the sentence annotation
245 int sentBegin = sentenceAnn.get(CharacterOffsetBeginAnnotation.class);
246 int sentEnd = sentenceAnn.get(CharacterOffsetEndAnnotation.class);
247 Sentence sentence = new Sentence(jCas, sentBegin, sentEnd);
248 sentence.addToIndexes();
249
250 // add the syntactic tree annotation
251 List<CoreLabel> tokenAnns = sentenceAnn.get(TokensAnnotation.class);
252 Tree tree = sentenceAnn.get(TreeAnnotation.class);
253 if (tree.children().length != 1) {
254 throw new RuntimeException("Expected single root node, found " + tree);
255 }
256 tree = tree.firstChild();
257 tree.indexSpans(0);
258 TopTreebankNode root = new TopTreebankNode(jCas);
259 root.setTreebankParse(tree.toString());
260 // TODO: root.setTerminals(v)
261 this.addTreebankNodeToIndexes(root, jCas, tree, tokenAnns);
262
263 // get the dependencies
264 SemanticGraph dependencies = sentenceAnn.get(CollapsedCCProcessedDependenciesAnnotation.class);
265
266 // convert Stanford nodes to UIMA annotations
267 List<Token> tokens = JCasUtil.selectCovered(jCas, Token.class, sentence);
268 Map<IndexedWord, DependencyNode> stanfordToUima = new HashMap<IndexedWord, DependencyNode>();
269 for (IndexedWord stanfordNode : dependencies.vertexSet()) {
270 int indexBegin = stanfordNode.get(BeginIndexAnnotation.class);
271 int indexEnd = stanfordNode.get(EndIndexAnnotation.class);
272 int tokenBegin = tokens.get(indexBegin).getBegin();
273 int tokenEnd = tokens.get(indexEnd - 1).getEnd();
274 DependencyNode node;
275 if (dependencies.getRoots().contains(stanfordNode)) {
276 node = new TopDependencyNode(jCas, tokenBegin, tokenEnd);
277 } else {
278 node = new DependencyNode(jCas, tokenBegin, tokenEnd);
279 }
280 stanfordToUima.put(stanfordNode, node);
281 }
282
283 // create relation annotations for each Stanford dependency
284 ArrayListMultimap<DependencyNode, DependencyRelation> headRelations = ArrayListMultimap.create();
285 ArrayListMultimap<DependencyNode, DependencyRelation> childRelations = ArrayListMultimap.create();
286 for (SemanticGraphEdge stanfordEdge : dependencies.edgeList()) {
287 DependencyRelation relation = new DependencyRelation(jCas);
288 DependencyNode head = stanfordToUima.get(stanfordEdge.getGovernor());
289 DependencyNode child = stanfordToUima.get(stanfordEdge.getDependent());
290 String relationType = stanfordEdge.getRelation().toString();
291 if (head == null || child == null || relationType == null) {
292 throw new RuntimeException(String.format("null elements not allowed in relation:\nrelation=%s\nchild=%s\nhead=%s\n", relation, child, head));
293 }
294 relation.setHead(head);
295 relation.setChild(child);
296 relation.setRelation(relationType);
297 relation.addToIndexes();
298 headRelations.put(child, relation);
299 childRelations.put(head, relation);
300 }
301
302 // set the relations for each node annotation
303 for (DependencyNode node : stanfordToUima.values()) {
304 node.setHeadRelations(UIMAUtil.toFSArray(jCas, headRelations.get(node)));
305 node.setChildRelations(UIMAUtil.toFSArray(jCas, childRelations.get(node)));
306 node.addToIndexes();
307 }
308 }
309
310 // map from tokens to their smallest containing named entity mentions
311 Map<Span, NamedEntityMention> tokenMentionMap = new HashMap<Span, NamedEntityMention>();
312 for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
313 for (Token token : JCasUtil.selectCovered(jCas, Token.class, mention)) {
314 Span span = new Span(token.getBegin(), token.getEnd());
315 NamedEntityMention oldMention = tokenMentionMap.get(span);
316 if (oldMention == null || AnnotationUtil.size(mention) < AnnotationUtil.size(oldMention)) {
317 tokenMentionMap.put(span, mention);
318 }
319 }
320 }
321
322 // add mentions for all entities identified by the coreference system
323 CorefGraph corefGraph = new CorefGraph(document.get(CorefGraphAnnotation.class));
324 Map<CoreMap, NamedEntityMention> stanfordToUimaNE = new HashMap<CoreMap, NamedEntityMention>();
325 for (CoreMap tokenMap : corefGraph.getMentions(document)) {
326 NamedEntityMention mention = null;
327
328 // figure out the character span of the token
329 int begin = tokenMap.get(CharacterOffsetBeginAnnotation.class);
330 int end = tokenMap.get(CharacterOffsetEndAnnotation.class);
331
332 // if a named entity already contains the token, use that
333 mention = tokenMentionMap.get(new Span(begin, end));
334
335 // otherwise, create a new named entity mention
336 if (mention == null) {
337 Token token = new Token(jCas, begin, end);
338 for (TreebankNode node : JCasUtil.selectCovered(jCas, TreebankNode.class, token)) {
339 // if the token is a PRP, use that
340 if (node.getNodeType().startsWith("PRP")) {
341 begin = node.getBegin();
342 end = node.getEnd();
343 break;
344 }
345 // if the token's parent is an NP, use that
346 TreebankNode parent = node.getParent();
347 if (node.getLeaf() && parent != null && parent.getNodeType().equals("NP")) {
348 begin = parent.getBegin();
349 end = parent.getEnd();
350 break;
351 }
352 }
353 // create the named entity mention (defaulting to the same span as the token)
354 mention = new NamedEntityMention(jCas, begin, end);
355 mention.addToIndexes();
356 }
357
358 // update the token -> mention mapping
359 stanfordToUimaNE.put(tokenMap, mention);
360 }
361
362 // link mentions into their entities
363 List<NamedEntity> entities = new ArrayList<NamedEntity>();
364 for (Set<CoreMap> tokenMaps : corefGraph.getEntities(document)) {
365
366 // sort mentions by document order
367 List<CoreMap> tokenMapsList = new ArrayList<CoreMap>(tokenMaps);
368 Collections.sort(tokenMapsList, new Comparator<CoreMap>() {
369 @Override
370 public int compare(CoreMap o1, CoreMap o2) {
371 int begin1 = o1.get(CharacterOffsetBeginAnnotation.class);
372 int begin2 = o2.get(CharacterOffsetBeginAnnotation.class);
373 return begin1 - begin2;
374 }
375 });
376
377 // create mentions and add them to entity
378 NamedEntity entity = new NamedEntity(jCas);
379 entity.setMentions(new FSArray(jCas, tokenMapsList.size()));
380 int index = 0;
381 for (CoreMap tokenMap : tokenMapsList) {
382 NamedEntityMention mention = stanfordToUimaNE.get(tokenMap);
383 mention.setMentionedEntity(entity);
384 entity.setMentions(index, mention);
385 index += 1;
386 }
387 entities.add(entity);
388 }
389
390 // add singleton entities for any named entities not picked up by coreference system
391 for (NamedEntityMention mention : JCasUtil.select(jCas, NamedEntityMention.class)) {
392 if (mention.getMentionedEntity() == null) {
393 NamedEntity entity = new NamedEntity(jCas);
394 entity.setMentions(new FSArray(jCas, 1));
395 entity.setMentions(0, mention);
396 mention.setMentionedEntity(entity);
397 entity.getMentions();
398 entities.add(entity);
399 }
400 }
401
402 // sort entities by document order
403 Collections.sort(entities, new Comparator<NamedEntity>() {
404 @Override
405 public int compare(NamedEntity o1, NamedEntity o2) {
406 return getFirstBegin(o1) - getFirstBegin(o2);
407 }
408
409 private int getFirstBegin(NamedEntity entity) {
410 int min = Integer.MAX_VALUE;
411 for (NamedEntityMention mention : JCasUtil.select(entity.getMentions(), NamedEntityMention.class)) {
412 if (mention.getBegin() < min) {
413 min = mention.getBegin();
414 }
415 }
416 return min;
417 }
418 });
419
420 // add entities to document
421 for (NamedEntity entity : entities) {
422 entity.addToIndexes();
423 }
424
425 }
426
427 private FSArray addTreebankNodeChildrenToIndexes(TreebankNode parent, JCas jCas, List<CoreLabel> tokenAnns, Tree tree) {
428 Tree[] childTrees = tree.children();
429
430 // collect all children (except leaves, which are just the words - POS tags are pre-terminals in
431 // a Stanford tree)
432 List<TreebankNode> childNodes = new ArrayList<TreebankNode>();
433 for (Tree child : childTrees) {
434 if (!child.isLeaf()) {
435
436 // set node attributes and add children (mutual recursion)
437 TreebankNode node = new TreebankNode(jCas);
438 node.setParent(parent);
439 this.addTreebankNodeToIndexes(node, jCas, child, tokenAnns);
440 childNodes.add(node);
441 }
442 }
443
444 // convert the child list into an FSArray
445 FSArray childNodeArray = new FSArray(jCas, childNodes.size());
446 for (int i = 0; i < childNodes.size(); ++i) {
447 childNodeArray.set(i, childNodes.get(i));
448 }
449 return childNodeArray;
450 }
451
452 private void addTreebankNodeToIndexes(TreebankNode node, JCas jCas, Tree tree, List<CoreLabel> tokenAnns) {
453 // figure out begin and end character offsets
454 CoreMap label = (CoreMap) tree.label();
455 CoreMap beginToken = tokenAnns.get(label.get(BeginIndexAnnotation.class));
456 CoreMap endToken = tokenAnns.get(label.get(EndIndexAnnotation.class) - 1);
457 int nodeBegin = beginToken.get(CharacterOffsetBeginAnnotation.class);
458 int nodeEnd = endToken.get(CharacterOffsetEndAnnotation.class);
459
460 // set span, node type, children (mutual recursion), and add it to the JCas
461 node.setBegin(nodeBegin);
462 node.setEnd(nodeEnd);
463 node.setNodeType(tree.value());
464 node.setChildren(this.addTreebankNodeChildrenToIndexes(node, jCas, tokenAnns, tree));
465 node.setLeaf(node.getChildren().size() == 0);
466 node.addToIndexes();
467 }
468
469 private static class Span {
470 public int begin;
471
472 public int end;
473
474 public Span(int begin, int end) {
475 this.begin = begin;
476 this.end = end;
477 }
478
479 public boolean equals(Object object) {
480 if (object instanceof Span) {
481 Span that = (Span) object;
482 return this.begin == that.begin && this.end == that.end;
483 } else {
484 return false;
485 }
486 }
487
488 public int hashCode() {
489 return Arrays.hashCode(new int[]{this.begin, this.end});
490 }
491 }
492 }
493 */
494
495