001 package org.nlp2rdf.annotator;
002
003 import edu.stanford.nlp.ling.CoreAnnotations;
004 import edu.stanford.nlp.ling.CoreLabel;
005 import edu.stanford.nlp.pipeline.Annotation;
006 import edu.stanford.nlp.util.CoreMap;
007 import opennlp.tools.util.Span;
008 import org.nlp2rdf.core.Tokenizer;
009
010 import java.util.List;
011
012 /**
013 * @author Sebastian Hellmann - http://bis.informatik.uni-leipzig.de/SebastianHellmann
014 */
015 public class StanfordTokenizer implements Tokenizer {
016
017 private final Annotation annotatedDocument;
018 private final List<CoreMap> sentences ;
019
020 /*public StanfordTokenizer() {
021 Properties props = new Properties();
022 props.put("annotators", "tokenize, ssplit");
023 pipeline = new StanfordCoreNLP(props);
024
025 }*/
026
027 public StanfordTokenizer(Annotation annotatedDocument) {
028 this.annotatedDocument = annotatedDocument;
029
030 // these are all the sentences in this document
031 // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
032 sentences = annotatedDocument.get(CoreAnnotations.SentencesAnnotation.class);
033 }
034
035 public Span[] detectSentences(String text) {
036
037 for (CoreMap sentence : sentences)
038
039 {
040 }
041
042 return null;
043 }
044
045 public Span[] detectWords(String sentence) {
046
047 for (CoreMap s : sentences)
048
049 {
050 // traversing the words in the current sentence
051 // a CoreLabel is a CoreMap with additional token-specific methods
052 for (CoreLabel token : s.get(CoreAnnotations.TokensAnnotation.class)) {
053 // this is the text of the token
054 String word = token.get(CoreAnnotations.TextAnnotation.class);
055 }
056
057 }
058
059 return null;
060 }
061
062 }