001 /***************************************************************************/
002 /* Copyright (C) 2010-2011, Sebastian Hellmann */
003 /* Note: If you need parts of NLP2RDF in another licence due to licence */
004 /* incompatibility, please mail hellmann@informatik.uni-leipzig.de */
005 /* */
006 /* This file is part of NLP2RDF. */
007 /* */
008 /* NLP2RDF is free software; you can redistribute it and/or modify */
009 /* it under the terms of the GNU General Public License as published by */
010 /* the Free Software Foundation; either version 3 of the License, or */
011 /* (at your option) any later version. */
012 /* */
013 /* NLP2RDF is distributed in the hope that it will be useful, */
014 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
015 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
016 /* GNU General Public License for more details. */
017 /* */
018 /* You should have received a copy of the GNU General Public License */
019 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
020 /***************************************************************************/
021
022 package org.nlp2rdf.implementation.opennlp;
023
024 import opennlp.tools.sentdetect.SentenceDetectorME;
025 import opennlp.tools.sentdetect.SentenceModel;
026 import opennlp.tools.tokenize.TokenizerME;
027 import opennlp.tools.tokenize.TokenizerModel;
028 import org.nlp2rdf.core.Span;
029 import org.slf4j.Logger;
030 import org.slf4j.LoggerFactory;
031
032 import java.io.IOException;
033 import java.io.InputStream;
034 import java.lang.reflect.Array;
035 import java.util.ArrayList;
036 import java.util.Collection;
037 import java.util.List;
038 import java.util.TreeMap;
039
040 /**
041 */
042 public class OpenNLPTokenizer {
043 private static Logger log = LoggerFactory.getLogger(OpenNLPTokenizer.class);
044 public static final String RESOURCEPATH = "org/nlp2rdf/implementation/opennlp/";
045
046 //the model is threadsafe according to the javadoc
047 private static TokenizerModel tokenizerModel = null;
048 private static SentenceModel sentenceModel = null;
049
050 private TokenizerME tokenizerME = null;
051 private SentenceDetectorME sentenceDetectorME = null;
052
053 public OpenNLPTokenizer() {
054 tokenizerME = new TokenizerME(getTokenizerModel());
055 sentenceDetectorME = new SentenceDetectorME(getSentenceModel());
056 }
057
058 public TreeMap<Span, List<Span>> tokenizeText(String text) {
059 //get all the sentences and words
060 TreeMap<Span, List<Span>> tokenizedText = new TreeMap<Span, List<Span>>();
061 for (Span sentenceSpan : this.detectSentences(text)) {
062 List<Span> wordSpans = new ArrayList<Span>();
063 String sentenceText = sentenceSpan.getCoveredText(text).toString();
064 for (Span wordSpan : this.detectWords(sentenceText)) {
065 wordSpans.add(new Span(wordSpan, sentenceSpan.getStart()));
066 }
067 tokenizedText.put(sentenceSpan, wordSpans);
068 }
069 return tokenizedText;
070 }
071
072 public synchronized Span[] detectSentences(String text) {
073 opennlp.tools.util.Span[] spans = sentenceDetectorME.sentPosDetect(text);
074 SpanDecorator[] ret = new SpanDecorator[spans.length];
075 for (int i = 0; i < spans.length; i++) {
076 ret[i] = new SpanDecorator(spans[i]);
077 }
078 return ret;
079 }
080
081 public synchronized Span[] detectWords(String sentence) {
082 //the Tokenizer is not Thread-safe!
083 opennlp.tools.util.Span[] spans = tokenizerME.tokenizePos(sentence);
084 SpanDecorator[] ret = new SpanDecorator[spans.length];
085 for (int i = 0; i < spans.length; i++) {
086 ret[i] = new SpanDecorator(spans[i]);
087 }
088 return ret;
089 }
090
091
092 private SentenceModel getSentenceModel() {
093 if (sentenceModel == null) {
094 try {
095 InputStream modelIn = this.getClass().getClassLoader().getResourceAsStream(RESOURCEPATH + "en-sent.bin");
096 try {
097 sentenceModel = new SentenceModel(modelIn);
098 } finally {
099 modelIn.close();
100 }
101 } catch (IOException e) {
102 log.error("", e);
103 }
104 }
105 return sentenceModel;
106 }
107
108
109 private TokenizerModel getTokenizerModel
110 () {
111 if (tokenizerModel == null) {
112 try {
113 InputStream modelIn = this.getClass().getClassLoader().getResourceAsStream(RESOURCEPATH + "en-token.bin");
114 try {
115 tokenizerModel = new TokenizerModel(modelIn);
116 } finally {
117 modelIn.close();
118 }
119 } catch (IOException e) {
120 log.error("", e);
121 }
122 }
123
124 return tokenizerModel;
125 }
126
127 }