001 /***************************************************************************/
002 /* Copyright (C) 2010-2011, Sebastian Hellmann */
003 /* Note: If you need parts of NLP2RDF in another licence due to licence */
004 /* incompatibility, please mail hellmann@informatik.uni-leipzig.de */
005 /* */
006 /* This file is part of NLP2RDF. */
007 /* */
008 /* NLP2RDF is free software; you can redistribute it and/or modify */
009 /* it under the terms of the GNU General Public License as published by */
010 /* the Free Software Foundation; either version 3 of the License, or */
011 /* (at your option) any later version. */
012 /* */
013 /* NLP2RDF is distributed in the hope that it will be useful, */
014 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
015 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
016 /* GNU General Public License for more details. */
017 /* */
018 /* You should have received a copy of the GNU General Public License */
019 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
020 /***************************************************************************/
021
022 package org.nlp2rdf.core.impl;
023
024 import opennlp.tools.sentdetect.SentenceDetectorME;
025 import opennlp.tools.sentdetect.SentenceModel;
026 import opennlp.tools.tokenize.TokenizerME;
027 import opennlp.tools.tokenize.TokenizerModel;
028 import opennlp.tools.util.Span;
029 import org.nlp2rdf.core.Tokenizer;
030 import org.slf4j.Logger;
031 import org.slf4j.LoggerFactory;
032
033 import java.io.IOException;
034 import java.io.InputStream;
035
036 /**
037 * User: Sebastian Hellmann - http://bis.informatik.uni-leipzig.de/SebastianHellmann
038 */
039 public class OpenNLPTokenizer implements Tokenizer {
040
041 private static Logger log = LoggerFactory.getLogger(OpenNLPTokenizer.class);
042
043 //the model is threadsafe according to the javadoc
044 private static TokenizerModel tokenizerModel = null;
045 private static SentenceModel sentenceModel = null;
046
047 private TokenizerME tokenizerME = null;
048 private SentenceDetectorME sentenceDetectorME = null;
049
050 public OpenNLPTokenizer() {
051 tokenizerME = new TokenizerME(getTokenizerModel());
052 sentenceDetectorME = new SentenceDetectorME(getSentenceModel());
053 }
054
055 @Override
056 public synchronized Span[] detectSentences(String text) {
057 return sentenceDetectorME.sentPosDetect(text);
058 }
059
060 @Override
061 public synchronized Span[] detectWords(String sentence) {
062 //the Tokenizer is not Thread-safe!
063 return tokenizerME.tokenizePos(sentence);
064 }
065
066
067 private SentenceModel getSentenceModel() {
068 if (sentenceModel == null) {
069 try {
070 InputStream modelIn = this.getClass().getClassLoader().getResourceAsStream("opennlp/en-sent.bin");
071 try {
072 sentenceModel = new SentenceModel(modelIn);
073 } finally {
074 modelIn.close();
075 }
076 } catch (IOException e) {
077 log.error("", e);
078 }
079 }
080 return sentenceModel;
081 }
082
083
084 private TokenizerModel getTokenizerModel
085 () {
086 if (tokenizerModel == null) {
087 try {
088 InputStream modelIn = this.getClass().getClassLoader().getResourceAsStream("opennlp/en-token.bin");
089 try {
090 tokenizerModel = new TokenizerModel(modelIn);
091 } finally {
092 modelIn.close();
093 }
094 } catch (IOException e) {
095 log.error("", e);
096 }
097 }
098
099 return tokenizerModel;
100 }
101
102 }