001    /***************************************************************************/
002    /*  Copyright (C) 2010-2011, Sebastian Hellmann                            */
003    /*  Note: If you need parts of NLP2RDF in another licence due to licence   */
004    /*  incompatibility, please mail hellmann@informatik.uni-leipzig.de        */
005    /*                                                                         */
006    /*  This file is part of NLP2RDF.                                          */
007    /*                                                                         */
008    /*  NLP2RDF is free software; you can redistribute it and/or modify        */
009    /*  it under the terms of the GNU General Public License as published by   */
010    /*  the Free Software Foundation; either version 3 of the License, or      */
011    /*  (at your option) any later version.                                    */
012    /*                                                                         */
013    /*  NLP2RDF is distributed in the hope that it will be useful,             */
014    /*  but WITHOUT ANY WARRANTY; without even the implied warranty of         */
015    /*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the           */
016    /*  GNU General Public License for more details.                           */
017    /*                                                                         */
018    /*  You should have received a copy of the GNU General Public License      */
019    /*  along with this program. If not, see <http://www.gnu.org/licenses/>.   */
020    /***************************************************************************/
021    
022    package org.nlp2rdf.implementation.opennlp;
023    
024    import opennlp.tools.sentdetect.SentenceDetectorME;
025    import opennlp.tools.sentdetect.SentenceModel;
026    import opennlp.tools.tokenize.TokenizerME;
027    import opennlp.tools.tokenize.TokenizerModel;
028    import org.nlp2rdf.core.Span;
029    import org.slf4j.Logger;
030    import org.slf4j.LoggerFactory;
031    
032    import java.io.IOException;
033    import java.io.InputStream;
034    import java.lang.reflect.Array;
035    import java.util.ArrayList;
036    import java.util.Collection;
037    import java.util.List;
038    import java.util.TreeMap;
039    
040    /**
041     */
042    public class OpenNLPTokenizer {
043        private static Logger log = LoggerFactory.getLogger(OpenNLPTokenizer.class);
044        public static final String RESOURCEPATH = "org/nlp2rdf/implementation/opennlp/";
045    
046        //the model is threadsafe according to the javadoc
047        private static TokenizerModel tokenizerModel = null;
048        private static SentenceModel sentenceModel = null;
049    
050        private TokenizerME tokenizerME = null;
051        private SentenceDetectorME sentenceDetectorME = null;
052    
053        public OpenNLPTokenizer() {
054            tokenizerME = new TokenizerME(getTokenizerModel());
055            sentenceDetectorME = new SentenceDetectorME(getSentenceModel());
056        }
057    
058        public TreeMap<Span, List<Span>> tokenizeText(String text) {
059            //get all the sentences and words
060            TreeMap<Span, List<Span>> tokenizedText = new TreeMap<Span, List<Span>>();
061            for (Span sentenceSpan : this.detectSentences(text)) {
062                List<Span> wordSpans = new ArrayList<Span>();
063                String sentenceText = sentenceSpan.getCoveredText(text).toString();
064                for (Span wordSpan : this.detectWords(sentenceText)) {
065                    wordSpans.add(new Span(wordSpan, sentenceSpan.getStart()));
066                }
067                tokenizedText.put(sentenceSpan, wordSpans);
068            }
069            return tokenizedText;
070        }
071    
072        public synchronized Span[] detectSentences(String text) {
073            opennlp.tools.util.Span[] spans = sentenceDetectorME.sentPosDetect(text);
074            SpanDecorator[] ret = new SpanDecorator[spans.length];
075            for (int i = 0; i < spans.length; i++) {
076                ret[i] = new SpanDecorator(spans[i]);
077            }
078            return ret;
079        }
080    
081        public synchronized Span[] detectWords(String sentence) {
082            //the Tokenizer is not Thread-safe!
083            opennlp.tools.util.Span[] spans = tokenizerME.tokenizePos(sentence);
084            SpanDecorator[] ret = new SpanDecorator[spans.length];
085            for (int i = 0; i < spans.length; i++) {
086                ret[i] = new SpanDecorator(spans[i]);
087            }
088            return ret;
089        }
090    
091    
092        private SentenceModel getSentenceModel() {
093            if (sentenceModel == null) {
094                try {
095                    InputStream modelIn = this.getClass().getClassLoader().getResourceAsStream(RESOURCEPATH + "en-sent.bin");
096                    try {
097                        sentenceModel = new SentenceModel(modelIn);
098                    } finally {
099                        modelIn.close();
100                    }
101                } catch (IOException e) {
102                    log.error("", e);
103                }
104            }
105            return sentenceModel;
106        }
107    
108    
109        private TokenizerModel getTokenizerModel
110                () {
111            if (tokenizerModel == null) {
112                try {
113                    InputStream modelIn = this.getClass().getClassLoader().getResourceAsStream(RESOURCEPATH + "en-token.bin");
114                    try {
115                        tokenizerModel = new TokenizerModel(modelIn);
116                    } finally {
117                        modelIn.close();
118                    }
119                } catch (IOException e) {
120                    log.error("", e);
121                }
122            }
123    
124            return tokenizerModel;
125        }
126    
127    }