001    /***************************************************************************/
002    /*  Copyright (C) 2010-2011, Sebastian Hellmann                            */
003    /*  Note: If you need parts of NLP2RDF in another licence due to licence   */
004    /*  incompatibility, please mail hellmann@informatik.uni-leipzig.de        */
005    /*                                                                         */
006    /*  This file is part of NLP2RDF.                                          */
007    /*                                                                         */
008    /*  NLP2RDF is free software; you can redistribute it and/or modify        */
009    /*  it under the terms of the GNU General Public License as published by   */
010    /*  the Free Software Foundation; either version 3 of the License, or      */
011    /*  (at your option) any later version.                                    */
012    /*                                                                         */
013    /*  NLP2RDF is distributed in the hope that it will be useful,             */
014    /*  but WITHOUT ANY WARRANTY; without even the implied warranty of         */
015    /*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the           */
016    /*  GNU General Public License for more details.                           */
017    /*                                                                         */
018    /*  You should have received a copy of the GNU General Public License      */
019    /*  along with this program. If not, see <http://www.gnu.org/licenses/>.   */
020    /***************************************************************************/
021    
022    package org.nlp2rdf.core.impl;
023    
024    import opennlp.tools.sentdetect.SentenceDetectorME;
025    import opennlp.tools.sentdetect.SentenceModel;
026    import opennlp.tools.tokenize.TokenizerME;
027    import opennlp.tools.tokenize.TokenizerModel;
028    import opennlp.tools.util.Span;
029    import org.nlp2rdf.core.Tokenizer;
030    import org.slf4j.Logger;
031    import org.slf4j.LoggerFactory;
032    
033    import java.io.IOException;
034    import java.io.InputStream;
035    
036    /**
037     * User: Sebastian Hellmann - http://bis.informatik.uni-leipzig.de/SebastianHellmann
038     */
039    public class OpenNLPTokenizer implements Tokenizer {
040    
041        private static Logger log = LoggerFactory.getLogger(OpenNLPTokenizer.class);
042    
043        //the model is threadsafe according to the javadoc
044        private static TokenizerModel tokenizerModel = null;
045        private static SentenceModel sentenceModel = null;
046    
047        private TokenizerME tokenizerME = null;
048        private SentenceDetectorME sentenceDetectorME = null;
049    
050        public OpenNLPTokenizer() {
051            tokenizerME = new TokenizerME(getTokenizerModel());
052            sentenceDetectorME = new SentenceDetectorME(getSentenceModel());
053        }
054    
055        @Override
056        public synchronized Span[] detectSentences(String text) {
057            return sentenceDetectorME.sentPosDetect(text);
058        }
059    
060        @Override
061        public synchronized Span[] detectWords(String sentence) {
062            //the Tokenizer is not Thread-safe!
063            return tokenizerME.tokenizePos(sentence);
064        }
065    
066    
067        private SentenceModel getSentenceModel() {
068            if (sentenceModel == null) {
069                try {
070                    InputStream modelIn = this.getClass().getClassLoader().getResourceAsStream("opennlp/en-sent.bin");
071                    try {
072                        sentenceModel = new SentenceModel(modelIn);
073                    } finally {
074                        modelIn.close();
075                    }
076                } catch (IOException e) {
077                    log.error("", e);
078                }
079            }
080            return sentenceModel;
081        }
082    
083    
084        private TokenizerModel getTokenizerModel
085                () {
086            if (tokenizerModel == null) {
087                try {
088                    InputStream modelIn = this.getClass().getClassLoader().getResourceAsStream("opennlp/en-token.bin");
089                    try {
090                        tokenizerModel = new TokenizerModel(modelIn);
091                    } finally {
092                        modelIn.close();
093                    }
094                } catch (IOException e) {
095                    log.error("", e);
096                }
097            }
098    
099            return tokenizerModel;
100        }
101    
102    }