001    /***************************************************************************/
002    /*  Copyright (C) 2010-2011, Sebastian Hellmann                            */
003    /*  Note: If you need parts of NLP2RDF in another licence due to licence   */
004    /*  incompatibility, please mail hellmann@informatik.uni-leipzig.de        */
005    /*                                                                         */
006    /*  This file is part of NLP2RDF.                                          */
007    /*                                                                         */
008    /*  NLP2RDF is free software; you can redistribute it and/or modify        */
009    /*  it under the terms of the GNU General Public License as published by   */
010    /*  the Free Software Foundation; either version 3 of the License, or      */
011    /*  (at your option) any later version.                                    */
012    /*                                                                         */
013    /*  NLP2RDF is distributed in the hope that it will be useful,             */
014    /*  but WITHOUT ANY WARRANTY; without even the implied warranty of         */
015    /*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the           */
016    /*  GNU General Public License for more details.                           */
017    /*                                                                         */
018    /*  You should have received a copy of the GNU General Public License      */
019    /*  along with this program. If not, see <http://www.gnu.org/licenses/>.   */
020    /***************************************************************************/
021    
022    package org.nlp2rdf.implementation.snowball;
023    
024    import com.hp.hpl.jena.ontology.OntModel;
025    import eu.lod2.nlp2rdf.schema.sso.Sentence;
026    import eu.lod2.nlp2rdf.schema.sso.StopWord;
027    import eu.lod2.nlp2rdf.schema.sso.Word;
028    import eu.lod2.nlp2rdf.schema.str.Document;
029    import opennlp.tools.sentdetect.SentenceSample;
030    import org.nlp2rdf.core.Text2RDF;
031    import org.nlp2rdf.core.URIGenerator;
032    import org.nlp2rdf.core.impl.OpenNLPTokenizer;
033    import org.nlp2rdf.core.util.URIGeneratorHelper;
034    import org.slf4j.Logger;
035    import org.slf4j.LoggerFactory;
036    import org.tartarus.snowball.SnowballProgram;
037    
038    import java.security.InvalidParameterException;
039    import java.util.Arrays;
040    import java.util.HashSet;
041    import java.util.Set;
042    
043    /**
044     * A Wrapper for Tartarus' Snowball Stemmer.
045     * The name of a class from org.tartarus.snowball.ext.  can be given to initialize the stemmer
046     * see: http://lucene.apache.org/java/2_4_0/api/contrib-snowball/index.html
047     * <p/>
048     * This decorator attaches the stem to each :Word it finds.
049     * <p/>
050     * User: Sebastian Hellmann - http://bis.informatik.uni-leipzig.de/SebastianHellmann
051     */
052    public class SnowballStemmer extends SnowballProgram {
053        private static Logger log = LoggerFactory.getLogger(SnowballStemmer.class);
054    
055        public SnowballProgram decoratee;
056    
057        private final Set<String> stopWords = new HashSet<String>(Arrays.asList(new String[]{"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "would", "should", "could", "ought", "i'm", "you're", "he's", "she's", "it's", "we're", "they're", "i've", "you've", "we've", "they've", "i'd", "you'd", "he'd", "she'd", "we'd", "they'd", "i'll", "you'll", "he'll", "she'll", "we'll", "they'll", "isn't", "aren't", "wasn't", "weren't", "hasn't", "haven't", "hadn't", "doesn't", "don't", "didn't", "won't", "wouldn't", "shan't", "shouldn't", "can't", "cannot", "couldn't", "mustn't", "let's", "that's", "who's", "what's", "here's", "there's", "when's", "where's", "why's", "how's", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very"}));
058    
059        /**
060         * For the English PorterStemmer
061         */
062        public SnowballStemmer() {
063            this("PorterStemmer");
064        }
065    
066        /**
067         * @param stemmerClass a class from the following list http://lucene.apache.org/java/2_4_0/api/contrib-snowball/index.html
068         */
069        public SnowballStemmer(String stemmerClass) {
070            try {
071                decoratee = (SnowballProgram) Class.forName("org.tartarus.snowball.ext." + stemmerClass).newInstance();
072            } catch (Exception e) {
073                String msg = "Correct class was not given please use e.g. \"PorterStemmer\"  from: http://lucene.apache.org/java/2_4_0/api/contrib-snowball/index.html\n" + "Received: " + stemmerClass + " transformed to org.tartarus.snowball.ext." + stemmerClass;
074                log.error(msg, e);
075                throw new InvalidParameterException(msg);
076            }
077    
078        }
079    
080        public void processText(String prefix, String urirecipe, String text, OntModel diff) {
081            //make a NIF model to work on
082            URIGenerator uriGenerator = URIGeneratorHelper.determineGenerator(urirecipe);
083    
084            Text2RDF t = new Text2RDF();
085            Document d = t.createDocumentAnnotation(prefix, text, uriGenerator, diff);
086            //TODO this is not the best solution, should be separated
087            diff.add(t.process(prefix, text, new OpenNLPTokenizer(), uriGenerator));
088            execute(diff, diff);
089    
090            //add substring annotation
091            for (Sentence s : Sentence.list(diff)) {
092                d.addSubString(s);
093            }
094    
095        }
096    
097        public void execute(OntModel in, OntModel diff) {
098            for (Word w : Word.list(in)) {
099                try {
100                    Word n = Word.create(w.getURI(), diff);
101                    n.addStem(stem(w.getAnchorOf()).toLowerCase());
102                    if (stopWords.contains(n.getAnchorOf())) {
103                        StopWord.create(w.getURI(), diff);
104                    }
105    
106                } catch (Exception e) {
107                    log.warn("Stemming failed for " + w.getAnchorOf() + ", " + w.getURI(), e);
108                }
109            }
110        }
111    
112    
113        public String stem(String token) {
114            decoratee.setCurrent(token);
115            decoratee.stem();
116            return decoratee.getCurrent();
117        }
118    
119        @Override
120        public boolean stem() {
121            return decoratee.stem();
122        }
123    
124    
125    }