001    /***************************************************************************/
002    /*  Copyright (C) 2010-2011, Sebastian Hellmann                            */
003    /*  Note: If you need parts of NLP2RDF in another licence due to licence   */
004    /*  incompatibility, please mail hellmann@informatik.uni-leipzig.de        */
005    /*                                                                         */
006    /*  This file is part of NLP2RDF.                                          */
007    /*                                                                         */
008    /*  NLP2RDF is free software; you can redistribute it and/or modify        */
009    /*  it under the terms of the GNU General Public License as published by   */
010    /*  the Free Software Foundation; either version 3 of the License, or      */
011    /*  (at your option) any later version.                                    */
012    /*                                                                         */
013    /*  NLP2RDF is distributed in the hope that it will be useful,             */
014    /*  but WITHOUT ANY WARRANTY; without even the implied warranty of         */
015    /*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the           */
016    /*  GNU General Public License for more details.                           */
017    /*                                                                         */
018    /*  You should have received a copy of the GNU General Public License      */
019    /*  along with this program. If not, see <http://www.gnu.org/licenses/>.   */
020    /***************************************************************************/
021    
022    package org.nlp2rdf.implementation.snowball;
023    
024    import com.hp.hpl.jena.ontology.OntModel;
025    import eu.lod2.nlp2rdf.schema.sso.StopWord;
026    import eu.lod2.nlp2rdf.schema.sso.Word;
027    import eu.lod2.nlp2rdf.schema.str.Document;
028    import org.nlp2rdf.core.Span;
029    import org.nlp2rdf.core.Text2RDF;
030    import org.nlp2rdf.core.URIGenerator;
031    import org.nlp2rdf.core.util.URIGeneratorHelper;
032    import org.nlp2rdf.implementation.opennlp.OpenNLPTokenizer;
033    import org.slf4j.Logger;
034    import org.slf4j.LoggerFactory;
035    import org.tartarus.snowball.SnowballProgram;
036    
037    import java.security.InvalidParameterException;
038    import java.util.*;
039    
040    /**
041     * A Wrapper for Tartarus' Snowball Stemmer.
042     * The name of a class from org.tartarus.snowball.ext.  can be given to initialize the stemmer
043     * see: http://lucene.apache.org/java/2_4_0/api/contrib-snowball/index.html
044     * <p/>
045     * This decorator attaches the stem to each :Word it finds.
046     * <p/>
047     * User: Sebastian Hellmann - http://bis.informatik.uni-leipzig.de/SebastianHellmann
048     */
049    public class SnowballStemmer extends SnowballProgram {
050        private static Logger log = LoggerFactory.getLogger(SnowballStemmer.class);
051    
052        public SnowballProgram decoratee;
053        private final OpenNLPTokenizer openNLPTokenizer;
054    
055    
056        private final Set<String> stopWords = new HashSet<String>(Arrays.asList(new String[]{"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "would", "should", "could", "ought", "i'm", "you're", "he's", "she's", "it's", "we're", "they're", "i've", "you've", "we've", "they've", "i'd", "you'd", "he'd", "she'd", "we'd", "they'd", "i'll", "you'll", "he'll", "she'll", "we'll", "they'll", "isn't", "aren't", "wasn't", "weren't", "hasn't", "haven't", "hadn't", "doesn't", "don't", "didn't", "won't", "wouldn't", "shan't", "shouldn't", "can't", "cannot", "couldn't", "mustn't", "let's", "that's", "who's", "what's", "here's", "there's", "when's", "where's", "why's", "how's", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very"}));
057    
058        /**
059         * For the English PorterStemmer
060         */
061        public SnowballStemmer() {
062            this("PorterStemmer");
063    
064        }
065    
066        /**
067         * @param stemmerClass a class from the following list http://lucene.apache.org/java/2_4_0/api/contrib-snowball/index.html
068         */
069        public SnowballStemmer(String stemmerClass) {
070            openNLPTokenizer = new OpenNLPTokenizer();
071            try {
072                decoratee = (SnowballProgram) Class.forName("org.tartarus.snowball.ext." + stemmerClass).newInstance();
073            } catch (Exception e) {
074                String msg = "Correct class was not given please use e.g. \"PorterStemmer\"  from: http://lucene.apache.org/java/2_4_0/api/contrib-snowball/index.html\n" + "Received: " + stemmerClass + " transformed to org.tartarus.snowball.ext." + stemmerClass;
075                log.error(msg, e);
076                throw new InvalidParameterException(msg);
077            }
078    
079        }
080    
081        public void processText(String prefix, URIGenerator urigenerator, String text, OntModel model) {
082            TreeMap<Span, List<Span>> tokenizedText = openNLPTokenizer.tokenizeText(text);
083            Text2RDF text2RDF = new Text2RDF();
084            Document document = text2RDF.createDocumentAnnotation(prefix, text, urigenerator, model);
085            text2RDF.generateNIFModel(prefix, text, tokenizedText, urigenerator, document, model);
086            processNIFModel(model);
087            //add additional data
088            new Text2RDF().addNextAndPreviousProperties(prefix, text, urigenerator, model);
089        }
090    
091        public void processNIFModel(OntModel model) {
092            for (Word w : Word.list(model)) {
093                try {
094                    w.addStem(stem(w.getAnchorOf()).toLowerCase());
095                    if (stopWords.contains(w.getAnchorOf())) {
096                        StopWord.create(w.getURI(), model);
097                    }
098    
099                } catch (Exception e) {
100                    log.warn("Stemming failed for " + w.getAnchorOf() + ", " + w.getURI(), e);
101                }
102            }
103        }
104    
105    
106        public String stem(String token) {
107            decoratee.setCurrent(token);
108            decoratee.stem();
109            return decoratee.getCurrent();
110        }
111    
112        @Override
113        public boolean stem() {
114            return decoratee.stem();
115        }
116    
117    
118    }