001 /***************************************************************************/
002 /* Copyright (C) 2010-2011, Sebastian Hellmann */
003 /* Note: If you need parts of NLP2RDF in another licence due to licence */
004 /* incompatibility, please mail hellmann@informatik.uni-leipzig.de */
005 /* */
006 /* This file is part of NLP2RDF. */
007 /* */
008 /* NLP2RDF is free software; you can redistribute it and/or modify */
009 /* it under the terms of the GNU General Public License as published by */
010 /* the Free Software Foundation; either version 3 of the License, or */
011 /* (at your option) any later version. */
012 /* */
013 /* NLP2RDF is distributed in the hope that it will be useful, */
014 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
015 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
016 /* GNU General Public License for more details. */
017 /* */
018 /* You should have received a copy of the GNU General Public License */
019 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
020 /***************************************************************************/
021
022 package org.nlp2rdf.implementation.snowball;
023
import com.hp.hpl.jena.ontology.OntModel;
import eu.lod2.nlp2rdf.schema.sso.Sentence;
import eu.lod2.nlp2rdf.schema.sso.StopWord;
import eu.lod2.nlp2rdf.schema.sso.Word;
import eu.lod2.nlp2rdf.schema.str.Document;
import opennlp.tools.sentdetect.SentenceSample;
import org.nlp2rdf.core.Text2RDF;
import org.nlp2rdf.core.URIGenerator;
import org.nlp2rdf.core.impl.OpenNLPTokenizer;
import org.nlp2rdf.core.util.URIGeneratorHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.tartarus.snowball.SnowballProgram;

import java.security.InvalidParameterException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
042
043 /**
044 * A Wrapper for Tartarus' Snowball Stemmer.
045 * The name of a class from org.tartarus.snowball.ext. can be given to initialize the stemmer
046 * see: http://lucene.apache.org/java/2_4_0/api/contrib-snowball/index.html
047 * <p/>
048 * This decorator attaches the stem to each :Word it finds.
049 * <p/>
050 * User: Sebastian Hellmann - http://bis.informatik.uni-leipzig.de/SebastianHellmann
051 */
052 public class SnowballStemmer extends SnowballProgram {
053 private static Logger log = LoggerFactory.getLogger(SnowballStemmer.class);
054
055 public SnowballProgram decoratee;
056
057 private final Set<String> stopWords = new HashSet<String>(Arrays.asList(new String[]{"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "would", "should", "could", "ought", "i'm", "you're", "he's", "she's", "it's", "we're", "they're", "i've", "you've", "we've", "they've", "i'd", "you'd", "he'd", "she'd", "we'd", "they'd", "i'll", "you'll", "he'll", "she'll", "we'll", "they'll", "isn't", "aren't", "wasn't", "weren't", "hasn't", "haven't", "hadn't", "doesn't", "don't", "didn't", "won't", "wouldn't", "shan't", "shouldn't", "can't", "cannot", "couldn't", "mustn't", "let's", "that's", "who's", "what's", "here's", "there's", "when's", "where's", "why's", "how's", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very"}));
058
059 /**
060 * For the English PorterStemmer
061 */
062 public SnowballStemmer() {
063 this("PorterStemmer");
064 }
065
066 /**
067 * @param stemmerClass a class from the following list http://lucene.apache.org/java/2_4_0/api/contrib-snowball/index.html
068 */
069 public SnowballStemmer(String stemmerClass) {
070 try {
071 decoratee = (SnowballProgram) Class.forName("org.tartarus.snowball.ext." + stemmerClass).newInstance();
072 } catch (Exception e) {
073 String msg = "Correct class was not given please use e.g. \"PorterStemmer\" from: http://lucene.apache.org/java/2_4_0/api/contrib-snowball/index.html\n" + "Received: " + stemmerClass + " transformed to org.tartarus.snowball.ext." + stemmerClass;
074 log.error(msg, e);
075 throw new InvalidParameterException(msg);
076 }
077
078 }
079
080 public void processText(String prefix, String urirecipe, String text, OntModel diff) {
081 //make a NIF model to work on
082 URIGenerator uriGenerator = URIGeneratorHelper.determineGenerator(urirecipe);
083
084 Text2RDF t = new Text2RDF();
085 Document d = t.createDocumentAnnotation(prefix, text, uriGenerator, diff);
086 //TODO this is not the best solution, should be separated
087 diff.add(t.process(prefix, text, new OpenNLPTokenizer(), uriGenerator));
088 execute(diff, diff);
089
090 //add substring annotation
091 for (Sentence s : Sentence.list(diff)) {
092 d.addSubString(s);
093 }
094
095 }
096
097 public void execute(OntModel in, OntModel diff) {
098 for (Word w : Word.list(in)) {
099 try {
100 Word n = Word.create(w.getURI(), diff);
101 n.addStem(stem(w.getAnchorOf()).toLowerCase());
102 if (stopWords.contains(n.getAnchorOf())) {
103 StopWord.create(w.getURI(), diff);
104 }
105
106 } catch (Exception e) {
107 log.warn("Stemming failed for " + w.getAnchorOf() + ", " + w.getURI(), e);
108 }
109 }
110 }
111
112
113 public String stem(String token) {
114 decoratee.setCurrent(token);
115 decoratee.stem();
116 return decoratee.getCurrent();
117 }
118
119 @Override
120 public boolean stem() {
121 return decoratee.stem();
122 }
123
124
125 }