001 /***************************************************************************/
002 /* Copyright (C) 2010-2011, Sebastian Hellmann */
003 /* Note: If you need parts of NLP2RDF in another licence due to licence */
004 /* incompatibility, please mail hellmann@informatik.uni-leipzig.de */
005 /* */
006 /* This file is part of NLP2RDF. */
007 /* */
008 /* NLP2RDF is free software; you can redistribute it and/or modify */
009 /* it under the terms of the GNU General Public License as published by */
010 /* the Free Software Foundation; either version 3 of the License, or */
011 /* (at your option) any later version. */
012 /* */
013 /* NLP2RDF is distributed in the hope that it will be useful, */
014 /* but WITHOUT ANY WARRANTY; without even the implied warranty of */
015 /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
016 /* GNU General Public License for more details. */
017 /* */
018 /* You should have received a copy of the GNU General Public License */
019 /* along with this program. If not, see <http://www.gnu.org/licenses/>. */
020 /***************************************************************************/
021
022 package org.nlp2rdf.implementation.snowball;
023
024 import com.hp.hpl.jena.ontology.OntModel;
025 import eu.lod2.nlp2rdf.schema.sso.StopWord;
026 import eu.lod2.nlp2rdf.schema.sso.Word;
027 import eu.lod2.nlp2rdf.schema.str.Document;
028 import org.nlp2rdf.core.Span;
029 import org.nlp2rdf.core.Text2RDF;
030 import org.nlp2rdf.core.URIGenerator;
031 import org.nlp2rdf.core.util.URIGeneratorHelper;
032 import org.nlp2rdf.implementation.opennlp.OpenNLPTokenizer;
033 import org.slf4j.Logger;
034 import org.slf4j.LoggerFactory;
035 import org.tartarus.snowball.SnowballProgram;
036
037 import java.security.InvalidParameterException;
038 import java.util.*;
039
040 /**
041 * A Wrapper for Tartarus' Snowball Stemmer.
042 * The name of a class from org.tartarus.snowball.ext. can be given to initialize the stemmer
043 * see: http://lucene.apache.org/java/2_4_0/api/contrib-snowball/index.html
044 * <p/>
045 * This decorator attaches the stem to each :Word it finds.
046 * <p/>
047 * User: Sebastian Hellmann - http://bis.informatik.uni-leipzig.de/SebastianHellmann
048 */
049 public class SnowballStemmer extends SnowballProgram {
050 private static Logger log = LoggerFactory.getLogger(SnowballStemmer.class);
051
052 public SnowballProgram decoratee;
053 private final OpenNLPTokenizer openNLPTokenizer;
054
055
056 private final Set<String> stopWords = new HashSet<String>(Arrays.asList(new String[]{"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "would", "should", "could", "ought", "i'm", "you're", "he's", "she's", "it's", "we're", "they're", "i've", "you've", "we've", "they've", "i'd", "you'd", "he'd", "she'd", "we'd", "they'd", "i'll", "you'll", "he'll", "she'll", "we'll", "they'll", "isn't", "aren't", "wasn't", "weren't", "hasn't", "haven't", "hadn't", "doesn't", "don't", "didn't", "won't", "wouldn't", "shan't", "shouldn't", "can't", "cannot", "couldn't", "mustn't", "let's", "that's", "who's", "what's", "here's", "there's", "when's", "where's", "why's", "how's", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very"}));
057
058 /**
059 * For the English PorterStemmer
060 */
061 public SnowballStemmer() {
062 this("PorterStemmer");
063
064 }
065
066 /**
067 * @param stemmerClass a class from the following list http://lucene.apache.org/java/2_4_0/api/contrib-snowball/index.html
068 */
069 public SnowballStemmer(String stemmerClass) {
070 openNLPTokenizer = new OpenNLPTokenizer();
071 try {
072 decoratee = (SnowballProgram) Class.forName("org.tartarus.snowball.ext." + stemmerClass).newInstance();
073 } catch (Exception e) {
074 String msg = "Correct class was not given please use e.g. \"PorterStemmer\" from: http://lucene.apache.org/java/2_4_0/api/contrib-snowball/index.html\n" + "Received: " + stemmerClass + " transformed to org.tartarus.snowball.ext." + stemmerClass;
075 log.error(msg, e);
076 throw new InvalidParameterException(msg);
077 }
078
079 }
080
081 public void processText(String prefix, URIGenerator urigenerator, String text, OntModel model) {
082 TreeMap<Span, List<Span>> tokenizedText = openNLPTokenizer.tokenizeText(text);
083 Text2RDF text2RDF = new Text2RDF();
084 Document document = text2RDF.createDocumentAnnotation(prefix, text, urigenerator, model);
085 text2RDF.generateNIFModel(prefix, text, tokenizedText, urigenerator, document, model);
086 processNIFModel(model);
087 //add additional data
088 new Text2RDF().addNextAndPreviousProperties(prefix, text, urigenerator, model);
089 }
090
091 public void processNIFModel(OntModel model) {
092 for (Word w : Word.list(model)) {
093 try {
094 w.addStem(stem(w.getAnchorOf()).toLowerCase());
095 if (stopWords.contains(w.getAnchorOf())) {
096 StopWord.create(w.getURI(), model);
097 }
098
099 } catch (Exception e) {
100 log.warn("Stemming failed for " + w.getAnchorOf() + ", " + w.getURI(), e);
101 }
102 }
103 }
104
105
106 public String stem(String token) {
107 decoratee.setCurrent(token);
108 decoratee.stem();
109 return decoratee.getCurrent();
110 }
111
112 @Override
113 public boolean stem() {
114 return decoratee.stem();
115 }
116
117
118 }