public class TreeTaggerWrapper extends AbstractPosTagger implements org.annolab.tt4j.TokenHandler<String>, org.dice_research.topicmodeling.automaton.AutomatonCallback
| Modifier and Type | Field and Description |
|---|---|
static String |
ABBREVIATION_FILE_PROPERTY_KEY |
private Set<String> |
abbreviations |
private org.dice_research.topicmodeling.automaton.BricsAutomatonManager |
automaton |
private String |
currentText |
private org.dice_research.topicmodeling.utils.doc.TermTokenizedText |
currentTTText |
private static org.slf4j.Logger |
LOGGER |
static String |
MODEL_FILE_PROPERTY_KEY |
private org.annolab.tt4j.TreeTaggerWrapper<String> |
posTagger |
private static String[] |
TOKENIZER_PATTERN |
private List<String> |
tokens |
static String |
TREE_TAGGER_HOME_PROPERTY_KEY |
| Modifier | Constructor and Description |
|---|---|
private |
TreeTaggerWrapper(org.annolab.tt4j.TreeTaggerWrapper<String> posTagger,
Set<String> abbreviations) |
| Modifier and Type | Method and Description |
|---|---|
private String |
cleanLemma(String token,
String lemma)
Sometimes, the Treetagger returns more than one lemma.
|
static TreeTaggerWrapper |
createTreeTaggerWrapper() |
void |
foundPattern(int patternId,
int startPos,
int length) |
private void |
setTermPropertiesFromPosTag(org.dice_research.topicmodeling.lang.Term term,
String pos) |
void |
token(String token,
String pos,
String lemma) |
protected org.dice_research.topicmodeling.utils.doc.TermTokenizedText |
tokenizeText(String text) |
protected org.dice_research.topicmodeling.utils.doc.TermTokenizedText |
tokenizeText(String text,
PosTaggingTermFilter filter) |
getFilter, setFilter, tokenize
private static final org.slf4j.Logger LOGGER
public static final String TREE_TAGGER_HOME_PROPERTY_KEY
public static final String MODEL_FILE_PROPERTY_KEY
public static final String ABBREVIATION_FILE_PROPERTY_KEY
private static final String[] TOKENIZER_PATTERN
private org.annolab.tt4j.TreeTaggerWrapper<String> posTagger
private org.dice_research.topicmodeling.utils.doc.TermTokenizedText currentTTText
private String currentText
private org.dice_research.topicmodeling.automaton.BricsAutomatonManager automaton
public static TreeTaggerWrapper createTreeTaggerWrapper()
protected org.dice_research.topicmodeling.utils.doc.TermTokenizedText tokenizeText(String text)
tokenizeText in class AbstractPosTagger
protected org.dice_research.topicmodeling.utils.doc.TermTokenizedText tokenizeText(String text, PosTaggingTermFilter filter)
tokenizeText in class AbstractPosTagger
public void token(String token, String pos, String lemma)
token in interface org.annolab.tt4j.TokenHandler<String>
private String cleanLemma(String token, String lemma)
private void setTermPropertiesFromPosTag(org.dice_research.topicmodeling.lang.Term term,
String pos)
public void foundPattern(int patternId,
int startPos,
int length)
foundPattern in interface org.dice_research.topicmodeling.automaton.AutomatonCallback
Copyright © 2015–2020. All rights reserved.