package edu.northwestern.at.morphadorner.tools.relemmatize;

import edu.northwestern.at.morphadorner.WordAttributeNames;
import edu.northwestern.at.utils.CharUtils;
import edu.northwestern.at.utils.corpuslinguistics.lemmatizer.Lemmatizer;
import edu.northwestern.at.utils.corpuslinguistics.lexicon.Lexicon;
import edu.northwestern.at.utils.corpuslinguistics.namestandardizer.NameStandardizer;
import edu.northwestern.at.utils.corpuslinguistics.partsofspeech.PartOfSpeechTags;
import edu.northwestern.at.utils.corpuslinguistics.spellingmapper.SpellingMapper;
import edu.northwestern.at.utils.corpuslinguistics.spellingstandardizer.SpellingStandardizer;
import edu.northwestern.at.utils.corpuslinguistics.tokenizer.PennTreebankTokenizer;
import edu.northwestern.at.utils.corpuslinguistics.tokenizer.WordTokenizer;
import edu.northwestern.at.utils.xml.ExtendedXMLFilterImpl;
import java.util.List;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.AttributesImpl;

/* loaded from: input_file:edu/northwestern/at/morphadorner/tools/relemmatize/RelemmatizeFilter.class */
/**
 * SAX filter that recomputes the standardized spelling ({@code reg}) and the
 * lemma ({@code lem}) of every {@code <w>} element that passes through it,
 * and strips the {@code part} attribute from every {@code <c>} element.
 *
 * <p>All other elements are forwarded unchanged. The filter keeps running
 * counts of the words seen and of the standardized spellings and lemmata
 * whose values differ from the ones originally present on the element.</p>
 */
public class RelemmatizeFilter extends ExtendedXMLFilterImpl {
    /** Lexicon consulted first for a lemma; also supplies the part-of-speech tag set. */
    protected Lexicon wordLexicon;

    /** Lemmatizer used when the lexicon has no lemma for a spelling. */
    protected Lemmatizer lemmatizer;

    /** Standardizer applied to spellings tagged as proper nouns. */
    protected NameStandardizer nameStandardizer;

    /** Standardizer applied to spellings that are not proper nouns. */
    protected SpellingStandardizer standardizer;

    /** Mapper applied last to every standardized spelling. */
    protected SpellingMapper spellingMapper;

    /** Part-of-speech tag set obtained from {@link #wordLexicon}. */
    protected PartOfSpeechTags partOfSpeechTags;

    /** Tokenizer used to split a spelling into its component words. */
    protected WordTokenizer spellingTokenizer;

    /** Separator placed between the parts of a compound lemma; obtained from {@link #lemmatizer}. */
    protected String lemmaSeparator;

    /** Number of {@code <w>} elements whose lemma differs from its previous value. */
    protected int lemmataChanged;

    /** Number of {@code <w>} elements whose standardized spelling differs from its previous value. */
    protected int standardChanged;

    /** Number of {@code <w>} elements seen so far. */
    protected int wordsProcessed;

    /**
     * Creates a relemmatizing filter on top of an XML reader.
     *
     * @param xMLReader            parent reader handed to the superclass
     * @param lexicon              word lexicon; also the source of the part-of-speech tags
     * @param lemmatizer           lemmatizer; also the source of the lemma separator
     * @param nameStandardizer     standardizer for proper names
     * @param spellingStandardizer standardizer for all other spellings
     * @param spellingMapper       final mapper applied to standardized spellings
     */
    public RelemmatizeFilter(XMLReader xMLReader, Lexicon lexicon, Lemmatizer lemmatizer, NameStandardizer nameStandardizer, SpellingStandardizer spellingStandardizer, SpellingMapper spellingMapper) {
        super(xMLReader);
        this.lemmataChanged = 0;
        this.standardChanged = 0;
        this.wordsProcessed = 0;
        this.wordLexicon = lexicon;
        this.lemmatizer = lemmatizer;
        this.nameStandardizer = nameStandardizer;
        this.standardizer = spellingStandardizer;
        this.spellingMapper = spellingMapper;
        this.lemmaSeparator = lemmatizer.getLemmaSeparator();
        this.partOfSpeechTags = lexicon.getPartOfSpeechTags();
        this.spellingTokenizer = new PennTreebankTokenizer();
    }

    /**
     * Handles the start of an element.
     *
     * <ul>
     *   <li>{@code <w>}: recomputes the {@code reg} and {@code lem} attributes
     *       from the {@code spe} and {@code pos} attributes and updates the
     *       change counters.</li>
     *   <li>{@code <c>}: forwards the element with its {@code part} attribute
     *       removed.</li>
     *   <li>anything else: forwarded untouched.</li>
     * </ul>
     *
     * @param str        namespace URI
     * @param str2       local name
     * @param str3       qualified name; this is what the element tests use
     * @param attributes attributes attached to the element
     * @throws SAXException if a downstream handler fails
     */
    @Override // org.xml.sax.helpers.XMLFilterImpl, org.xml.sax.ContentHandler
    public void startElement(String str, String str2, String str3, Attributes attributes) throws SAXException {
        if (str3.equals("c")) {
            // Copy the incoming attributes before dropping "part"; starting
            // from an empty list would discard every attribute of the element.
            AttributesImpl charAttributes = new AttributesImpl(attributes);
            removeAttribute(charAttributes, WordAttributeNames.part);
            super.startElement(str, str2, str3, charAttributes);
            return;
        }
        if (!str3.equals("w")) {
            super.startElement(str, str2, str3, attributes);
            return;
        }

        this.wordsProcessed++;
        AttributesImpl wordAttributes = new AttributesImpl(attributes);
        String oldLemma = wordAttributes.getValue(WordAttributeNames.lem);
        String partOfSpeech = wordAttributes.getValue(WordAttributeNames.pos);
        String oldStandard = wordAttributes.getValue(WordAttributeNames.reg);
        String spelling = wordAttributes.getValue(WordAttributeNames.spe);

        // Without a spelling or a part of speech there is nothing to
        // recompute from, so pass the word through as it arrived.
        if (spelling == null || partOfSpeech == null) {
            super.startElement(str, str2, str3, attributes);
            return;
        }

        String newStandard = getStandardizedSpelling(spelling, partOfSpeech);
        String newLemma = getLemma(spelling, partOfSpeech);
        setAttributeValue(wordAttributes, WordAttributeNames.reg, newStandard);
        setAttributeValue(wordAttributes, WordAttributeNames.lem, newLemma);

        // A missing previous value counts as a change; the null checks also
        // keep the comparison from throwing when the attribute was absent.
        if (oldStandard == null || !oldStandard.equals(newStandard)) {
            this.standardChanged++;
        }
        if (oldLemma == null || !oldLemma.equals(newLemma)) {
            this.lemmataChanged++;
        }
        super.startElement(str, str2, str3, wordAttributes);
    }

    /**
     * Determines the lemma for a spelling with a given part-of-speech tag.
     *
     * <p>The lexicon is asked first. When it answers {@code "*"} (treated here
     * as "no lemma"), the lemmatizer is tried, unless it reports that it
     * cannot lemmatize the spelling or the tag's lemma word class is
     * {@code "none"}. For a compound tag whose spelling does not tokenize to
     * exactly one word, each word is lemmatized with the word class of the
     * matching tag part and the results are joined with the lemma separator.
     * If no lemma could be produced, the spelling itself is used. A lemma that
     * does not contain the separator and is not tagged as a proper noun is
     * lower-cased.</p>
     *
     * @param str  the spelling to lemmatize
     * @param str2 the part-of-speech tag of the spelling
     * @return the lemma
     */
    public String getLemma(String str, String str2) {
        String lemma = this.wordLexicon.getLemma(str, str2);
        if (lemma.equals("*")) {
            String lemmaWordClass = this.partOfSpeechTags.getLemmaWordClass(str2);
            if (!this.lemmatizer.cantLemmatize(str) && !lemmaWordClass.equals("none")) {
                List<String> words = this.spellingTokenizer.extractWords(str);
                if (this.partOfSpeechTags.isCompoundTag(str2) && words.size() != 1) {
                    String[] tagParts = this.partOfSpeechTags.splitTag(str2);
                    // Only build a compound lemma when every word has its own
                    // tag part. Otherwise leave the "*" sentinel in place so
                    // the spelling fallback below applies instead of
                    // returning an empty lemma.
                    if (!words.isEmpty() && tagParts.length == words.size()) {
                        StringBuilder compound = new StringBuilder();
                        for (int i = 0; i < words.size(); i++) {
                            if (i > 0) {
                                compound.append(this.lemmaSeparator);
                            }
                            String partClass = this.partOfSpeechTags.getLemmaWordClass(tagParts[i]);
                            compound.append(this.lemmatizer.lemmatize(words.get(i), partClass));
                        }
                        lemma = compound.toString();
                    }
                } else if (lemmaWordClass.length() == 0) {
                    // No word class for this tag: try the "compound" class
                    // first, then the class-less lemmatizer if that changed nothing.
                    lemma = this.lemmatizer.lemmatize(str, "compound");
                    if (lemma.equals(str)) {
                        lemma = this.lemmatizer.lemmatize(str);
                    }
                } else {
                    lemma = this.lemmatizer.lemmatize(str, lemmaWordClass);
                }
            }
        }
        if (lemma.equals("*")) {
            lemma = str;
        }
        if (lemma.indexOf(this.lemmaSeparator) < 0 && !this.partOfSpeechTags.isProperNounTag(str2)) {
            lemma = lemma.toLowerCase();
        }
        return lemma;
    }

    /**
     * Determines the standardized spelling for a spelling with a given
     * part-of-speech tag.
     *
     * <p>Proper nouns go through the name standardizer. Foreign words,
     * numbers, and nouns with internal capitals are left as they are. Every
     * other spelling goes through the spelling standardizer; if its result
     * differs from the input only in letter case, the input is kept. The
     * result is finally passed through the spelling mapper.</p>
     *
     * @param str  the original spelling
     * @param str2 the part-of-speech tag of the spelling
     * @return the mapped, standardized spelling
     */
    protected String getStandardizedSpelling(String str, String str2) {
        String standardized = str;
        if (this.partOfSpeechTags.isProperNounTag(str2)) {
            standardized = this.nameStandardizer.standardizeProperName(str);
        } else if ((!this.partOfSpeechTags.isNounTag(str2) || !CharUtils.hasInternalCaps(str)) && !this.partOfSpeechTags.isForeignWordTag(str2) && !this.partOfSpeechTags.isNumberTag(str2)) {
            standardized = this.standardizer.standardizeSpelling(str, this.partOfSpeechTags.getMajorWordClass(str2));
            // Keep the original capitalization when standardization only changed case.
            if (standardized.equalsIgnoreCase(str)) {
                standardized = str;
            }
        }
        return this.spellingMapper.mapSpelling(standardized);
    }

    /**
     * Returns the number of words whose lemma differs from its previous value.
     *
     * @return count of changed lemmata
     */
    public int getLemmataChanged() {
        return this.lemmataChanged;
    }

    /**
     * Returns the number of words whose standardized spelling differs from its previous value.
     *
     * @return count of changed standardized spellings
     */
    public int getStandardChanged() {
        return this.standardChanged;
    }

    /**
     * Returns the number of {@code <w>} elements seen.
     *
     * @return count of words processed
     */
    public int getWordsProcessed() {
        return this.wordsProcessed;
    }
}
