package edu.northwestern.at.utils.corpuslinguistics.tokenizer;

import edu.northwestern.at.utils.CharUtils;
import edu.northwestern.at.utils.IsCloseable;
import edu.northwestern.at.utils.IsCloseableObject;
import edu.northwestern.at.utils.RomanNumeralUtils;
import edu.northwestern.at.utils.SetUtils;
import edu.northwestern.at.utils.SingleTagTaggedStrings;
import edu.northwestern.at.utils.TaggedStrings;
import edu.northwestern.at.utils.logger.DummyLogger;
import edu.northwestern.at.utils.logger.Logger;
import edu.northwestern.at.utils.logger.UsesLogger;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/northwestern/at/utils/corpuslinguistics/tokenizer/AbstractWordTokenizer.class */
/**
 * Base class for word tokenizers.
 *
 * <p>Holds the settings shared by concrete tokenizers (contraction list,
 * pre-tokenizer, logger, and a few behavior flags) and supplies helper
 * methods for classifying quote characters, splitting tokens on embedded
 * periods, and mapping tokens back to character offsets. Subclasses
 * implement {@link #extractWords(String)}.</p>
 */
public abstract class AbstractWordTokenizer extends IsCloseableObject implements WordTokenizer, IsCloseable, UsesLogger {
    /** Left single quotation mark (U+2018). */
    private static final char LEFT_SINGLE_QUOTE = '\u2018';

    /** Right single quotation mark (U+2019). */
    private static final char RIGHT_SINGLE_QUOTE = '\u2019';

    /** Right double quotation mark (U+201D). */
    private static final char RIGHT_DOUBLE_QUOTE = '\u201D';

    /** Plain ASCII apostrophe. */
    private static final char APOSTROPHE = '\'';

    /** Contractions loaded by {@link #loadContractions()}. */
    protected TaggedStrings contractions;

    /** Logger used by this tokenizer; a {@link DummyLogger} until replaced. */
    protected Logger logger;

    /** Matches a string made up entirely of two or more hyphen characters. */
    protected static final Pattern hyphensPattern = Pattern.compile("^([-‑]{2,})$");

    /**
     * Shared matcher kept only for backward compatibility with subclasses.
     * A {@link Matcher} is not safe for concurrent use, so code in this class
     * no longer touches it; prefer {@code hyphensPattern.matcher(...)}.
     */
    protected static final Matcher hyphensMatcher = hyphensPattern.matcher("");

    /** Resource path of the contractions list, resolved against {@code DefaultWordTokenizer}. */
    protected String contractionsURL = "resources/contractions.txt";

    /** Flag not read in this class; presumably consulted by subclasses. */
    protected boolean coalesceHyphens = false;

    /** Flag not read in this class; presumably consulted by subclasses. */
    protected boolean coalesceAsterisks = true;

    /** When true, a plain apostrophe is accepted as an opening or closing quote. */
    protected boolean apostropheCanBeQuote = true;

    /** Pre-tokenizer obtained from the default {@link PreTokenizerFactory}. */
    protected PreTokenizer preTokenizer = new PreTokenizerFactory().newPreTokenizer();

    /**
     * Creates the tokenizer with a dummy logger and loads the contractions list.
     */
    public AbstractWordTokenizer() {
        // Assign the logger first so an overriding loadContractions() never sees a null logger.
        this.logger = new DummyLogger();
        loadContractions();
    }

    /**
     * Returns the current logger.
     *
     * @return the logger in use
     */
    @Override // edu.northwestern.at.utils.logger.UsesLogger
    public Logger getLogger() {
        return this.logger;
    }

    /**
     * Replaces the logger.
     *
     * @param logger the logger to use from now on
     */
    @Override // edu.northwestern.at.utils.logger.UsesLogger
    public void setLogger(Logger logger) {
        this.logger = logger;
    }

    /**
     * Returns the pre-tokenizer.
     *
     * @return the pre-tokenizer in use
     */
    @Override // edu.northwestern.at.utils.corpuslinguistics.tokenizer.WordTokenizer
    public PreTokenizer getPreTokenizer() {
        return this.preTokenizer;
    }

    /**
     * Replaces the pre-tokenizer.
     *
     * @param preTokenizer the pre-tokenizer to use from now on
     */
    @Override // edu.northwestern.at.utils.corpuslinguistics.tokenizer.WordTokenizer
    public void setPreTokenizer(PreTokenizer preTokenizer) {
        this.preTokenizer = preTokenizer;
    }

    /**
     * Loads the contractions list from {@link #contractionsURL} (read as UTF-8)
     * into {@link #contractions}.
     *
     * <p>Loading is best-effort: if the resource cannot be read, an empty
     * contraction set is installed so later lookups do not hit a null
     * reference.</p>
     */
    protected void loadContractions() {
        String[] entries;
        try {
            Set<String> loaded = SetUtils.loadSet(DefaultWordTokenizer.class.getResource(this.contractionsURL), "utf-8");
            entries = loaded.toArray(new String[0]);
        } catch (Exception ignored) {
            // Deliberately non-fatal: a missing or unreadable list means "no known contractions".
            entries = new String[0];
        }
        this.contractions = new SingleTagTaggedStrings(entries, "1");
    }

    /**
     * Hook for adjusting a token before it is added to a sentence.
     * This base implementation returns the token unchanged.
     *
     * @param str  the token
     * @param list the tokens collected so far (unused here)
     * @return {@code str} unchanged
     */
    @Override // edu.northwestern.at.utils.corpuslinguistics.tokenizer.WordTokenizer
    public String preprocessToken(String str, List<String> list) {
        return str;
    }

    /**
     * Tells whether a character can open a single-quoted span.
     *
     * @param c the character to test
     * @return true for U+2018, or for an apostrophe when
     *         {@link #apostropheCanBeQuote} is set
     */
    public boolean isSingleOpeningQuote(char c) {
        if (c == LEFT_SINGLE_QUOTE) {
            return true;
        }
        return c == APOSTROPHE && this.apostropheCanBeQuote;
    }

    /**
     * Tells whether a character is a letter, U+2018, or an apostrophe.
     * The apostrophe is accepted regardless of {@link #apostropheCanBeQuote}.
     *
     * @param c the character to test
     * @return true if the character is a letter or one of the two quote characters
     */
    public boolean isLetterOrSingleQuote(char c) {
        return CharUtils.isLetter(c) || c == LEFT_SINGLE_QUOTE || c == APOSTROPHE;
    }

    /**
     * Tells whether a character can close a quoted span.
     *
     * @param c the character to test
     * @return true for U+2019 or U+201D, or for an apostrophe when
     *         {@link #apostropheCanBeQuote} is set
     */
    public boolean isClosingQuote(char c) {
        if (c == RIGHT_SINGLE_QUOTE || c == RIGHT_DOUBLE_QUOTE) {
            return true;
        }
        return c == APOSTROPHE && this.apostropheCanBeQuote;
    }

    /**
     * Splits a token at its first period when the period is embedded in the token.
     *
     * <p>The token is returned as a single-element array when it has no
     * period, ends with a period, or is recognized as currency, a run of
     * periods, a known abbreviation, a number, or a loose Roman numeral.
     * Otherwise the text up to and including the first period is examined:
     * if that prefix is a known abbreviation the result is
     * {@code {prefix, rest}}; if not, the period becomes its own token and
     * the result is {@code {prefixWithoutPeriod, ".", rest}}.</p>
     *
     * @param str the token to split
     * @return the token pieces, never null
     */
    public String[] splitToken(String str) {
        int periodIndex = str.indexOf(".");
        boolean splittable = periodIndex >= 0
                && !CharUtils.isCurrency(str)
                && !CharUtils.isAllPeriods(str)
                && !str.endsWith(".")
                && !Abbreviations.isKnownAbbreviation(str)
                && !CharUtils.isNumber(str)
                && !RomanNumeralUtils.isLooseRomanNumeral(str);
        if (!splittable) {
            return new String[]{str};
        }
        String head = str.substring(0, periodIndex + 1);
        String tail = str.substring(periodIndex + 1);
        if (Abbreviations.isKnownAbbreviation(head)) {
            return new String[]{head, tail};
        }
        // Not an abbreviation: detach the period as a separate token.
        return new String[]{head.substring(0, head.length() - 1), ".", tail};
    }

    /**
     * Appends a word to a sentence.
     *
     * @param list the sentence being built
     * @param str  the word to append
     */
    @Override // edu.northwestern.at.utils.corpuslinguistics.tokenizer.WordTokenizer
    public void addWordToSentence(List<String> list, String str) {
        list.add(str);
    }

    /**
     * Computes the starting character offset of each token within a text.
     *
     * <p>For every token, leading whitespace in the text is skipped, the
     * current position is recorded, and then as many non-whitespace text
     * characters are consumed as the token has characters. Whitespace met
     * while consuming is skipped without being counted. If the text runs out
     * before the tokens do, the remaining offsets are set to the text length
     * rather than raising an index error.</p>
     *
     * @param str  the text the tokens were taken from
     * @param list the tokens, in order; each is converted with {@code toString()}
     * @return an array of {@code list.size() + 1} entries: one start offset
     *         per token followed by the text length
     */
    @Override // edu.northwestern.at.utils.corpuslinguistics.tokenizer.WordTokenizer
    public int[] findWordOffsets(String str, List<?> list) {
        int wordCount = list.size();
        int textLength = str.length();
        int[] offsets = new int[wordCount + 1];
        int pos = 0;
        for (int w = 0; w < wordCount; w++) {
            String word = list.get(w).toString();
            // Skip whitespace in front of the token.
            while (pos < textLength && CharUtils.isWhitespace(str.charAt(pos))) {
                pos++;
            }
            offsets[w] = pos;
            // Consume one non-whitespace text character per token character.
            int remaining = word.length();
            while (remaining > 0 && pos < textLength) {
                if (!CharUtils.isWhitespace(str.charAt(pos))) {
                    remaining--;
                }
                pos++;
            }
        }
        offsets[wordCount] = textLength;
        return offsets;
    }

    /**
     * Tells whether a string consists solely of two or more hyphen characters,
     * as defined by {@link #hyphensPattern}.
     *
     * @param str the string to test; null yields false
     * @return true if the whole string is a run of at least two hyphens
     */
    public boolean isMultipleHyphens(String str) {
        // A fresh matcher per call keeps this safe when tokenizers run on several threads.
        return str != null && hyphensPattern.matcher(str).matches();
    }

    /**
     * Breaks a text into words.
     *
     * @param str the text to tokenize
     * @return the extracted words
     */
    @Override // edu.northwestern.at.utils.corpuslinguistics.tokenizer.WordTokenizer
    public abstract List<String> extractWords(String str);
}
