package edu.northwestern.at.utils.corpuslinguistics.tokenizer;

import edu.northwestern.at.utils.CharUtils;
import edu.northwestern.at.utils.ListFactory;
import edu.northwestern.at.utils.RomanNumeralUtils;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/northwestern/at/utils/corpuslinguistics/tokenizer/DefaultWordTokenizer.class */
/**
 * Word tokenizer that pretokenizes the input, splits it on whitespace, and then
 * separates leading and trailing punctuation from each whitespace-delimited token.
 *
 * <p>Relies on state and helpers inherited from {@link AbstractWordTokenizer}
 * (for example {@code preTokenizer}, {@code contractions}, {@code hyphensMatcher},
 * {@code splitToken}); their exact semantics are defined there, not in this class.
 */
public class DefaultWordTokenizer extends AbstractWordTokenizer implements WordTokenizer {
    /**
     * Matches an optional single ASCII letter followed by either two or three hyphen
     * characters (ASCII or the non-ASCII hyphen variant present in the literal) or a
     * run of one or more asterisks. Used to decide whether a following {@code 's}
     * should be glued onto the previous token. Compiled once instead of on every token.
     */
    private static final Pattern ELIDED_WORD_PATTERN =
            Pattern.compile("([A-Za-z]){0,1}(--|---|‑‑|‑‑‑|(\\*+))");

    /**
     * Splits text into a list of word and punctuation tokens.
     *
     * <p>For each whitespace-delimited token of the pretokenized text this method:
     * runs {@code preprocessToken}, peels leading punctuation into separate tokens,
     * peels trailing punctuation (emitted after the word), optionally merges the
     * remainder into the previously emitted token, and finally emits the pieces
     * returned by {@code splitToken}.
     *
     * @param str the text to tokenize
     * @return the tokens in order of appearance
     */
    @Override
    public List<String> extractWords(String str) {
        List<String> words = ListFactory.createNewList();
        StringTokenizer rawTokens = new StringTokenizer(this.preTokenizer.pretokenize(str));
        while (rawTokens.hasMoreTokens()) {
            String token = preprocessToken(rawTokens.nextToken(), words);
            token = peelLeadingPunctuation(token, words);

            // Trailing punctuation is collected here and emitted only after the word itself.
            StringBuilder trailing = new StringBuilder();
            token = peelTrailingPunctuation(token, trailing);

            // A multiple-hyphen token directly after a letter token is appended to that token.
            if (isMultipleHyphens(token) && words.size() > 0) {
                int lastIndex = words.size() - 1;
                String previous = words.get(lastIndex);
                if (CharUtils.isLetter(previous)) {
                    words.set(lastIndex, previous + token);
                    token = "";
                }
            }

            // A possessive 's is glued onto a preceding elided word such as "d--" or "***".
            if ((token.equals("'s") || token.equals("'S")) && words.size() > 0) {
                int lastIndex = words.size() - 1;
                String previous = words.get(lastIndex);
                if (ELIDED_WORD_PATTERN.matcher(previous).matches()) {
                    words.set(lastIndex, previous + token);
                    token = "";
                }
            }

            if (token.length() > 0) {
                for (String piece : splitToken(token)) {
                    if (piece.length() > 0) {
                        addWordToSentence(words, piece);
                    }
                }
            }

            for (int i = 0; i < trailing.length(); i++) {
                addWordToSentence(words, String.valueOf(trailing.charAt(i)));
            }
        }
        return words;
    }

    /**
     * Removes leading punctuation characters from a token, emitting each removed
     * character as its own token, and returns what is left.
     *
     * <p>Peeling stops as soon as the whole remaining token matches
     * {@code hyphensMatcher}, or the first character is not one of the handled
     * punctuation kinds, or the token is a known contraction.
     *
     * @param token the token to process
     * @param words the output list that receives the peeled characters
     * @return the token with the peeled prefix removed (possibly empty)
     */
    private String peelLeadingPunctuation(String token, List<String> words) {
        while (token.length() > 0) {
            hyphensMatcher.reset(token);
            if (hyphensMatcher.matches()) {
                return token;
            }
            char first = token.charAt(0);
            boolean peel;
            if (first == '&') {
                // Keep the ampersand when the whole token is a listed contraction.
                peel = !this.contractions.containsString(token);
            } else if (!this.apostropheCanBeQuote && CharUtils.isApostrophe(first)) {
                // Apostrophes are never treated as opening quotes in this mode.
                peel = false;
            } else if (isSingleOpeningQuote(first)) {
                peel = !this.contractions.containsString(token);
            } else if (CharUtils.isOpeningQuote(first) || first == '%' || first == '*') {
                peel = true;
            } else if (!CharUtils.isBreakingDash(first)) {
                peel = false;
            } else {
                // A lone dash, or a dash followed by a number (e.g. "-5"), stays attached.
                peel = token.length() > 1 && !CharUtils.isNumber(token.substring(1));
            }
            if (!peel) {
                return token;
            }
            addWordToSentence(words, String.valueOf(first));
            token = token.substring(1);
        }
        return token;
    }

    /**
     * Removes trailing punctuation characters from a token and returns what is left.
     *
     * <p>Removed characters are inserted at the front of {@code trailing}, so that
     * buffer ends up holding them in their original left-to-right order. Nothing is
     * removed when the whole token matches {@code hyphensMatcher}, and at least one
     * character of the token is always kept.
     *
     * @param token    the token to process
     * @param trailing buffer that receives the peeled characters
     * @return the token with the peeled suffix removed
     */
    private String peelTrailingPunctuation(String token, StringBuilder trailing) {
        hyphensMatcher.reset(token);
        if (hyphensMatcher.matches()) {
            return token;
        }
        while (token.length() > 1) {
            int length = token.length();
            char last = token.charAt(length - 1);
            if (last == ':' || CharUtils.isBreakingDash(last) || isClosingQuote(last)) {
                // Always peeled; fall through to the common peel step below.
            } else if (last == '!' || last == '?') {
                if (Abbreviations.isAbbreviation(token)) {
                    return token;
                }
            } else if (last != '.') {
                return token;
            } else if (token.charAt(0) == '$') {
                // Currency amount: the final period is peeled only when another period
                // occurs in the examined interior span; either way processing stops here.
                if (length > 2 && token.substring(1, length - 2).indexOf('.') >= 0) {
                    trailing.insert(0, '.');
                    return token.substring(0, length - 1);
                }
                return token;
            } else if (isLetterOrSingleQuote(token.charAt(length - 2))) {
                // Period after a letter: keep it only for known abbreviations.
                if (Abbreviations.isAbbreviation(token)) {
                    return token;
                }
            } else if (CharUtils.isAllPeriods(token)
                    || CharUtils.isNumber(token)
                    || RomanNumeralUtils.isLooseRomanNumeral(token)) {
                // Ellipses, numbers and loose Roman numerals keep their final period.
                return token;
            }
            trailing.insert(0, last);
            token = token.substring(0, length - 1);
        }
        return token;
    }

    /**
     * Appends a token to the token list, merging it into the previous token in two cases:
     * <ul>
     *   <li>a {@code "-"} (when {@code coalesceHyphens} is set) or {@code "*"} (when
     *       {@code coalesceAsterisks} is set) that follows a token already ending in the
     *       same character;</li>
     *   <li>a {@code "."} that follows a token which starts with {@code '.'} and is a
     *       loose Roman numeral.</li>
     * </ul>
     * In every other case the token is added as a new list element.
     *
     * @param list the token list to update
     * @param str  the token to add
     */
    @Override
    public void addWordToSentence(List<String> list, String str) {
        int lastIndex = list.size() - 1;
        if (lastIndex >= 0
                && ((this.coalesceHyphens && str.equals("-"))
                        || (this.coalesceAsterisks && str.equals("*")))) {
            String previous = list.get(lastIndex);
            if (previous.endsWith(str)) {
                list.set(lastIndex, previous + str);
            } else {
                list.add(str);
            }
            return;
        }
        if (!str.equals(".") || list.size() <= 0) {
            list.add(str);
            return;
        }
        String previous = list.get(lastIndex);
        // The isEmpty() guard prevents a StringIndexOutOfBoundsException when a caller's
        // list ends in an empty string; the period is then added as a separate token.
        if (!previous.isEmpty()
                && previous.charAt(0) == '.'
                && RomanNumeralUtils.isLooseRomanNumeral(previous)) {
            list.set(lastIndex, previous + ".");
        } else {
            list.add(str);
        }
    }
}
