package edu.northwestern.at.utils.corpuslinguistics.tokenizer;

import com.ibm.icu.text.BreakIterator;
import edu.northwestern.at.utils.ListFactory;
import java.util.List;
import java.util.Locale;

/* loaded from: input_file:edu/northwestern/at/utils/corpuslinguistics/tokenizer/ICU4JBreakIteratorWordTokenizer.class */
/**
 * Word tokenizer that locates token boundaries with an ICU4J word
 * {@link BreakIterator} created for a configurable {@link Locale}.
 *
 * <p>The input text is first passed through the pre-tokenizer held in
 * {@code preTokenizer}; each non-whitespace segment reported by the break
 * iterator is then run through {@code preprocessToken} and {@code splitToken}
 * before being added to the result via {@code addWordToSentence}. Those
 * helpers are not defined in this class (presumably inherited from
 * {@link AbstractWordTokenizer}).
 */
public class ICU4JBreakIteratorWordTokenizer extends AbstractWordTokenizer implements WordTokenizer {
    /** Locale used to obtain the ICU4J word break iterator. */
    protected Locale locale;

    /** Creates a tokenizer that uses {@link Locale#US} break rules. */
    public ICU4JBreakIteratorWordTokenizer() {
        this.locale = Locale.US;
    }

    /**
     * Creates a tokenizer that uses the break rules of the given locale.
     *
     * @param locale locale for the word break iterator; {@code null} falls
     *               back to {@link Locale#US}, the same default as the no-arg
     *               constructor, instead of deferring a failure to
     *               {@link #extractWords(String)}
     */
    public ICU4JBreakIteratorWordTokenizer(Locale locale) {
        this.locale = (locale != null) ? locale : Locale.US;
    }

    /**
     * Splits text into word tokens.
     *
     * @param str the text to tokenize; it is handed to the pre-tokenizer
     *            as-is (null handling depends on that pre-tokenizer)
     * @return list of the non-empty tokens found, in order of appearance
     */
    @Override
    public List<String> extractWords(String str) {
        List<String> words = ListFactory.createNewList();
        BreakIterator boundaries = BreakIterator.getWordInstance(this.locale);
        String text = this.preTokenizer.pretokenize(str);
        boundaries.setText(text);

        int start = boundaries.first();
        for (int end = boundaries.next(); end != BreakIterator.DONE; start = end, end = boundaries.next()) {
            String segment = text.substring(start, end);
            // Segments that begin with whitespace are the gaps between words.
            if (Character.isWhitespace(segment.charAt(0))) {
                continue;
            }
            String token = preprocessToken(segment, words);
            if (token.length() == 0) {
                continue;
            }
            // A preprocessed token may be split further; keep only non-empty pieces.
            for (String piece : splitToken(token)) {
                if (piece.length() > 0) {
                    addWordToSentence(words, piece);
                }
            }
        }
        return words;
    }
}
