package LBJ2.nlp;

import LBJ2.learn.WekaWrapper;
import LBJ2.parse.LineByLine;
import java.util.LinkedList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:LBJ2/nlp/SentenceSplitter.class */
public class SentenceSplitter extends LineByLine {
    /**
     * Running character offset of the paragraph currently being processed; it is
     * advanced by each paragraph's length and added to the offsets stored in the
     * Sentence objects this class creates.
     */
    protected int currentOffset;
    /** Queue of Sentence objects already split off and not yet handed out by next(). */
    protected LinkedList sentences;
    /** Position in {@link #input} of the next line readLine() will return. */
    protected int index;
    /** In-memory lines to split; null when lines come from the superclass reader instead. */
    protected String[] input;

    /**
     * Command-line entry point. Expects exactly one argument, which is passed
     * to the {@code SentenceSplitter(String)} constructor, and prints every
     * sentence on its own line with newline, carriage-return and form-feed
     * characters replaced by single spaces.
     *
     * <p>Any other argument count prints the usage message to standard error
     * and exits with status 1.
     *
     * @param strArr command-line arguments
     */
    public static void main(String[] strArr) {
        // Validate the argument count explicitly; a checked exception cannot be
        // thrown from here without a throws clause or an enclosing handler.
        if (strArr == null || strArr.length != 1) {
            System.err.println("usage: java LBJ2.nlp.SentenceSplitter <file name>");
            System.exit(1);
            return;
        }

        SentenceSplitter splitter = new SentenceSplitter(strArr[0]);
        for (Sentence sentence = (Sentence) splitter.next();
                sentence != null;
                sentence = (Sentence) splitter.next()) {
            StringBuilder line = new StringBuilder(sentence.text);
            for (int pos = 0; pos < line.length(); pos++) {
                char c = line.charAt(pos);
                // Keep each sentence on a single output line.
                if (c == '\n' || c == '\r' || c == '\f') {
                    line.setCharAt(pos, ' ');
                }
            }
            System.out.println(line);
        }
    }

    /**
     * Creates a splitter whose lines come from the superclass reader.
     *
     * @param str forwarded unchanged to the LineByLine constructor; the usage
     *            message in main() describes it as a file name
     */
    public SentenceSplitter(String str) {
        super(str);
        this.sentences = new LinkedList();
    }

    /**
     * Creates a splitter over in-memory text. readLine() then returns the
     * elements of the array one at a time instead of delegating to the
     * superclass.
     *
     * @param strArr the lines to split, in order; the array is stored, not copied
     */
    public SentenceSplitter(String[] strArr) {
        this.input = strArr;
        this.sentences = new LinkedList();
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Supplies the next raw line of input.
     *
     * <p>When an in-memory array was given at construction, its elements are
     * returned in order and {@code null} signals that the array is exhausted.
     * Otherwise the call is delegated to the superclass.
     *
     * @return the next line, or {@code null} once the in-memory array has been
     *         fully consumed
     */
    @Override // LBJ2.parse.LineByLine
    public String readLine() {
        if (input != null) {
            return index < input.length ? input[index++] : null;
        }
        return super.readLine();
    }

    /**
     * Reads the next paragraph from {@link #readLine()}.
     *
     * <p>A paragraph consists of, in order: any leading blank lines, the run of
     * non-blank lines that follows, and the blank line that ended that run
     * (when the input did not end first). Each line read is appended followed
     * by {@code "\n"}, so blank lines still contribute to the returned
     * string's length. A line counts as blank when its trimmed text equals
     * {@code WekaWrapper.defaultAttributeString} (presumably the empty string;
     * confirm against WekaWrapper).
     *
     * @return the paragraph text, never {@code null}; it contains no non-blank
     *         line once the input is exhausted
     */
    protected String getParagraph() {
        // One growing buffer instead of re-copying the accumulated text per line.
        StringBuilder paragraph = new StringBuilder().append(WekaWrapper.defaultAttributeString);
        String line = readLine();

        // Leading blank lines.
        while (line != null && line.trim().equals(WekaWrapper.defaultAttributeString)) {
            paragraph.append(line).append("\n");
            line = readLine();
        }

        // The paragraph's own lines.
        while (line != null && !line.trim().equals(WekaWrapper.defaultAttributeString)) {
            paragraph.append(line).append("\n");
            line = readLine();
        }

        // The blank line that terminated the paragraph, if any.
        if (line != null) {
            paragraph.append(line).append("\n");
        }
        return paragraph.toString();
    }

    /**
     * Returns the next sentence, reading and splitting one more paragraph when
     * the queue of pending sentences is empty.
     *
     * @return the next queued Sentence, or {@code null} when the queue is
     *         still empty after attempting to read a paragraph
     */
    @Override // LBJ2.parse.Parser
    public Object next() {
        if (sentences.isEmpty()) {
            String paragraph = getParagraph();
            boolean blank = paragraph.trim().equals(WekaWrapper.defaultAttributeString);
            if (!blank) {
                process(paragraph);
            }
            // The offset advances even for blank paragraphs.
            currentOffset += paragraph.length();
        }
        return sentences.isEmpty() ? null : sentences.removeFirst();
    }

    /**
     * Splits all remaining input, paragraph by paragraph, until a paragraph
     * with no non-blank content is read.
     *
     * <p>The queue is not cleared first, so the result also contains any
     * sentences that were already queued and not yet returned by next().
     *
     * @return every Sentence currently in the queue, in order
     */
    public Sentence[] splitAll() {
        for (String paragraph = getParagraph();
                !paragraph.trim().equals(WekaWrapper.defaultAttributeString);
                paragraph = getParagraph()) {
            process(paragraph);
            currentOffset += paragraph.length();
        }
        Sentence[] result = new Sentence[sentences.size()];
        return (Sentence[]) sentences.toArray(result);
    }

    /**
     * Matches one token: a maximal run of non-whitespace characters, where a
     * hyphen directly followed by a newline (and any further whitespace) is
     * treated as part of the token rather than as a separator. Compiled once
     * because the expression never changes.
     */
    private static final Pattern TOKEN_PATTERN = Pattern.compile("([^-\\s]-\n\\s*|\\S)+");

    /**
     * Splits one paragraph into sentences and appends them to the queue.
     *
     * <p>The paragraph is tokenized with {@link #TOKEN_PATTERN}. For every
     * token containing '.', '?' or '!', the last such character is offered to
     * {@code boundary()} together with the next two tokens. Each accepted
     * boundary closes a Sentence running from the first token after the
     * previous boundary through the current token. Text left over after the
     * last boundary becomes a final Sentence. Sentence offsets are positions
     * within {@code str} plus {@code currentOffset}; end offsets are inclusive.
     *
     * @param str the paragraph text; ignored when its trimmed form equals
     *            {@code WekaWrapper.defaultAttributeString}
     */
    protected void process(String str) {
        if (str.trim().equals(WekaWrapper.defaultAttributeString)) {
            return;
        }

        Matcher matcher = TOKEN_PATTERN.matcher(str);
        LinkedList<Word> tokens = new LinkedList<Word>();
        while (matcher.find()) {
            // Word end positions are inclusive, hence end() - 1.
            tokens.add(new Word(matcher.group(), matcher.start(), matcher.end() - 1));
        }
        Word[] words = tokens.toArray(new Word[tokens.size()]);
        if (words.length == 0) {
            // Nothing to split; avoids indexing an empty array below.
            return;
        }

        int sentenceStart = words[0].start;
        boolean trailingText = true;
        for (int k = 0; k < words.length; k++) {
            Word current = words[k];
            // Position of the last sentence-final punctuation mark in the token.
            int mark = Math.max(current.form.lastIndexOf('.'),
                    Math.max(current.form.lastIndexOf('?'), current.form.lastIndexOf('!')));
            if (mark == -1) {
                continue;
            }

            Word next1 = k + 1 < words.length ? words[k + 1] : null;
            Word next2 = k + 2 < words.length ? words[k + 2] : null;
            if (!boundary(mark, current, next1, next2)) {
                continue;
            }

            this.sentences.add(new Sentence(str.substring(sentenceStart, current.end + 1),
                    this.currentOffset + sentenceStart, this.currentOffset + current.end));
            if (next1 != null) {
                sentenceStart = next1.start;
            } else {
                // The paragraph ended exactly on a boundary.
                trailingText = false;
            }
        }

        if (trailingText) {
            Word last = words[words.length - 1];
            this.sentences.add(new Sentence(str.substring(sentenceStart, last.end + 1),
                    this.currentOffset + sentenceStart, this.currentOffset + last.end));
        }
    }

    /**
     * Matches text made only of capital letters separated by periods, ending
     * in a capital letter (for example "A", "U.S" or "J.R.R"). Compiled once
     * because the expression never changes.
     */
    private static final Pattern INITIALS_PATTERN = Pattern.compile("^([A-Z]\\.)*[A-Z]$");

    /**
     * Decides whether the punctuation mark at index {@code i} of {@code word}
     * ends a sentence.
     *
     * <p>The word is divided into the text before the mark (prefix), the text
     * after it (suffix), and the prefix with leading quote and open-bracket
     * characters removed (root). The decision then depends on those pieces
     * and on the following one or two words.
     *
     * @param i     index within {@code word.form} of a '.', '?' or '!'
     * @param word  the word containing the mark
     * @param word2 the following word, or {@code null} if {@code word} is last
     * @param word3 the word after {@code word2}, or {@code null} if there is none
     * @return {@code true} if a sentence ends at this mark
     */
    protected boolean boundary(int i, Word word, Word word2, Word word3) {
        char mark = word.form.charAt(i);
        Word prefix = new Word(word.form.substring(0, i));
        Word suffix = new Word(word.form.substring(i + 1));
        Word root = new Word(prefix.form);
        while (root.form.length() > 0 && "\"'`{[(".indexOf(root.form.charAt(0)) != -1) {
            root.form = root.form.substring(1);
        }

        // "Yahoo!" is a name, not an exclamation.
        if ("yahoo!".equalsIgnoreCase(root.form + mark)) {
            return false;
        }

        if (mark == '?' || mark == '!') {
            return word2 == null
                    || (suffix.form.equals(WekaWrapper.defaultAttributeString)
                            && (word2.capitalized
                                    || startsWithQuote(word2)
                                    || word2.form.equals(".")
                                    || (word3 != null && word3.capitalized
                                            && (word2.form.equals("--") || word2.form.equals("-RBR-")))))
                    || (isClose(suffix) && hasStartMarker(word2));
        }

        // From here on the mark is a period.
        if (word2 == null) {
            return true;
        }
        if (suffix.form.equals(WekaWrapper.defaultAttributeString)) {
            if (startsWithQuote(word2) || startsWithOpenBracket(word2)) {
                return true;
            }
            // word3 is null when word2 is the last word of the paragraph, so
            // every rule that inspects it must check for null first.
            if (word2.form.equals("-RBR-") && word3 != null && word3.form.equals("--")) {
                return false;
            }
            if (isClosingBracket(word2)) {
                return true;
            }
            if (prefix.form.equals(WekaWrapper.defaultAttributeString) && word2.form.equals(".")) {
                return false;
            }
            if (word2.form.equals(".")) {
                return true;
            }
            if (word2.form.equals("--") && word3 != null && word3.capitalized && endsWithQuote(prefix)) {
                return false;
            }
            if (word2.form.equals("--") && word3 != null
                    && (word3.capitalized || startsWithQuote(word3))) {
                return true;
            }
            if (word2.capitalized || Character.isDigit(word2.form.charAt(0))) {
                if (isTerminal(root)) {
                    return true;
                }
                boolean timeOfDay = root.form.equals("p.m") || root.form.equals("a.m");
                if (timeOfDay && isTimeZone(word2)) {
                    return false;
                }
                if (isHonorific(root) || startsWithQuote(prefix) || startsWithOpenBracket(prefix)) {
                    return false;
                }
                // Initials normally continue the sentence unless the next word
                // is a recognized sentence beginner.
                if (INITIALS_PATTERN.matcher(prefix.form).find() && !sentenceBeginner(word2)) {
                    return false;
                }
                return true;
            }
        }
        return isClose(suffix) && hasStartMarker(word2) && !isHonorific(root);
    }

    /**
     * Tells whether the word is one that is taken to start a new sentence
     * even after a run of initials.
     *
     * @param word the word to test
     * @return {@code true} only when the word's form is exactly "The"
     */
    protected boolean sentenceBeginner(Word word) {
        String form = word.form;
        return form.equals("The");
    }

    /**
     * Tells whether the word's first character is a single quote, a double
     * quote or a backquote.
     *
     * @param word the word to test
     * @return {@code false} for an empty form, otherwise whether the first
     *         character is one of {@code ' " `}
     */
    protected boolean startsWithQuote(Word word) {
        String form = word.form;
        return form.length() != 0 && "'\"`".indexOf(form.charAt(0)) != -1;
    }

    /**
     * Tells whether the word's form ends with a single or double quote
     * character. Longer endings such as {@code ''} or {@code '"} are covered
     * because they end in one of those two characters.
     *
     * @param word the word to test
     * @return {@code true} if the form ends with {@code '} or {@code "}
     */
    protected boolean endsWithQuote(Word word) {
        String form = word.form;
        if (form.endsWith("'")) {
            return true;
        }
        return form.endsWith("\"");
    }

    /**
     * Tells whether the word is a closing bracket or a closing quote.
     *
     * @param word the word to test
     * @return {@code true} if either {@code isClosingBracket} or
     *         {@code isClosingQuote} accepts the word
     */
    protected boolean isClose(Word word) {
        if (isClosingBracket(word)) {
            return true;
        }
        return isClosingQuote(word);
    }

    /**
     * Tells whether the word's form is exactly one closing bracket character,
     * or the token "-RBR-".
     *
     * @param word the word to test
     * @return {@code true} for the forms {@code )}, {@code }}, {@code ]} and
     *         {@code -RBR-}
     */
    protected boolean isClosingBracket(Word word) {
        String form = word.form;
        if (form.length() == 1) {
            return ")}]".indexOf(form.charAt(0)) != -1;
        }
        return form.equals("-RBR-");
    }

    /**
     * Tells whether the word's form is exactly one of the recognized closing
     * quote tokens.
     *
     * @param word the word to test
     * @return {@code true} if the form equals one of the listed tokens
     */
    protected boolean isClosingQuote(Word word) {
        String form = word.form;
        String[] closers = {"'", "''", "'''", "\"", "'\""};
        for (String closer : closers) {
            if (form.equals(closer)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether the word looks like the start of a sentence: its
     * {@code capitalized} flag is set, or it starts with an opening quote or
     * an opening bracket.
     *
     * @param word the word to test
     * @return {@code true} if any of the three conditions holds
     */
    protected boolean hasStartMarker(Word word) {
        if (word.capitalized) {
            return true;
        }
        if (startsWithOpenQuote(word)) {
            return true;
        }
        return startsWithOpenBracket(word);
    }

    /**
     * Tells whether the word's form begins with a backquote or a double quote.
     * Longer openings such as {@code ``} or {@code "`} are covered because
     * they begin with one of those two characters.
     *
     * @param word the word to test
     * @return {@code true} if the form starts with {@code `} or {@code "}
     */
    protected boolean startsWithOpenQuote(Word word) {
        String form = word.form;
        if (form.startsWith("`")) {
            return true;
        }
        return form.startsWith("\"");
    }

    /**
     * Tells whether the word's form begins with an opening bracket character
     * or with the token "-LBR-".
     *
     * @param word the word to test
     * @return {@code true} if the form starts with {@code (}, {@code {},
     *         {@code [} or {@code -LBR-}
     */
    protected boolean startsWithOpenBracket(Word word) {
        String form = word.form;
        String[] openers = {"(", "{", "[", "-LBR-"};
        for (String opener : openers) {
            if (form.startsWith(opener)) {
                return true;
            }
        }
        return false;
    }

    /** Forms accepted by {@code isTimeZone}. */
    private static final String[] TIME_ZONES = {
        "AST", "CST", "EST", "HST", "MST", "PST",
        "ADT", "CDT", "EDT", "HDT", "MDT", "PDT",
        "UTC", "UTC-11"
    };

    /**
     * Tells whether the word's form is exactly one of the recognized time
     * zone abbreviations.
     *
     * @param word the word to test
     * @return {@code true} if the form equals an entry of {@link #TIME_ZONES}
     */
    protected boolean isTimeZone(Word word) {
        String form = word.form;
        for (String zone : TIME_ZONES) {
            if (form.equals(zone)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether the word's form (without its final period) is one of the
     * abbreviations that boundary() accepts as ending a sentence when followed
     * by a capitalized word or a digit.
     *
     * @param word the word to test
     * @return {@code true} for "Esq", "Jr", "Sr", "M.D" and "Ph.D"
     */
    protected boolean isTerminal(Word word) {
        String form = word.form;
        String[] terminals = {"Esq", "Jr", "Sr", "M.D", "Ph.D"};
        for (String terminal : terminals) {
            if (form.equals(terminal)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Forms accepted by {@code isHonorific}: titles, month abbreviations and
     * other abbreviations, written without their trailing period.
     */
    private static final String[] HONORIFICS = {
        "APR", "AUG", "Adj", "Adm", "Adv", "Apr", "Asst", "Aug",
        "Bart", "Bldg", "Brig", "Bros",
        "Capt", "Cmdr", "Col", "Comdr", "Con", "Cpl",
        "DEC", "DR", "Dec", "Dr",
        "Ens",
        "FEB", "Feb",
        "Gen", "Gov",
        "Hon", "Hosp",
        "Insp",
        "JAN", "JUL", "JUN", "Jan", "Jul", "Jun",
        "Lt",
        "MAR", "MM", "MR", "MRS", "MS", "MT", "Maj", "Mar", "Messrs", "Mlle", "Mme",
        "Mr", "Mrs", "Ms", "Msgr", "Mt",
        "NO", "NOV", "No", "Nov",
        "OCT", "Oct", "Op", "Ord",
        "Pfc", "Ph", "Prof", "Pvt",
        "Rep", "Reps", "Res", "Rev", "Rt",
        "SEP", "SEPT", "ST", "Sen", "Sens", "Sep", "Sept", "Sfc", "Sgt", "Sr", "St",
        "Supt", "Surg",
        "U.S",
        "apr", "aug", "dec", "feb", "jan", "jul", "jun", "nov", "oct", "sep", "sept",
        "v", "vs"
    };

    /**
     * Tells whether the word's form is exactly one of the abbreviations in
     * {@link #HONORIFICS}. boundary() treats a period after such a form as not
     * ending the sentence.
     *
     * @param word the word to test
     * @return {@code true} if the form equals an entry of {@link #HONORIFICS}
     */
    protected boolean isHonorific(Word word) {
        String form = word.form;
        for (String honorific : HONORIFICS) {
            if (form.equals(honorific)) {
                return true;
            }
        }
        return false;
    }
}
