package org.aksw.simba.topicmodeling.preprocessing.docsupplier.decorator.splitter;

import java.util.Iterator;
import java.util.List;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.util.Span;
import org.aksw.simba.topicmodeling.lang.Term;
import org.aksw.simba.topicmodeling.preprocessing.docsupplier.DocumentSupplier;
import org.aksw.simba.topicmodeling.utils.doc.Document;
import org.aksw.simba.topicmodeling.utils.doc.DocumentProperty;
import org.aksw.simba.topicmodeling.utils.doc.DocumentSentenceBoundary;
import org.aksw.simba.topicmodeling.utils.doc.DocumentText;
import org.aksw.simba.topicmodeling.utils.doc.TermTokenizedText;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/aksw/simba/topicmodeling/preprocessing/docsupplier/decorator/splitter/SentenceBasedDocumentTextSplitter.class */
/**
 * Splitting decorator that turns every incoming document into one document per
 * sentence, using an OpenNLP {@link SentenceDetectorME} to find the sentence
 * boundaries inside the document's {@link DocumentText}.
 *
 * <p>If the source document also carries a {@link TermTokenizedText}, its terms
 * are distributed over the sentence documents by locating each term's word form
 * in the original text. The generated documents are appended to the
 * {@code queue} inherited from the superclass.</p>
 */
public class SentenceBasedDocumentTextSplitter extends AbstractSplittingDocumentSupplierDecorator {
    private static final Logger LOGGER = LoggerFactory.getLogger(SentenceBasedDocumentTextSplitter.class);

    /** Detector used to determine the sentence spans of a document text. */
    private final SentenceDetectorME sentenceDetector;

    /**
     * Creates a splitter that decorates the given supplier.
     *
     * @param documentSupplier  the supplier whose documents are split
     * @param sentenceDetectorME the sentence detector used to find sentence spans
     */
    public SentenceBasedDocumentTextSplitter(DocumentSupplier documentSupplier, SentenceDetectorME sentenceDetectorME) {
        super(documentSupplier);
        this.sentenceDetector = sentenceDetectorME;
    }

    /**
     * Splits the given document into sentence documents and enqueues them.
     * A {@code null} document or a document without {@link DocumentText} is
     * ignored (the latter is reported at info level).
     *
     * @param document the document to split, may be {@code null}
     */
    @Override // org.aksw.simba.topicmodeling.preprocessing.docsupplier.decorator.splitter.AbstractSplittingDocumentSupplierDecorator
    protected void splitDocument(Document document) {
        if (document == null) {
            return;
        }
        DocumentText documentText = document.getProperty(DocumentText.class);
        if (documentText == null) {
            LOGGER.info("Got a document without the needed DocumentText property. Ignoring this document.");
            return;
        }
        TermTokenizedText tokenizedText = document.getProperty(TermTokenizedText.class);
        if (tokenizedText == null) {
            splitDocument(document, documentText.getText());
        } else {
            splitDocument(document, documentText.getText(), tokenizedText.getTermTokenizedText());
        }
    }

    /**
     * Splits a document that has a term list. Every detected sentence yields
     * one document (blank sentences are not filtered in this variant); each
     * term is attached to the sentence in which its word form is found.
     *
     * @param document the source document whose properties are copied
     * @param str      the full text of the source document
     * @param list     the terms of the source document, in text order
     */
    private void splitDocument(Document document, String str, List<Term> list) {
        Span[] sentences = this.sentenceDetector.sentPosDetect(str);
        if (sentences.length == 0) {
            return;
        }
        // Position in the text from which the next term is searched; advancing it
        // keeps repeated word forms matched in order of appearance.
        int searchFrom = 0;
        int sentenceIdx = 0;
        TermTokenizedText sentenceTerms = new TermTokenizedText();
        for (Term term : list) {
            String wordForm = term.getWordForm();
            int termStart = str.indexOf(wordForm, searchFrom);
            if (termStart < 0) {
                LOGGER.error(
                        "Couldn't find a term inside the given text. Ignoring this term.\nTerm = {}\nremaining text = \"{}\".",
                        term, str.substring(searchFrom));
                continue;
            }
            // The span end is exclusive (createDocument uses substring(start, end)),
            // so a term starting exactly at the end already lies outside the
            // current sentence and must be moved on to a following one.
            while (termStart >= sentences[sentenceIdx].getEnd()) {
                this.queue.add(createDocument(str, sentences[sentenceIdx], sentenceTerms, document));
                sentenceIdx++;
                if (sentenceIdx >= sentences.length) {
                    // All sentence documents have been enqueued at this point.
                    LOGGER.warn("I have seen all sentences. But I still have unprocessed terms. They will be lost.");
                    return;
                }
                sentenceTerms = new TermTokenizedText();
            }
            sentenceTerms.addTerm(term);
            searchFrom = termStart + wordForm.length();
        }
        // Emit the sentence that received the last terms ...
        this.queue.add(createDocument(str, sentences[sentenceIdx], sentenceTerms, document));
        // ... and every remaining sentence with an empty term list.
        for (sentenceIdx++; sentenceIdx < sentences.length; sentenceIdx++) {
            this.queue.add(createDocument(str, sentences[sentenceIdx], new TermTokenizedText(), document));
        }
    }

    /**
     * Splits a document that has no term list. Sentences consisting only of
     * whitespace are skipped.
     *
     * @param document the source document whose properties are copied
     * @param str      the full text of the source document
     */
    private void splitDocument(Document document, String str) {
        Span[] sentences = this.sentenceDetector.sentPosDetect(str);
        for (Span sentence : sentences) {
            if (str.substring(sentence.getStart(), sentence.getEnd()).trim().length() > 0) {
                this.queue.add(createDocument(str, sentence, null, document));
            }
        }
    }

    /** Convenience overload that takes the sentence offsets from the given span. */
    private Document createDocument(String str, Span sentence, TermTokenizedText termTokenizedText,
            Document document) {
        return createDocument(str, sentence.getStart(), sentence.getEnd(), termTokenizedText, document);
    }

    /**
     * Builds the document for a single sentence. All properties of the source
     * document are added first, followed by the sentence text, the optional
     * term list and the sentence boundary.
     *
     * @param str               the full text of the source document
     * @param i                 start offset of the sentence (inclusive)
     * @param i2                end offset of the sentence (exclusive)
     * @param termTokenizedText terms of the sentence, or {@code null} to add none
     * @param document          the source document whose properties are copied
     * @return the newly created sentence document
     */
    private Document createDocument(String str, int i, int i2, TermTokenizedText termTokenizedText, Document document) {
        Document sentenceDocument = new Document(getNextDocumentId());
        Iterator<?> properties = document.iterator();
        while (properties.hasNext()) {
            sentenceDocument.addProperty((DocumentProperty) properties.next());
        }
        // NOTE(review): presumably addProperty replaces a copied property of the
        // same type (e.g. the full DocumentText) - confirm against Document.
        sentenceDocument.addProperty(new DocumentText(str.substring(i, i2)));
        if (termTokenizedText != null) {
            sentenceDocument.addProperty(termTokenizedText);
        }
        // Second argument is end - start, i.e. the length of the sentence.
        sentenceDocument.addProperty(new DocumentSentenceBoundary(i, i2 - i));
        return sentenceDocument;
    }
}
