package org.dice_research.topicmodeling.preprocessing.docsupplier.decorator;

import dk.brics.automaton.RegExp;
import dk.brics.automaton.RunAutomaton;
import org.apache.commons.lang3.StringEscapeUtils;
import org.dice_research.topicmodeling.preprocessing.docsupplier.DocumentSupplier;
import org.dice_research.topicmodeling.utils.doc.Document;
import org.dice_research.topicmodeling.utils.doc.DocumentText;

/* loaded from: input_file:org/dice_research/topicmodeling/preprocessing/docsupplier/decorator/NewsDeMarkupRemovingSupplierDecorator.class */
/**
 * Decorator that removes markup from the {@link DocumentText} of every document it processes.
 *
 * <p>Three kinds of markup are recognised while scanning the text from left to right:
 * <ul>
 * <li>tag-like spans {@code <...>} are replaced by a line break (for tags whose name starts with
 * {@code p} or {@code tr}, opening or closing) or by a single space (all other tags),</li>
 * <li>HTML character references (named, decimal or hexadecimal) are replaced by the character they
 * encode,</li>
 * <li>bracketed spans {@code [...]} are replaced by a single space; if the span has the form
 * {@code [tt=content]} its content is additionally appended, on a line of its own, to the end of the
 * cleaned text.</li>
 * </ul>
 */
public class NewsDeMarkupRemovingSupplierDecorator extends AbstractDocumentSupplierDecorator {

    /** Matches one tag-like span: {@code <}, any characters except angle brackets, {@code >}. */
    private final RunAutomaton tagAutomaton;

    /** Matches one HTML character reference such as {@code &amp;}, {@code &frac12;}, {@code &#8220;} or {@code &#xE4;}. */
    private final RunAutomaton charAutomaton;

    /** Matches one bracketed span: {@code [}, any characters except square brackets, {@code ]}. */
    private final RunAutomaton tooltipAutomaton;

    /**
     * Creates the decorator and compiles the automata used to detect markup.
     *
     * @param documentSupplier the supplier that is decorated
     */
    public NewsDeMarkupRemovingSupplierDecorator(DocumentSupplier documentSupplier) {
        super(documentSupplier);
        // '<', '>', '&' and '#' are operators of the dk.brics regular expression syntax and have to be
        // escaped to be taken literally.
        this.tagAutomaton = compile("\\<[^\\<\\>]*\\>");
        // Alternatives: decimal reference | hexadecimal reference | named entity. The previous pattern
        // only allowed letters after the first character and therefore never matched numeric references
        // or names containing digits.
        this.charAutomaton = compile("\\&(\\#[0-9]{1,7}|\\#[xX][0-9A-Fa-f]{1,6}|[A-Za-z][A-Za-z0-9]{1,7});");
        this.tooltipAutomaton = compile("\\[[^\\[\\]]*\\]");
    }

    /** Compiles the given dk.brics regular expression into a table-based automaton. */
    private static RunAutomaton compile(String regex) {
        return new RunAutomaton(new RegExp(regex).toAutomaton());
    }

    /**
     * Replaces the text of the document's {@link DocumentText} property with its cleaned version.
     *
     * @param document the document to process
     * @return the same document instance with the cleaned text
     * @throws IllegalArgumentException if the document has no {@link DocumentText} property
     */
    @Override
    public Document prepareDocument(Document document) {
        DocumentText property = document.getProperty(DocumentText.class);
        if (property == null) {
            throw new IllegalArgumentException("Got a Document without a DocumentText property!");
        }
        property.setText(cleanText(property.getText()));
        return document;
    }

    /**
     * Removes tags, character references and bracketed spans from the given text as described in the
     * class documentation.
     *
     * @param str the text to clean; must not be {@code null}
     * @return the cleaned text, followed by the contents of all {@code [tt=...]} spans (each preceded
     *         by a line break)
     */
    public String cleanText(String str) {
        final int length = str.length();
        StringBuilder cleaned = new StringBuilder(length);
        StringBuilder tooltips = new StringBuilder();
        int copiedUpTo = 0; // everything before this index has already been written to 'cleaned'
        int pos = 0;
        while (pos < length) {
            final char current = str.charAt(pos);
            final RunAutomaton automaton = automatonFor(current);
            // run(...) yields the length of the longest match starting at pos, or a negative value
            final int matchLength = (automaton == null) ? -1 : automaton.run(str, pos);
            if (matchLength <= 0) {
                // ordinary character or a markup start character without a complete match
                pos++;
                continue;
            }
            // flush the plain text between the previous markup span and this one
            cleaned.append(str, copiedUpTo, pos);
            if (current == '&') {
                handleHtmlEncodedChar(cleaned, str, pos, matchLength);
            } else if (current == '<') {
                handleHtmlTag(cleaned, str, pos, matchLength);
            } else {
                handleToolTip(cleaned, tooltips, str, pos, matchLength);
            }
            pos += matchLength;
            copiedUpTo = pos;
        }
        cleaned.append(str, copiedUpTo, length);
        cleaned.append(tooltips);
        return cleaned.toString();
    }

    /**
     * Returns the automaton responsible for markup starting with the given character or {@code null}
     * if the character does not start any markup.
     */
    private RunAutomaton automatonFor(char c) {
        switch (c) {
            case '&':
                return this.charAutomaton;
            case '<':
                return this.tagAutomaton;
            case '[':
                return this.tooltipAutomaton;
            default:
                return null;
        }
    }

    /**
     * Appends the replacement for the tag that starts at {@code start} and is {@code length} characters
     * long (including the angle brackets).
     */
    private void handleHtmlTag(StringBuilder sb, String str, int start, int length) {
        // tag content without the leading '<' (the trailing '>' is kept, it cannot be part of a prefix)
        String tag = str.substring(start + 1, start + length);
        // NOTE: this is a plain prefix check, i.e., every tag name starting with "p" or "tr"
        // (e.g. "pre") is treated like a paragraph or table row.
        boolean lineBreaking = tag.startsWith("p") || tag.startsWith("/p")
                || tag.startsWith("tr") || tag.startsWith("/tr");
        sb.append(lineBreaking ? '\n' : ' ');
    }

    /**
     * Appends the decoded form of the character reference that starts at {@code start} and is
     * {@code length} characters long. References unknown to the decoder are appended unchanged.
     */
    private void handleHtmlEncodedChar(StringBuilder sb, String str, int start, int length) {
        sb.append(StringEscapeUtils.unescapeHtml4(str.substring(start, start + length)));
    }

    /**
     * Appends a space in place of the bracketed span that starts at {@code start} and is
     * {@code length} characters long. If the span has the form {@code [tt=content]}, the content is
     * collected in {@code tooltips}, preceded by a line break.
     */
    private void handleToolTip(StringBuilder sb, StringBuilder tooltips, String str, int start, int length) {
        if (str.startsWith("tt=", start + 1)) {
            tooltips.append('\n');
            // skip "[tt=" at the beginning and "]" at the end
            tooltips.append(str, start + 4, start + length - 1);
        }
        sb.append(' ');
    }
}
