package org.dice_research.topicmodeling.preprocessing.docsupplier.decorator;

import org.apache.commons.lang3.StringEscapeUtils;
import org.dice_research.topicmodeling.preprocessing.docsupplier.DocumentSupplier;
import org.dice_research.topicmodeling.utils.doc.Document;
import org.dice_research.topicmodeling.utils.doc.DocumentText;

/* loaded from: input_file:org/dice_research/topicmodeling/preprocessing/docsupplier/decorator/SimpleHtmlCleaner.class */
/**
 * Document supplier decorator that removes HTML markup from the
 * {@link DocumentText} of every document it processes and replaces HTML
 * character references (e.g. {@code &amp;amp;}) by the characters they stand
 * for.
 *
 * <p>
 * The cleaning is done by a small hand-written scanner (see
 * {@link #clearText(String)}), not by a real HTML parser, so it only
 * understands "everything between {@code '<'} and the next {@code '>'} is a
 * tag".
 * </p>
 */
public class SimpleHtmlCleaner extends AbstractDocumentSupplierDecorator {

    /**
     * States of the scanner used by {@link #clearText(String)}.
     */
    public enum States {
        /** The scanner is outside of any tag or character reference. */
        NORMAL_TEXT,
        /** A {@code '<'} has been read and the closing {@code '>'} is still pending. */
        TAG_STARTED,
        /** A {@code '&'} has been read and the terminating {@code ';'} is still pending. */
        ENCODED_CHAR_STARTED
    }

    /**
     * Creates a cleaner decorating the given supplier.
     *
     * @param documentSupplier
     *            the decorated supplier; handed unchanged to the super class
     */
    public SimpleHtmlCleaner(DocumentSupplier documentSupplier) {
        super(documentSupplier);
    }

    /**
     * Replaces the text of the document's {@link DocumentText} property by its
     * cleaned version.
     *
     * @param document
     *            the document to clean; it is modified in place
     * @return the same {@code document} instance
     * @throws IllegalArgumentException
     *             if the document has no {@link DocumentText} property
     */
    @Override
    public Document prepareDocument(Document document) {
        DocumentText property = document.getProperty(DocumentText.class);
        if (property == null) {
            throw new IllegalArgumentException("Got a Document without a DocumentText property!");
        }
        property.setText(clearText(property.getText()));
        return document;
    }

    /**
     * Removes tags from the given text and decodes character references.
     *
     * <ul>
     * <li>Everything from a {@code '<'} up to and including the next
     * {@code '>'} is dropped. A {@code '<'} that is never closed is kept
     * together with the text following it.</li>
     * <li>A {@code '&'} followed only by ASCII letters/digits (optionally
     * preceded by a {@code '#'} directly after the {@code '&'}) and terminated
     * by {@code ';'} is passed to
     * {@link StringEscapeUtils#unescapeHtml4(String)}; whatever that method
     * returns is written to the result.</li>
     * <li>All other characters, including a {@code '>'} that does not close a
     * tag, are copied unchanged.</li>
     * </ul>
     *
     * @param str
     *            the text to clean; must not be {@code null}
     * @return the cleaned text
     * @throws NullPointerException
     *             if {@code str} is {@code null}
     */
    public String clearText(String str) {
        States state = States.NORMAL_TEXT;
        // Index of the '<' or '&' that opened the tag/reference currently being read.
        int candidateStart = 0;
        // Everything before this index has already been copied to the result or dropped.
        int copiedUpTo = 0;
        StringBuilder result = new StringBuilder(str.length());
        for (int pos = 0; pos < str.length(); pos++) {
            char c = str.charAt(pos);
            switch (state) {
                case NORMAL_TEXT:
                    if (c == '<') {
                        state = States.TAG_STARTED;
                        candidateStart = pos;
                    } else if (c == '&') {
                        state = States.ENCODED_CHAR_STARTED;
                        candidateStart = pos;
                    }
                    // This break is essential: without it a '>' in plain text is
                    // handled as the end of a tag, which drops text or even asks
                    // for a substring with start > end.
                    break;
                case TAG_STARTED:
                    if (c == '>') {
                        // Keep the text in front of the tag, skip the tag itself.
                        result.append(str, copiedUpTo, candidateStart);
                        copiedUpTo = pos + 1;
                        state = States.NORMAL_TEXT;
                    }
                    break;
                case ENCODED_CHAR_STARTED:
                    // Note: the previous implementation computed the reference
                    // length as (start - current), which is never positive, so its
                    // "longer than 7 characters" check never fired. The limit is
                    // deliberately not enforced here because it would reject valid
                    // references such as "&epsilon;".
                    if (c == ';') {
                        result.append(str, copiedUpTo, candidateStart);
                        copiedUpTo = pos + 1;
                        result.append(StringEscapeUtils.unescapeHtml4(str.substring(candidateStart, copiedUpTo)));
                        state = States.NORMAL_TEXT;
                    } else if (c == '<') {
                        // The '&' was plain text; a tag starts here.
                        state = States.TAG_STARTED;
                        candidateStart = pos;
                    } else if (c == '&') {
                        // The previous '&' was plain text; this one may start a
                        // real reference, so restart the candidate here.
                        candidateStart = pos;
                    } else if (c == '#') {
                        // '#' is only valid directly after the '&' (numeric reference).
                        if (pos - candidateStart != 1) {
                            state = States.NORMAL_TEXT;
                        }
                    } else if (!isAsciiLetterOrDigit(c)) {
                        // Any other character means this was not a reference.
                        state = States.NORMAL_TEXT;
                    }
                    break;
                default:
                    break;
            }
        }
        // Copy the remainder, including an unterminated tag or reference.
        result.append(str, copiedUpTo, str.length());
        return result.toString();
    }

    /** Returns {@code true} if {@code c} is one of 0-9, A-Z or a-z. */
    private static boolean isAsciiLetterOrDigit(char c) {
        return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
    }
}
