package org.dice_research.topicmodeling.preprocessing.docsupplier.decorator;

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringEscapeUtils;
import org.dice_research.topicmodeling.preprocessing.docsupplier.DocumentSupplier;
import org.dice_research.topicmodeling.utils.doc.Document;
import org.dice_research.topicmodeling.utils.doc.DocumentText;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Document supplier decorator that strips MediaWiki markup from the {@link DocumentText} of every
 * document it processes and afterwards decodes HTML entities in the remaining text.
 *
 * <p>The class is marked deprecated; no replacement is named in this file.
 */
@Deprecated
public class WikipediaMarkupDeletingDecorator extends AbstractDocumentSupplierDecorator {
    private static final Logger LOGGER = LoggerFactory.getLogger(WikipediaMarkupDeletingDecorator.class);

    /** Redirect keywords (English and German) that may prefix the text of a redirect page. */
    private static final String[] REDIRECT_KEYWORDS = {"#redirect", "#weiterleitung"};

    /** Template delimiters: group 1 matches the opener, group 2 the closer. */
    private static final Pattern TEMPLATE_DELIMITERS = Pattern.compile("(\\{\\{)|(\\}\\})");

    /** HTML comment delimiters: group 1 matches the opener, group 2 the closer. */
    private static final Pattern COMMENT_DELIMITERS = Pattern.compile("(<!--)|(-->)");

    /** A single HTML entity such as a named or numeric character reference. */
    private static final Pattern HTML_ENTITY = Pattern.compile("(&[#\\p{Alnum}][\\p{Alnum}]*;)");

    /**
     * Link rules, applied in this order after templates have been removed. The order matters:
     * piped links have to be reduced to their label before the bare brackets are stripped.
     */
    private static final Rule[] LINK_RULES = {
        // internal link with label: drop the target up to the pipe, keep the label
        new Rule("\\[\\[[^\\]]*\\|", ""),
        new Rule("\\]\\]", ""),
        // internal link without label: keep the target text
        new Rule("\\[\\[", ""),
        new Rule("\\]\\]", ""),
        // external link with label: drop everything up to the last space inside the brackets
        new Rule("\\[[^\\]]* ", ""),
        new Rule("\\]", ""),
        // any remaining single brackets
        new Rule("\\[", ""),
        new Rule("\\]", ""),
    };

    /** Formatting and table rules, applied in this order after comments have been removed. */
    private static final Rule[] FORMATTING_RULES = {
        // HTML-like tags are replaced by a blank so that neighbouring words stay separated
        new Rule("<[^>]*>", " "),
        new Rule("</[^>]*>", " "),
        // Heading markers and bold/italic quotes. The original pipeline ran each of these two
        // patterns twice in a row; the second pass could never match (a run leaves at most one
        // character behind and separate runs never merge), so it is applied once here.
        new Rule("([=]{2,6})", ""),
        new Rule("([']{2,5})", ""),
        // list, definition and indentation markers at the start of a line
        new Rule("\\n[*#;:]+", "\n"),
        // horizontal rule
        new Rule("----", ""),
        // table start line
        new Rule("\\n\\{\\|[^\\n]*\\n", "\n"),
        // table header cell, first with attributes up to the pipe, then without
        new Rule("\\n\\![^\\|\\n]*\\|", "\n"),
        new Rule("\\n\\![^\\|\\n]*", "\n"),
        // table end, table row separator line, cell marker at the start of a line
        new Rule("\\n\\|\\}", "\n"),
        new Rule("\\n\\|-[^\\n]*\\n", "\n"),
        new Rule("\\n\\|", "\n"),
    };

    /**
     * Creates a decorator around the given supplier.
     *
     * @param documentSupplier the decorated supplier, handed to the superclass constructor
     */
    public WikipediaMarkupDeletingDecorator(DocumentSupplier documentSupplier) {
        super(documentSupplier);
    }

    /**
     * Replaces the text of the document's {@link DocumentText} property with a markup-free version.
     *
     * @param document the document to clean
     * @return the same document instance with cleaned text, or {@code null} (after logging an
     *         error) if the document has no {@link DocumentText} property
     */
    @Override
    public Document prepareDocument(Document document) {
        DocumentText documentText = document.getProperty(DocumentText.class);
        if (documentText == null) {
            LOGGER.error("Got a Document without a DocumentText property!");
            return null;
        }
        String text = documentText.getText();
        if (text == null) {
            // Nothing to clean; pass the document through instead of failing with a
            // NullPointerException inside the markup removal.
            return document;
        }
        documentText.setText(removeWikiMarkup(text));
        return document;
    }

    /**
     * Runs the complete cleaning pipeline: redirect keyword, templates, links, comments,
     * formatting and table markup, and finally HTML entity decoding.
     */
    private static String removeWikiMarkup(String text) {
        String cleaned = stripRedirectKeyword(text);
        cleaned = deleteDelimited(TEMPLATE_DELIMITERS, cleaned, "");
        cleaned = applyRules(LINK_RULES, cleaned);
        cleaned = deleteDelimited(COMMENT_DELIMITERS, cleaned, " ");
        cleaned = applyRules(FORMATTING_RULES, cleaned);
        return unescapeSymbols(cleaned);
    }

    /**
     * Removes a leading redirect keyword. The comparison ignores case without going through
     * {@code String.toLowerCase()}, whose result depends on the default locale (with a Turkish
     * locale an upper-case keyword would not be recognised).
     */
    private static String stripRedirectKeyword(String text) {
        for (String keyword : REDIRECT_KEYWORDS) {
            if (text.regionMatches(true, 0, keyword, 0, keyword.length())) {
                return text.substring(keyword.length());
            }
        }
        return text;
    }

    /** Applies every rule of the given table to the text, in table order. */
    private static String applyRules(Rule[] rules, String text) {
        String result = text;
        for (Rule rule : rules) {
            result = rule.applyTo(result);
        }
        return result;
    }

    /**
     * Removes every outermost opener/closer pair together with its (possibly nested) content and
     * inserts {@code replacement} in its place.
     *
     * <p>Unbalanced input is left untouched where it cannot be paired: a closer without a
     * preceding opener stays in the text and does not affect later pairs, and an opener that is
     * never closed is kept verbatim together with everything that follows it.
     *
     * @param delimiters pattern whose group 1 matches an opener and whose group 2 matches a closer
     * @param text the text to clean
     * @param replacement the text inserted for each removed section
     * @return the text without the delimited sections
     */
    private static String deleteDelimited(Pattern delimiters, String text, String replacement) {
        Matcher matcher = delimiters.matcher(text);
        StringBuilder result = new StringBuilder(text.length());
        int depth = 0;
        // Start of the part of the text that has not been copied to the result yet. While a
        // section is open it points at the outermost opener, so an unterminated section can
        // still be copied as it is.
        int copyFrom = 0;
        while (matcher.find()) {
            if (matcher.group(1) != null) {
                if (depth == 0) {
                    result.append(text, copyFrom, matcher.start());
                    copyFrom = matcher.start();
                }
                depth++;
            } else if (depth > 0) {
                depth--;
                if (depth == 0) {
                    result.append(replacement);
                    copyFrom = matcher.end();
                }
            }
            // A closer at depth 0 has no opener; it is ignored and copied with the plain text.
        }
        result.append(text, copyFrom, text.length());
        return result.toString();
    }

    /**
     * Decodes every HTML entity found in the text with
     * {@link StringEscapeUtils#unescapeHtml4(String)}; all other characters are copied unchanged.
     */
    private static String unescapeSymbols(String text) {
        Matcher matcher = HTML_ENTITY.matcher(text);
        StringBuilder result = new StringBuilder(text.length());
        int copyFrom = 0;
        while (matcher.find()) {
            result.append(text, copyFrom, matcher.start());
            result.append(StringEscapeUtils.unescapeHtml4(matcher.group()));
            copyFrom = matcher.end();
        }
        result.append(text, copyFrom, text.length());
        return result.toString();
    }

    /** A precompiled regular expression together with the text that replaces each match. */
    private static final class Rule {
        private final Pattern pattern;
        private final String replacement;

        Rule(String regex, String replacement) {
            this.pattern = Pattern.compile(regex);
            this.replacement = replacement;
        }

        /** Replaces every match of the pattern in the given text. */
        String applyTo(String text) {
            return pattern.matcher(text).replaceAll(replacement);
        }
    }
}
