package org.aksw.simba.topicmodeling.preprocessing.docsupplier.decorator;

import java.nio.charset.Charset;
import java.util.Locale;
import org.aksw.simba.topicmodeling.preprocessing.docsupplier.DocumentSupplier;
import org.aksw.simba.topicmodeling.utils.doc.Document;
import org.aksw.simba.topicmodeling.utils.doc.DocumentCharset;
import org.aksw.simba.topicmodeling.utils.doc.DocumentProperty;
import org.aksw.simba.topicmodeling.utils.doc.DocumentText;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/aksw/simba/topicmodeling/preprocessing/docsupplier/decorator/HtmlCharsetExtractingSupplierDecorator.class */
/**
 * Decorator that inspects the {@code <meta>} tags inside the {@code <head>} of
 * an HTML document text, extracts the charset declared there and, if it
 * differs from the charset currently attached to the document, re-encodes the
 * text and replaces the document's {@link DocumentCharset} property.
 *
 * <p>
 * Two declaration forms are recognised:
 * <ul>
 * <li>{@code <meta charset="...">}</li>
 * <li>{@code <meta http-equiv="content-type" content="...; charset=...">}
 * (the attribute may also be called {@code name})</li>
 * </ul>
 */
public class HtmlCharsetExtractingSupplierDecorator extends AbstractDocumentSupplierDecorator {
    private static final Logger LOGGER = LoggerFactory.getLogger(HtmlCharsetExtractingSupplierDecorator.class);
    /** Charset assumed for documents that carry no {@link DocumentCharset} property. */
    private static final Charset DEFAULT_CHARSET = Charset.forName("UTF-8");
    /** Start of a meta tag inside the (lower-cased) head. */
    private static final String META_TAG_START = "<meta ";
    /** Key that precedes the charset name inside a meta tag. */
    private static final String CHARSET_KEY = "charset=";

    /* loaded from: input_file:org/aksw/simba/topicmodeling/preprocessing/docsupplier/decorator/HtmlCharsetExtractingSupplierDecorator$StringWithCharset.class */
    /**
     * Mutable pair of a text and the charset the text is currently interpreted
     * in.
     */
    public static class StringWithCharset {
        private String string;
        private Charset charset;

        /**
         * @param str
         *            the text
         * @param charset
         *            the charset the text is currently interpreted in
         */
        public StringWithCharset(String str, Charset charset) {
            this.string = str;
            this.charset = charset;
        }

        /**
         * Replaces the text without touching the charset.
         *
         * @param str
         *            the new text
         */
        public void setString(String str) {
            this.string = str;
        }

        /**
         * Re-interprets the text in the given charset: the text is encoded
         * with the current charset and the resulting bytes are decoded with
         * the new one. Nothing happens if the new charset equals the current
         * one, which avoids a needless (and potentially lossy) round trip.
         *
         * @param charset
         *            the charset the text should be interpreted in
         */
        public void changeCharset(Charset charset) {
            if (charset.equals(this.charset)) {
                return;
            }
            this.string = new String(this.string.getBytes(this.charset), charset);
            this.charset = charset;
        }

        /**
         * @return the text
         */
        public String getString() {
            return this.string;
        }

        /**
         * @return the charset the text is currently interpreted in
         */
        public Charset getCharset() {
            return this.charset;
        }
    }

    /**
     * Wraps the given supplier in a {@link DocumentTextCreatingSupplierDecorator}
     * configured with ISO-8859-1 and decorates the result.
     *
     * @param documentSupplier
     *            the supplier delivering the documents
     */
    public HtmlCharsetExtractingSupplierDecorator(DocumentSupplier documentSupplier) {
        super(new DocumentTextCreatingSupplierDecorator(documentSupplier, Charset.forName("ISO-8859-1")));
    }

    /**
     * Checks the charset declared inside the HTML text of the document and
     * updates the document's text and {@link DocumentCharset} property if the
     * declared charset differs from the one attached to the document.
     *
     * @param document
     *            the document to process; must have a {@link DocumentText}
     *            property
     * @return the given document
     * @throws IllegalArgumentException
     *             if the document has no {@link DocumentText} property
     */
    @Override // org.aksw.simba.topicmodeling.preprocessing.docsupplier.decorator.AbstractDocumentSupplierDecorator
    public Document prepareDocument(Document document) {
        DocumentText property = document.getProperty(DocumentText.class);
        if (property == null) {
            throw new IllegalArgumentException("Got a Document without a DocumentText property!");
        }
        DocumentCharset documentCharset = (DocumentCharset) document.getProperty(DocumentCharset.class);
        if (documentCharset == null) {
            // NOTE(review): the wrapped text-creating decorator is configured
            // with ISO-8859-1 while this fallback is UTF-8 - confirm that the
            // wrapped decorator attaches a DocumentCharset property itself.
            documentCharset = new DocumentCharset(DEFAULT_CHARSET);
            document.addProperty(documentCharset);
        }
        StringWithCharset checked = checkEncoding(
                new StringWithCharset(property.getText(), documentCharset.getCharset()));
        if (!documentCharset.getCharset().equals(checked.getCharset())) {
            property.setText(checked.getString());
            document.addProperty(new DocumentCharset(checked.getCharset()));
        }
        return document;
    }

    /**
     * Re-encodes the given text if its HTML head declares a charset that
     * differs from the one the text is currently interpreted in.
     *
     * @param stringWithCharset
     *            the text and its current charset; modified in place
     * @return the given (possibly modified) instance
     */
    private StringWithCharset checkEncoding(StringWithCharset stringWithCharset) {
        Charset declared = extractCharset(stringWithCharset.getString());
        if (declared != null && !stringWithCharset.getCharset().equals(declared)) {
            stringWithCharset.changeCharset(declared);
        }
        return stringWithCharset;
    }

    /**
     * Searches the meta tags of the HTML head for the first charset
     * declaration that names a charset known to this JVM.
     *
     * @param str
     *            the HTML text
     * @return the declared charset or {@code null} if none could be determined
     */
    private Charset extractCharset(String str) {
        String head = extractLowercasedHead(str);
        if (head == null) {
            LOGGER.warn("HTML page without <head>. Couldn't extract Charset.");
            return null;
        }
        int metaStart = head.indexOf(META_TAG_START);
        while (metaStart >= 0) {
            int metaEnd = head.indexOf('>', metaStart);
            if (metaEnd < 0) {
                // Unterminated meta tag. Continuing the search from -1 would
                // find the very same tag again and never terminate.
                break;
            }
            String charsetName = extractCharsetFromMetaTag(head, metaStart, metaEnd);
            if (charsetName != null) {
                charsetName = charsetName.trim();
                if (!charsetName.isEmpty()) {
                    try {
                        return Charset.forName(charsetName);
                    } catch (IllegalArgumentException e) {
                        // Covers illegal as well as unsupported charset names;
                        // keep looking at the remaining meta tags.
                        LOGGER.warn("Ignoring illegal or unsupported charset name \"{}\" found in a meta tag.",
                                charsetName, e);
                    }
                }
            }
            metaStart = head.indexOf(META_TAG_START, metaEnd);
        }
        return null;
    }

    /**
     * Extracts the lower-cased head of the given HTML text. The head starts at
     * {@code <head} and ends before {@code </head>} or, if there is no closing
     * tag, before {@code <body}. Matching is case insensitive and independent
     * of the default locale.
     *
     * @param str
     *            the HTML text, may be {@code null}
     * @return the lower-cased head or {@code null} if its start or end could
     *         not be found
     */
    private String extractLowercasedHead(String str) {
        if (str == null) {
            return null;
        }
        // Lower-case once up front so that start and end markers are found
        // regardless of their case and all indexes refer to the same string.
        String lowered = str.toLowerCase(Locale.ROOT);
        int headStart = lowered.indexOf("<head");
        if (headStart < 0) {
            return null;
        }
        int headEnd = lowered.indexOf("</head>", headStart);
        if (headEnd < 0) {
            headEnd = lowered.indexOf("<body", headStart);
            if (headEnd < 0) {
                return null;
            }
        }
        return lowered.substring(headStart, headEnd);
    }

    /**
     * Extracts the charset name declared by a single meta tag.
     *
     * @param str
     *            the lower-cased head containing the tag
     * @param i
     *            index at which the meta tag starts
     * @param i2
     *            index of the {@code >} closing the meta tag
     * @return the raw charset name or {@code null} if the tag declares none
     */
    private String extractCharsetFromMetaTag(String str, int i, int i2) {
        int charsetPos = str.indexOf(CHARSET_KEY, i);
        if (isOutside(charsetPos, i2)) {
            return null;
        }
        int charsetValueStart = charsetPos + CHARSET_KEY.length();

        int attributePos = str.indexOf("name", i);
        if (isOutside(attributePos, i2)) {
            attributePos = str.indexOf("http-equiv", i);
        }
        if (isOutside(attributePos, i2)) {
            // <meta charset="..."> form: the charset is an attribute value.
            return readAttributeValue(str, charsetValueStart, i2);
        }

        // <meta http-equiv="content-type" content="...; charset=..."> form:
        // only accept the tag if the attribute really says content-type.
        int equalsPos = str.indexOf('=', attributePos);
        if (isOutside(equalsPos, i2)) {
            return null;
        }
        if (!"content-type".equals(readAttributeValue(str, equalsPos + 1, i2))) {
            return null;
        }
        int charsetValueEnd = nearestDelimiter(str, charsetValueStart, i2, ';', '"', '\'', '/', '>');
        if (charsetValueEnd < 0) {
            return null;
        }
        return str.substring(charsetValueStart, charsetValueEnd);
    }

    /**
     * Reads a quoted or unquoted attribute value that starts at the given
     * index.
     *
     * @param str
     *            the string containing the tag
     * @param valueStart
     *            index of the first character after the {@code =}
     * @param limit
     *            index of the {@code >} closing the tag
     * @return the value without quotes or {@code null} if its end lies outside
     *         of the tag
     */
    private static String readAttributeValue(String str, int valueStart, int limit) {
        if (valueStart > limit) {
            return null;
        }
        int start = valueStart;
        int end;
        char first = str.charAt(start);
        if (first == '"' || first == '\'') {
            start++;
            end = str.indexOf(first, start);
            if (isOutside(end, limit)) {
                return null;
            }
        } else {
            end = nearestDelimiter(str, start, limit, ' ', '/', '>');
            if (end < 0) {
                return null;
            }
        }
        return str.substring(start, end);
    }

    /**
     * Finds the delimiter that is closest to the given start index. Taking the
     * nearest one (instead of trying the delimiters in a fixed order) makes
     * sure that a value is not extended into a following attribute.
     *
     * @param str
     *            the string to search
     * @param from
     *            index at which the search starts
     * @param limit
     *            largest index that is still accepted
     * @param delimiters
     *            the characters terminating a value
     * @return the smallest index of a delimiter in {@code [from, limit]} or
     *         {@code -1} if there is none
     */
    private static int nearestDelimiter(String str, int from, int limit, char... delimiters) {
        int nearest = -1;
        for (char delimiter : delimiters) {
            int pos = str.indexOf(delimiter, from);
            if (!isOutside(pos, limit) && (nearest < 0 || pos < nearest)) {
                nearest = pos;
            }
        }
        return nearest;
    }

    /**
     * @param index
     *            result of an {@code indexOf} call
     * @param limit
     *            largest index that is still accepted
     * @return {@code true} if nothing was found or the match lies behind the
     *         limit
     */
    private static boolean isOutside(int index, int limit) {
        return index < 0 || index > limit;
    }
}
