package org.aksw.simba.topicmodeling.io.xml;

import java.util.ArrayList;
import java.util.List;
import org.aksw.simba.topicmodeling.utils.doc.Document;
import org.aksw.simba.topicmodeling.utils.doc.DocumentMultipleCategories;
import org.aksw.simba.topicmodeling.utils.doc.DocumentText;
import org.aksw.simba.topicmodeling.utils.doc.ParseableDocumentProperty;
import org.aksw.simba.topicmodeling.utils.doc.ner.NamedEntitiesInText;
import org.aksw.simba.topicmodeling.utils.doc.ner.NamedEntityInText;
import org.aksw.simba.topicmodeling.utils.doc.ner.SignedNamedEntityInText;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/aksw/simba/topicmodeling/io/xml/AbstractDocumentXmlReader.class */
/**
 * Base class for {@link XMLParserObserver} implementations that assemble
 * {@link Document} objects from the opening-tag, closing-tag and character-data
 * events of a corpus XML file.
 *
 * <p>The reader keeps the document currently being built plus a few buffers
 * (text, named entities, categories). Whenever the closing document tag is
 * seen, the document is handed to {@link #finishedDocument(Document)}.
 *
 * <p>Instances hold mutable parsing state in plain fields and perform no
 * synchronization.
 */
public abstract class AbstractDocumentXmlReader implements XMLParserObserver {
    private static final Logger LOGGER = LoggerFactory.getLogger(AbstractDocumentXmlReader.class);

    /** Start of the id attribute inside a document tag, including the opening quote. */
    private static final String ID_ATTRIBUTE_PREFIX = " id=\"";

    /** Document currently being assembled, or {@code null} outside of a document element. */
    private Document currentDocument;
    /** Named entity whose element is currently open, or {@code null}. */
    private NamedEntityInText currentNamedEntity;
    /** Named entities collected for the text that is currently being read. */
    private final List<NamedEntityInText> namedEntities = new ArrayList<NamedEntityInText>();
    /** Categories collected since the last categories element was closed. */
    private final List<String> categories = new ArrayList<String>();
    /** Accumulates the document text from text parts and named entity elements. */
    private final StringBuilder textBuffer = new StringBuilder();
    /** Character data most recently reported by the parser; never {@code null} initially. */
    private String data = "";

    /**
     * Handles an opening tag. A document tag starts a new {@link Document}
     * (reading its id attribute if present); a named entity tag starts a new
     * named entity positioned at the current end of the text buffer. All other
     * tags only reset the pending character data.
     *
     * @param str the tag content, i.e. the tag name optionally followed by a
     *            space and the attributes
     */
    @Override
    public void handleOpeningTag(String str) {
        this.data = "";
        int nameEnd = str.indexOf(' ');
        String tagName = nameEnd == -1 ? str : str.substring(0, nameEnd);
        if (tagName.equals(CorpusXmlTagHelper.DOCUMENT_TAG_NAME)) {
            startDocument(str);
        } else if (isNamedEntityTag(tagName)) {
            this.currentNamedEntity = parseNamedEntityInText(str);
            // parseNamedEntityInText returns null for tags it cannot parse.
            if (this.currentNamedEntity != null) {
                this.currentNamedEntity.setStartPos(this.textBuffer.length());
            }
        }
    }

    /**
     * Handles a closing tag by moving the buffered data into the current
     * document, the text buffer or the category list, depending on the tag.
     * The closing document tag passes the current document (which is
     * {@code null} if no document tag was opened) to
     * {@link #finishedDocument(Document)}.
     *
     * @param str the name of the tag that has been closed
     */
    @Override
    public void handleClosingTag(String str) {
        if (str.equals(CorpusXmlTagHelper.DOCUMENT_TAG_NAME)) {
            finishedDocument(this.currentDocument);
            this.currentDocument = null;
        } else if (str.equals(CorpusXmlTagHelper.TEXT_WITH_NAMED_ENTITIES_TAG_NAME) && this.currentDocument != null) {
            finishTextWithNamedEntities();
        } else if (str.equals(CorpusXmlTagHelper.TEXT_PART_TAG_NAME)) {
            this.textBuffer.append(this.data);
            this.data = "";
        } else if (isNamedEntityTag(str)) {
            finishNamedEntity();
        } else if (str.equals(CorpusXmlTagHelper.DOCUMENT_CATEGORIES_TAG_NAME)) {
            finishCategories();
        } else if (str.equals(CorpusXmlTagHelper.DOCUMENT_CATEGORIES_SINGLE_CATEGORY_TAG_NAME)) {
            this.categories.add(this.data);
            this.data = "";
        } else if (this.currentDocument != null) {
            finishParseableProperty(str);
        }
    }

    /**
     * Stores the given character data, replacing any data stored before.
     *
     * @param str the character data reported by the parser
     */
    @Override
    public void handleData(String str) {
        this.data = str;
    }

    /**
     * Empty tags carry no information for this reader and are ignored.
     *
     * @param str the content of the empty tag
     */
    @Override
    public void handleEmptyTag(String str) {
    }

    /**
     * Creates a named entity from the attributes of a named entity tag. Start
     * position and length of the result are both {@code -1}. If the tag has a
     * source attribute a {@link SignedNamedEntityInText} is created, otherwise
     * a plain {@link NamedEntityInText}.
     *
     * @param str the complete tag content (tag name and attributes)
     * @return the named entity, or {@code null} if the attributes could not be
     *         parsed (the failure is logged)
     */
    protected NamedEntityInText parseNamedEntityInText(String str) {
        String uri = null;
        String source = null;
        try {
            int keyStart = str.indexOf(' ') + 1;
            int equalsPos = str.indexOf('=', keyStart);
            while (equalsPos > 0) {
                String key = str.substring(keyStart, equalsPos).trim();
                int openingQuote = str.indexOf('"', equalsPos);
                int closingQuote = str.indexOf('"', openingQuote + 1);
                // Throws for a missing quote; handled by the catch block below.
                String value = str.substring(openingQuote + 1, closingQuote);
                if (key.equals(CorpusXmlTagHelper.URI_ATTRIBUTE_NAME)) {
                    uri = value;
                } else if (key.equals(CorpusXmlTagHelper.SOURCE_ATTRIBUTE_NAME)) {
                    source = value;
                }
                keyStart = closingQuote + 1;
                equalsPos = str.indexOf('=', keyStart);
            }
            if (source != null) {
                return new SignedNamedEntityInText(-1, -1, uri, source);
            }
            return new NamedEntityInText(-1, -1, uri);
        } catch (Exception e) {
            LOGGER.error("Couldn't parse NamedEntityInText tag (" + str + "). Returning null.", e);
            return null;
        }
    }

    /**
     * Registers a property class with {@link CorpusXmlTagHelper} by delegating
     * to its method of the same name.
     *
     * @param cls the property class to register
     */
    public static void registerParseableDocumentProperty(Class<? extends ParseableDocumentProperty> cls) {
        CorpusXmlTagHelper.registerParseableDocumentProperty(cls);
    }

    /**
     * Called for every closing document tag.
     *
     * @param document the document that has been read; {@code null} if the
     *                 closing tag was not preceded by an opening document tag
     */
    protected abstract void finishedDocument(Document document);

    /** Returns whether the given tag name is one of the two named entity tag names. */
    private static boolean isNamedEntityTag(String tagName) {
        return tagName.equals(CorpusXmlTagHelper.NAMED_ENTITY_IN_TEXT_TAG_NAME)
                || tagName.equals(CorpusXmlTagHelper.SIGNED_NAMED_ENTITY_IN_TEXT_TAG_NAME);
    }

    /** Starts a new current document and sets its id from the tag's id attribute, if any. */
    private void startDocument(String tag) {
        this.currentDocument = new Document();
        int attributePos = tag.indexOf(ID_ATTRIBUTE_PREFIX);
        if (attributePos <= 0) {
            return;
        }
        int valueStart = attributePos + ID_ATTRIBUTE_PREFIX.length();
        // Search from the first value character so that the closing quote of
        // an empty value (id="") is found instead of being skipped.
        int valueEnd = tag.indexOf('"', valueStart);
        if (valueEnd < 0) {
            LOGGER.warn("Found a document tag without a document id attribute.");
            return;
        }
        try {
            this.currentDocument.setDocumentId(Integer.parseInt(tag.substring(valueStart, valueEnd)));
        } catch (NumberFormatException e) {
            LOGGER.warn("Coudln't parse the document id from the document tag.", e);
        }
    }

    /** Moves the buffered text and named entities into the current document and resets both buffers. */
    private void finishTextWithNamedEntities() {
        this.currentDocument.addProperty(new DocumentText(this.textBuffer.toString()));
        this.textBuffer.delete(0, this.textBuffer.length());
        // NOTE(review): the NamedEntitiesInText constructor is not visible here.
        // A snapshot is passed so that the clear() below cannot empty the
        // property's list should the constructor keep the given reference.
        List<NamedEntityInText> snapshot = new ArrayList<NamedEntityInText>(this.namedEntities);
        this.currentDocument.addProperty(new NamedEntitiesInText(snapshot));
        this.namedEntities.clear();
    }

    /** Completes the open named entity (if any) and appends its text to the text buffer. */
    private void finishNamedEntity() {
        if (this.currentNamedEntity != null) {
            this.currentNamedEntity.setLength(this.data.length());
            this.namedEntities.add(this.currentNamedEntity);
            this.currentNamedEntity = null;
        }
        // The text is kept even if the entity tag could not be parsed, so the
        // document text does not lose the characters inside the element.
        this.textBuffer.append(this.data);
        this.data = "";
    }

    /** Adds the collected categories to the current document (if there is one) and clears them. */
    private void finishCategories() {
        if (this.currentDocument != null) {
            String[] categoryArray = this.categories.toArray(new String[this.categories.size()]);
            this.currentDocument.addProperty(new DocumentMultipleCategories(categoryArray));
        }
        this.categories.clear();
    }

    /**
     * Creates the property registered for the given tag name from the buffered
     * data and adds it to the current document. Tags without a registered
     * property class only reset the buffered data. Must only be called while a
     * current document exists.
     */
    private void finishParseableProperty(String tagName) {
        Class<? extends ParseableDocumentProperty> propertyClass =
                CorpusXmlTagHelper.getParseableDocumentPropertyClassForTagName(tagName);
        if (propertyClass != null) {
            try {
                ParseableDocumentProperty property;
                try {
                    // Preferred: a constructor taking the raw value directly.
                    property = propertyClass.getConstructor(String.class).newInstance(this.data);
                } catch (NoSuchMethodException e) {
                    // Fallback: default constructor followed by parseValue.
                    property = propertyClass.getConstructor().newInstance();
                    property.parseValue(this.data);
                }
                this.currentDocument.addProperty(property);
            } catch (Exception e) {
                LOGGER.error("Couldn't parse property " + propertyClass + " from the String \"" + this.data + "\".", e);
            }
        }
        this.data = "";
    }
}
