package org.dice_research.topicmodeling.wikipedia;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import org.apache.commons.io.Charsets;
import org.dice_research.topicmodeling.io.xml.XMLParserObserver;
import org.dice_research.topicmodeling.io.xml.stream.SimpleReaderBasedXMLParser;
import org.dice_research.topicmodeling.preprocessing.docsupplier.AbstractDocumentSupplier;
import org.dice_research.topicmodeling.utils.doc.Document;
import org.dice_research.topicmodeling.utils.doc.DocumentName;
import org.dice_research.topicmodeling.utils.doc.DocumentText;
import org.dice_research.topicmodeling.wikipedia.doc.WikipediaArticleId;
import org.dice_research.topicmodeling.wikipedia.doc.WikipediaNamespace;
import org.dice_research.topicmodeling.wikipedia.doc.WikipediaRedirect;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/dice_research/topicmodeling/wikipedia/WikipediaDumpReader.class */
/**
 * Document supplier that turns the {@code <page>} elements of a Wikipedia XML
 * dump into {@link Document} objects, one page per call of
 * {@link #getNextDocument()}.
 *
 * <p>
 * The instance registers itself as observer of a
 * {@link SimpleReaderBasedXMLParser} and builds the current document from the
 * parser callbacks:
 * <ul>
 * <li>{@code <title>} becomes a {@link DocumentName},</li>
 * <li>{@code <text>} becomes a {@link DocumentText},</li>
 * <li>an {@code <id>} that closes while {@code <page>} (and not
 * {@code <revision>}) was the last id-containing tag opened becomes a
 * {@link WikipediaArticleId},</li>
 * <li>{@code <ns>} becomes a {@link WikipediaNamespace},</li>
 * <li>an empty {@code <redirect title="..."/>} tag becomes a
 * {@link WikipediaRedirect}.</li>
 * </ul>
 *
 * <p>
 * Instances are created with one of the {@code createReader} factory methods.
 */
public class WikipediaDumpReader extends AbstractDocumentSupplier implements XMLParserObserver {
    private static final Logger LOGGER = LoggerFactory.getLogger(WikipediaDumpReader.class);
    private static final String DOCUMENT_XML_TAG_NAME = "page";
    private static final String TITLE_XML_TAG_NAME = "title";
    private static final String TEXT_XML_TAG_NAME = "text";
    private static final String ARTICLE_ID_XML_TAG_NAME = "id";
    private static final String REVISION_XML_TAG_NAME = "revision";
    private static final String REDIRECT_XML_TAG_NAME = "redirect";
    private static final String NAMESPACE_XML_TAG_NAME = "ns";
    /** Start of the attribute that carries the target title inside a redirect tag. */
    private static final String REDIRECT_TITLE_ATTRIBUTE_START = "title=\"";

    /** Reader on the dump; set to {@code null} after it has been closed. */
    private Reader dumpReader;
    /** Parser that reports tags and character data back to this object. */
    private final SimpleReaderBasedXMLParser xmlParser;
    /**
     * Character data most recently reported by the parser.
     *
     * NOTE(review): this buffer is never cleared, so an element for which the
     * parser reports no data would be given the data of an earlier element -
     * confirm against the parser's callback contract.
     */
    private String data;
    /** Document that is currently being assembled, or {@code null} if there is none. */
    private Document document;
    /**
     * Name of the last opened tag that may contain an {@code <id>} child
     * ({@code page} or {@code revision}); used to tell article ids from
     * revision ids.
     */
    private String lastIdContainingTag;

    /**
     * Creates a reader for the dump file with the given name, decoding it as
     * UTF-8.
     *
     * @param str
     *            name of the dump file
     * @return the reader working on the given file
     * @throws FileNotFoundException
     *             if the file cannot be opened for reading
     */
    public static WikipediaDumpReader createReader(String str) throws FileNotFoundException {
        return createReader(new File(str));
    }

    /**
     * Creates a reader for the dump file with the given name.
     *
     * @param str
     *            name of the dump file
     * @param charset
     *            charset used to decode the file
     * @return the reader working on the given file
     * @throws FileNotFoundException
     *             if the file cannot be opened for reading
     */
    public static WikipediaDumpReader createReader(String str, Charset charset) throws FileNotFoundException {
        return createReader(new File(str), charset);
    }

    /**
     * Creates a reader for the given dump file, decoding it as UTF-8.
     *
     * @param file
     *            the dump file
     * @return the reader working on the given file
     * @throws FileNotFoundException
     *             if the file cannot be opened for reading
     */
    public static WikipediaDumpReader createReader(File file) throws FileNotFoundException {
        return createReader(file, StandardCharsets.UTF_8);
    }

    /**
     * Creates a reader for the given dump file.
     *
     * @param file
     *            the dump file
     * @param charset
     *            charset used to decode the file
     * @return the reader working on the given file
     * @throws FileNotFoundException
     *             if the file cannot be opened for reading
     */
    public static WikipediaDumpReader createReader(File file, Charset charset) throws FileNotFoundException {
        return createReader(new FileInputStream(file), charset);
    }

    /**
     * Creates a reader for a dump that is available as stream. The stream is
     * closed by {@link #getNextDocument()} once no further document could be
     * read.
     *
     * @param inputStream
     *            stream providing the XML dump
     * @param charset
     *            charset used to decode the stream
     * @return the reader working on the given stream
     */
    public static WikipediaDumpReader createReader(InputStream inputStream, Charset charset) {
        return new WikipediaDumpReader(new InputStreamReader(inputStream, charset));
    }

    private WikipediaDumpReader(Reader reader) {
        this.dumpReader = reader;
        this.xmlParser = new SimpleReaderBasedXMLParser(reader, this);
    }

    /**
     * Runs the parser until it has been stopped by a closing {@code page} tag
     * (or returns on its own) and hands out the document assembled meanwhile.
     *
     * <p>
     * If no document was assembled, the underlying reader is closed and this
     * and all following calls return {@code null}.
     *
     * @return the next document of the dump or {@code null} if there is none
     */
    public Document getNextDocument() {
        if (this.dumpReader == null) {
            // The reader has already been closed by an earlier call.
            return null;
        }
        this.xmlParser.parse();
        if (this.document == null) {
            closeDumpReader();
            return null;
        }
        Document nextDocument = this.document;
        // Make sure the same document is not handed out twice.
        this.document = null;
        if (nextDocument.getDocumentId() % 1000 == 999) {
            LOGGER.info("Read the {}th document from the dump.", nextDocument.getDocumentId() + 1);
        }
        return nextDocument;
    }

    /** Closes the dump reader, logging a failure instead of propagating it. */
    private void closeDumpReader() {
        try {
            this.dumpReader.close();
        } catch (IOException e) {
            LOGGER.error("Error while closing the file reader used for reading the wikipedia dump file.", e);
        }
        this.dumpReader = null;
    }

    /**
     * Starts a new document when a {@code page} tag opens and keeps track of
     * whether {@code page} or {@code revision} was opened last.
     *
     * @param str
     *            the opening tag as reported by the parser
     */
    public void handleOpeningTag(String str) {
        if (str.startsWith(DOCUMENT_XML_TAG_NAME)) {
            this.document = new Document(getNextDocumentId());
            this.lastIdContainingTag = DOCUMENT_XML_TAG_NAME;
        } else if (str.startsWith(REVISION_XML_TAG_NAME)) {
            this.lastIdContainingTag = REVISION_XML_TAG_NAME;
        }
    }

    /**
     * Stops the parser when a {@code page} tag closes; for the other known
     * tags the most recently reported character data is added to the current
     * document as the matching property. Tags that are found while there is no
     * current document, or whose numeric content cannot be parsed, are logged
     * and ignored.
     *
     * @param str
     *            the closing tag as reported by the parser
     */
    public void handleClosingTag(String str) {
        if (str.startsWith(DOCUMENT_XML_TAG_NAME)) {
            // The page is complete; getNextDocument() picks the document up.
            this.xmlParser.stop();
        } else if (str.startsWith(TITLE_XML_TAG_NAME)) {
            if (this.document == null) {
                LOGGER.error("Found a title tag while there is no document object. Ignoring this title.");
            } else {
                this.document.addProperty(new DocumentName(this.data));
            }
        } else if (str.startsWith(TEXT_XML_TAG_NAME)) {
            if (this.document == null) {
                LOGGER.error("Found a text tag while there is no document object. Ignoring this text.");
            } else {
                this.document.addProperty(new DocumentText(this.data));
            }
        } else if (str.startsWith(ARTICLE_ID_XML_TAG_NAME)
                && DOCUMENT_XML_TAG_NAME.equals(this.lastIdContainingTag)) {
            // Only ids directly following the page tag are article ids; ids
            // seen after a revision tag has been opened are skipped.
            addArticleId();
        } else if (str.startsWith(NAMESPACE_XML_TAG_NAME)) {
            addNamespace();
        }
    }

    /** Adds the buffered data as article id to the current document. */
    private void addArticleId() {
        if (this.document == null) {
            LOGGER.error("Found an article id tag while there is no document object. Ignoring this id.");
            return;
        }
        try {
            this.document.addProperty(new WikipediaArticleId(Integer.parseInt(this.data)));
        } catch (NumberFormatException e) {
            LOGGER.error("Found an article id tag but couldn't parse the id. Ignoring this id.", e);
        }
    }

    /** Adds the buffered data as namespace id to the current document. */
    private void addNamespace() {
        if (this.document == null) {
            LOGGER.error("Found a namespace tag while there is no document object. Ignoring this namespace.");
            return;
        }
        try {
            this.document.addProperty(new WikipediaNamespace(Integer.parseInt(this.data)));
        } catch (NumberFormatException e) {
            LOGGER.error("Found a namespace tag but couldn't parse the id. Ignoring this namespace.", e);
        }
    }

    /**
     * Remembers the given character data so that the next closing tag of
     * interest can use it.
     *
     * @param str
     *            the character data reported by the parser
     */
    public void handleData(String str) {
        this.data = str;
    }

    /**
     * Handles empty tags; only {@code redirect} tags are of interest. The
     * value of their {@code title} attribute is added to the current document
     * as {@link WikipediaRedirect}. If no non-empty title can be extracted, a
     * warning is logged and an empty title is used.
     *
     * @param str
     *            the empty tag as reported by the parser
     */
    public void handleEmptyTag(String str) {
        if (!str.startsWith(REDIRECT_XML_TAG_NAME)) {
            return;
        }
        if (this.document == null) {
            LOGGER.error("Found a redirect tag while there is no document object. Ignoring this redirect.");
            return;
        }
        String redirectTitle = extractRedirectTitle(str);
        if (redirectTitle == null) {
            LOGGER.warn("Found a redirect tag but couldn't parse the title (tag=\"{}\"). Adding an empty title instead.",
                    str);
            redirectTitle = "";
        }
        this.document.addProperty(new WikipediaRedirect(redirectTitle));
    }

    /**
     * Extracts the value of the {@code title} attribute from the given tag.
     *
     * @param tag
     *            the content of the redirect tag
     * @return the non-empty attribute value, or {@code null} if the attribute
     *         is missing, not terminated by a quote, or empty
     */
    private static String extractRedirectTitle(String tag) {
        int attributeStart = tag.indexOf(REDIRECT_TITLE_ATTRIBUTE_START);
        if (attributeStart <= 0) {
            return null;
        }
        int valueStart = attributeStart + REDIRECT_TITLE_ATTRIBUTE_START.length();
        int valueEnd = tag.indexOf('"', valueStart);
        // valueEnd == valueStart means an empty title, which is treated like
        // a missing one.
        return (valueEnd > valueStart) ? tag.substring(valueStart, valueEnd) : null;
    }
}
