package org.dbpedia.extraction.sources;

import java.io.IOException;
import java.io.InputStream;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.dbpedia.extraction.util.Language;
import org.dbpedia.extraction.wikiparser.WikiTitle;
import org.dbpedia.util.Exceptions;
import org.dbpedia.util.text.xml.XMLStreamUtils;
import scala.Function1;
import scala.Option;
import scala.util.control.ControlThrowable;

/* loaded from: input_file:org/dbpedia/extraction/sources/WikipediaDumpParser.class */
/**
 * Streaming (StAX) parser for a MediaWiki XML dump.
 *
 * <p>The parser walks the {@code <mediawiki>} root element, derives the wiki
 * {@link Language} from the {@code <siteinfo>} header and then hands every
 * accepted, non-redirect page to the processor function as a {@link WikiPage}.
 *
 * <p>An instance keeps the current {@link XMLStreamReader} in a mutable field
 * while {@link #run()} executes, so a single instance must not be run from
 * several threads at once.
 */
public class WikipediaDumpParser {
    private static final Logger logger = Logger.getLogger(WikipediaDumpParser.class.getName());

    /** Namespace passed to the {@link XMLStreamUtils} checks; {@code null} is used for every element. */
    private static final String MEDIAWIKI_NS = null;
    private static final String ROOT_ELEM = "mediawiki";
    private static final String SITEINFO_ELEM = "siteinfo";
    private static final String PAGE_ELEM = "page";
    private static final String TITLE_ELEM = "title";
    private static final String REDIRECT_ELEM = "redirect";
    private static final String ID_ELEM = "id";
    private static final String REVISION_ELEM = "revision";
    private static final String TEXT_ELEM = "text";

    private final InputStream _stream;
    private final Function1<WikiPage, ?> _processor;
    /** Page filter; {@code null} means "accept every page". */
    private final Function1<WikiTitle, Boolean> _filter;
    /** Reader of the dump currently being parsed; only non-null while {@link #run()} is executing. */
    private XMLStreamReader _reader;

    /**
     * Creates a parser for one dump.
     *
     * @param inputStream the XML dump; this class does not close it
     * @param function1 page processor, called once per accepted, non-redirect page
     * @param function12 page filter; only titles for which it returns {@code true} are
     *        processed. May be {@code null}, in which case every page is accepted.
     * @throws NullPointerException if {@code inputStream} or {@code function1} is {@code null}
     */
    public WikipediaDumpParser(InputStream inputStream, Function1<WikiPage, ?> function1, Function1<WikiTitle, Boolean> function12) {
        if (inputStream == null) {
            throw new NullPointerException("file");
        }
        if (function1 == null) {
            throw new NullPointerException("processor");
        }
        this._stream = inputStream;
        this._processor = function1;
        this._filter = function12;
    }

    /**
     * Parses the whole dump, invoking the processor for each accepted page.
     *
     * <p>The XML reader is always closed and released afterwards. If parsing fails and
     * closing the reader fails as well, the parsing failure is propagated and the close
     * failure is attached to it as a suppressed exception.
     *
     * @throws IOException declared for callers; not thrown directly by this method
     * @throws XMLStreamException if the dump is malformed or does not have the expected layout
     * @throws InterruptedException if the processor signals interruption
     */
    public void run() throws IOException, XMLStreamException, InterruptedException {
        XMLInputFactory factory = XMLInputFactory.newInstance();
        // Dumps may come from untrusted sources: never resolve DTDs or external entities (XXE).
        factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);

        XMLStreamReader reader = factory.createXMLStreamReader(this._stream, "UTF-8");
        this._reader = reader;
        Throwable failure = null;
        try {
            readDump();
        } catch (Throwable th) {
            failure = th;
            throw th;
        } finally {
            this._reader = null;
            try {
                reader.close();
            } catch (XMLStreamException closeError) {
                if (failure == null) {
                    throw closeError;
                }
                // Keep the original parsing failure as the primary exception.
                failure.addSuppressed(closeError);
            }
        }
    }

    /**
     * Reads the root element: the site info header followed by all pages.
     */
    private void readDump() throws XMLStreamException, InterruptedException {
        this._reader.nextTag();
        XMLStreamUtils.requireStartElement(this._reader, MEDIAWIKI_NS, ROOT_ELEM);
        this._reader.nextTag();
        Language language = readSiteInfo();
        readPages(language);
        XMLStreamUtils.requireEndElement(this._reader, MEDIAWIKI_NS, ROOT_ELEM);
    }

    /**
     * Reads the {@code <siteinfo>} element and derives the wiki language from it.
     *
     * <p>The element is expected to contain exactly five children. The text of the second
     * child (presumably the wiki's base URL) is used to determine the language; the other
     * four are skipped. On return the reader has been advanced to the tag following
     * {@code </siteinfo>}.
     *
     * @return the language of the dump
     * @throws XMLStreamException if the layout is unexpected or no language can be derived
     */
    private Language readSiteInfo() throws XMLStreamException {
        XMLStreamUtils.requireStartElement(this._reader, MEDIAWIKI_NS, SITEINFO_ELEM);

        // First child: not needed.
        this._reader.nextTag();
        XMLStreamUtils.skipElement(this._reader);

        // Second child: its text carries the language code.
        this._reader.nextTag();
        String baseUrl = this._reader.getElementText();
        this._reader.nextTag();
        Language language = parseLanguage(baseUrl);

        // Remaining three children: not needed. skipElement presumably leaves the reader on
        // the matching end tag, so nextTag() moves on to the following sibling.
        XMLStreamUtils.skipElement(this._reader);
        this._reader.nextTag();
        XMLStreamUtils.skipElement(this._reader);
        this._reader.nextTag();
        XMLStreamUtils.skipElement(this._reader);
        this._reader.nextTag();

        XMLStreamUtils.requireEndElement(this._reader, MEDIAWIKI_NS, SITEINFO_ELEM);
        this._reader.nextTag();
        return language;
    }

    /**
     * Extracts the wiki code from a base URL and resolves it to a {@link Language}.
     *
     * <p>The code is the text between the position two characters after the first
     * {@code '/'} and the first {@code '.'}, e.g. {@code "en"} in
     * {@code "http://en.wikipedia.org/wiki/Main_Page"}. The code {@code "commons"}
     * (any case) is mapped to the language for {@code "en"}.
     *
     * @throws XMLStreamException if the text has no such section or the code is unknown
     */
    private static Language parseLanguage(String baseUrl) throws XMLStreamException {
        int slash = baseUrl.indexOf('/');
        int dot = baseUrl.indexOf('.');
        int start = slash + 2;
        if (slash < 0 || dot < start) {
            throw new XMLStreamException("Cannot determine wiki language from base URL: '" + baseUrl + "'");
        }
        String wikiCode = baseUrl.substring(start, dot);

        Option<Language> language = "commons".equalsIgnoreCase(wikiCode)
                ? Language.fromWikiCode("en")
                : Language.fromWikiCode(wikiCode);
        if (language.isEmpty()) {
            throw new XMLStreamException("Invalid wiki language code: '" + wikiCode + "'");
        }
        return language.get();
    }

    /**
     * Reads consecutive {@code <page>} elements until a different tag is reached.
     */
    private void readPages(Language language) throws XMLStreamException, InterruptedException {
        while (XMLStreamUtils.isStartElement(this._reader, MEDIAWIKI_NS, PAGE_ELEM)) {
            readPage(language);
            this._reader.nextTag();
        }
    }

    /**
     * Reads one {@code <page>} element. Pages whose title cannot be parsed or is rejected
     * by the filter are skipped; otherwise the page is passed to the processor unless
     * {@link #readRevision} returned {@code null} for its last revision (or it has none).
     */
    private void readPage(Language language) throws XMLStreamException {
        this._reader.nextTag();
        XMLStreamUtils.requireStartElement(this._reader, MEDIAWIKI_NS, TITLE_ELEM);
        String titleText = this._reader.getElementText();
        this._reader.nextTag();

        WikiTitle title = parseTitle(titleText, language);
        if (title == null || !accepts(title)) {
            skipToPageEnd();
            return;
        }

        // NOTE(review): the element following <title> is taken to be the page <id> without
        // checking its name - confirm this matches the dump format versions in use.
        long pageId = Long.parseLong(this._reader.getElementText());

        WikiPage page = null;
        while (this._reader.nextTag() == XMLStreamReader.START_ELEMENT) {
            if (XMLStreamUtils.isStartElement(this._reader, MEDIAWIKI_NS, REVISION_ELEM)) {
                // If there are several revisions, the last one wins.
                page = readRevision(title, pageId);
            } else {
                XMLStreamUtils.skipElement(this._reader);
            }
        }

        if (page != null) {
            process(page, title);
        }
    }

    /**
     * Parses a page title, logging and returning {@code null} on failure so that one bad
     * title does not abort the whole dump.
     */
    private static WikiTitle parseTitle(String titleText, Language language) {
        try {
            return WikiTitle.parse(titleText, language);
        } catch (Exception e) {
            logger.log(Level.WARNING, "Error parsing title: " + titleText, e);
            return null;
        }
    }

    /** Returns whether the title passes the filter; a {@code null} filter accepts everything. */
    private boolean accepts(WikiTitle title) {
        return this._filter == null || this._filter.apply(title);
    }

    /** Advances the reader to the end tag of the current {@code <page>} element. */
    private void skipToPageEnd() throws XMLStreamException {
        // getLocalName() is only consulted on an end tag (short-circuit &&).
        while (!(this._reader.getEventType() == XMLStreamReader.END_ELEMENT
                && PAGE_ELEM.equals(this._reader.getLocalName()))) {
            this._reader.next();
        }
    }

    /**
     * Passes a page to the processor. Ordinary exceptions are logged so that parsing
     * continues with the next page; Scala control-flow throwables and interruption are
     * rethrown to the caller.
     */
    private void process(WikiPage page, WikiTitle title) {
        try {
            this._processor.apply(page);
        } catch (Exception e) {
            if (e instanceof ControlThrowable || e instanceof InterruptedException) {
                throw Exceptions.unchecked(e);
            }
            logger.log(Level.WARNING, "Error processing page  " + title, e);
        }
    }

    /**
     * Reads one {@code <revision>} element.
     *
     * @param wikiTitle title of the enclosing page
     * @param j id of the enclosing page
     * @return the page built from the revision's id and text, or {@code null} if the
     *         revision contains a {@code <redirect>} element
     */
    private WikiPage readRevision(WikiTitle wikiTitle, long j) throws XMLStreamException {
        String text = null;
        long revisionId = -1;
        boolean redirect = false;
        while (this._reader.nextTag() == XMLStreamReader.START_ELEMENT) {
            if (XMLStreamUtils.isStartElement(this._reader, MEDIAWIKI_NS, TEXT_ELEM)) {
                text = this._reader.getElementText();
            } else if (XMLStreamUtils.isStartElement(this._reader, MEDIAWIKI_NS, REDIRECT_ELEM)) {
                redirect = true;
                XMLStreamUtils.skipElement(this._reader);
            } else if (XMLStreamUtils.isStartElement(this._reader, MEDIAWIKI_NS, ID_ELEM)) {
                revisionId = Long.parseLong(this._reader.getElementText());
            } else {
                XMLStreamUtils.skipElement(this._reader);
            }
        }
        if (redirect) {
            return null;
        }
        return new WikiPage(wikiTitle, j, revisionId, text);
    }
}
