package edu.umd.cloud9.collection.wikipedia;

import edu.umd.cloud9.collection.Indexable;
import info.bliki.wiki.filter.PlainTextConverter;
import info.bliki.wiki.model.WikiModel;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.io.WritableUtils;

/* loaded from: input_file:edu/umd/cloud9/collection/wikipedia/WikipediaPage.class */
/**
 * A Wikipedia page held as its raw {@code <page>} XML, with the title, id and
 * the location of the {@code <text>} element extracted by
 * {@link #readPage(WikipediaPage, String)}.
 *
 * <p>The page is serialized as a variable-length int (byte count) followed by
 * the UTF-8 bytes of the raw XML.
 */
public class WikipediaPage extends Indexable {
    public static final String XML_START_TAG = "<page>";
    public static final String XML_END_TAG = "</page>";

    /** Charset name used by {@link #write} and {@link #readFields}. */
    private static final String PAGE_ENCODING = "UTF-8";

    /** Opening tag that marks the start of the wiki markup inside the page XML. */
    private static final String TEXT_OPEN_TAG = "<text xml:space=\"preserve\">";
    private static final String TEXT_CLOSE_TAG = "</text>";
    private static final String REDIRECT_MARKER = "#REDIRECT";

    // Block delimiters: "{{" ... "}}" (templates) and "{|" ... "|}" (MediaWiki table markup).
    private static final Pattern TEMPLATE_OPEN = Pattern.compile("\\{\\{");
    private static final Pattern TEMPLATE_CLOSE = Pattern.compile("\\}\\}");
    private static final Pattern TABLE_OPEN = Pattern.compile("\\{\\|");
    private static final Pattern TABLE_CLOSE = Pattern.compile("\\|\\}");

    private static final Pattern LINK_OPEN = Pattern.compile("\\[\\[");
    private static final Pattern LINK_CLOSE = Pattern.compile("\\]\\]");
    private static final Pattern LONE_BULLET = Pattern.compile("\\*");

    /** Removed from every kept line, in exactly this order (longer quote runs first). */
    private static final Pattern[] STRIP_PATTERNS = {
        Pattern.compile("```"),
        Pattern.compile("\\'\\'\\'"),
        Pattern.compile("``"),
        Pattern.compile("\\'\\'"),
        Pattern.compile("&quot"),
        Pattern.compile("\\[http.+\\]"),
        Pattern.compile("!--.+--"),
    };

    private static final Pattern PIPED_TOKEN = Pattern.compile(" (\\S)+\\|(\\S)+ ");

    /** A line is dropped when any of these matches the <em>entire</em> line. */
    private static final Pattern[] LINE_REJECT_PATTERNS = {
        Pattern.compile("^\\*.*"),
        Pattern.compile("&lt;.*"),
        Pattern.compile("&gt;.*"),
        Pattern.compile("1\\s+"),
        Pattern.compile("\\w\\w:"),
        Pattern.compile("^\\s*"),
        Pattern.compile("\\=+.+\\=+"),
        Pattern.compile("^\\|\\-.+"),
        Pattern.compile("Kategorie:.+"),
        Pattern.compile("\\w\\w:.+"),
    };

    private String mPage;
    private String mTitle;
    private String mId;
    private int mTextStart;
    private int mTextEnd;
    private boolean mIsRedirect;
    private boolean mIsDisambig;
    private boolean mIsStub;
    private final WikiModel mWikiModel = new WikiModel("", "");
    private final PlainTextConverter mTextConverter = new PlainTextConverter();

    /**
     * Writes the raw page XML as a VInt byte count followed by its UTF-8 bytes.
     *
     * @param dataOutput sink to serialize to
     * @throws IOException if writing fails
     */
    public void write(DataOutput dataOutput) throws IOException {
        // Explicit charset: the platform default differs between JVMs and would
        // corrupt non-ASCII titles/text when writer and reader disagree.
        byte[] bytes = this.mPage.getBytes(PAGE_ENCODING);
        WritableUtils.writeVInt(dataOutput, bytes.length);
        dataOutput.write(bytes, 0, bytes.length);
    }

    /**
     * Reads a page previously produced by {@link #write} and re-parses it via
     * {@link #readPage(WikipediaPage, String)}.
     *
     * <p>NOTE(review): data written by an older build on a JVM whose default
     * charset was not UTF-8 will decode differently here.
     *
     * @param dataInput source to deserialize from
     * @throws IOException if reading fails
     */
    public void readFields(DataInput dataInput) throws IOException {
        int length = WritableUtils.readVInt(dataInput);
        byte[] bytes = new byte[length];
        dataInput.readFully(bytes, 0, length);
        readPage(this, new String(bytes, PAGE_ENCODING));
    }

    /** Returns the page id extracted from the first {@code <id>} element. */
    @Override // edu.umd.cloud9.collection.Indexable
    public String getDocid() {
        return this.mId;
    }

    /**
     * Returns the title, a newline, and the wiki markup rendered through the
     * plain-text converter, with a few leftover entities, {@code {{...}}} spans
     * and reference tags stripped.
     */
    @Override // edu.umd.cloud9.collection.Indexable
    public String getContent() {
        String rendered;
        this.mWikiModel.setUp();
        try {
            rendered = getTitle() + "\n" + this.mWikiModel.render(this.mTextConverter, getWikiMarkup());
        } finally {
            // Always tear down so a failed render does not leave the reused model set up.
            this.mWikiModel.tearDown();
        }
        return rendered.replace("&amp;nbsp;", " ").replaceAll("&lt;references */&gt;", "").replaceAll("\\{\\{.*?\\}\\}", "").replaceAll("&#60;ref name.*?&#60;/ref&#62;", "").replaceAll("&lt;ref&gt;http:.*?&lt;/ref&gt;", "");
    }

    /**
     * Returns the title wrapped in {@code <h1>} followed by the wiki markup
     * rendered by the wiki model, with leftover entities, {@code {{...}}} spans
     * and named reference tags stripped.
     */
    @Override // edu.umd.cloud9.collection.Indexable
    public String getDisplayContent() {
        String rendered;
        this.mWikiModel.setUp();
        try {
            rendered = "<h1>" + getTitle() + "</h1>\n" + this.mWikiModel.render(getWikiMarkup());
        } finally {
            // Always tear down so a failed render does not leave the reused model set up.
            this.mWikiModel.tearDown();
        }
        return rendered.replace("&#38;nbsp;", " ").replace("&#60;references /&#62;", "").replaceAll("\\{\\{.*?\\}\\}", "").replaceAll("&#60;ref name.*?&#60;/ref&#62;", "");
    }

    @Override // edu.umd.cloud9.collection.Indexable
    public String getDisplayContentType() {
        return "text/html";
    }

    /** Returns the raw page XML exactly as passed to {@link #readPage}. */
    public String getRawXML() {
        return this.mPage;
    }

    /**
     * Returns the text between the opening {@code <text xml:space="preserve">}
     * tag and the following {@code </text>}, or {@code null} when the page has
     * no such opening tag.
     */
    public String getWikiMarkup() {
        if (this.mTextStart == -1) {
            return null;
        }
        return this.mPage.substring(this.mTextStart + TEXT_OPEN_TAG.length(), this.mTextEnd);
    }

    /** Returns the content of the first {@code <title>} element. */
    public String getTitle() {
        return this.mTitle;
    }

    /** Returns whether {@code {{disambig}}} was found when the page was parsed. */
    public boolean isDisambiguation() {
        return this.mIsDisambig;
    }

    /** Returns whether the wiki markup starts with {@code #REDIRECT}. */
    public boolean isRedirect() {
        return this.mIsRedirect;
    }

    /** Returns whether the page has no {@code <text xml:space="preserve">} element. */
    public boolean isEmpty() {
        return this.mTextStart == -1;
    }

    /** Returns whether {@code stub}}} was found when the page was parsed. */
    public boolean isStub() {
        return this.mIsStub;
    }

    /**
     * Finds the first link of the form {@code [[<str>:target]]} in the raw page.
     *
     * @param str link prefix to look for (the part before the colon)
     * @return the target text, or {@code null} if there is no such link, the
     *         link is unterminated, the target is empty, or it spans a newline
     */
    public String findInterlanguageLink(String str) {
        String prefix = "[[" + str + ":";
        int start = this.mPage.indexOf(prefix);
        if (start < 0) {
            return null;
        }
        int end = this.mPage.indexOf("]]", start);
        if (end < 0) {
            return null;
        }
        String target = this.mPage.substring(start + prefix.length(), end);
        if (target.length() == 0 || target.indexOf("\n") != -1) {
            return null;
        }
        return target;
    }

    /**
     * Collects the destinations of all {@code [[...]]} links found in the raw
     * page XML, in order of appearance.
     *
     * <p>Links whose body is empty or contains a colon are skipped. Anything
     * from the first {@code |} or {@code #} onward is cut off, and links left
     * empty by that cut are skipped. Destinations are trimmed.
     *
     * @return the link destinations; empty if the page contains none
     */
    public List<String> extractLinkDestinations() {
        List<String> destinations = new ArrayList<String>();
        int from = 0;
        while (true) {
            int open = this.mPage.indexOf("[[", from);
            if (open < 0) {
                return destinations;
            }
            int close = this.mPage.indexOf("]]", open);
            if (close < 0) {
                // Unterminated link: nothing further can be extracted.
                return destinations;
            }
            // Resume just inside the closing brackets, whatever happens to this link.
            from = close + 1;

            String target = this.mPage.substring(open + 2, close);
            if (target.length() == 0 || target.indexOf(":") != -1) {
                continue;
            }
            int pipe = target.indexOf("|");
            if (pipe != -1) {
                target = target.substring(0, pipe);
            }
            int hash = target.indexOf("#");
            if (hash != -1) {
                target = target.substring(0, hash);
            }
            if (target.length() != 0) {
                destinations.add(target.trim());
            }
        }
    }

    /**
     * Populates {@code wikipediaPage} from the raw XML of a single page.
     *
     * <p>The title and id are taken from the first {@code <title>} and
     * {@code <id>} elements; a page lacking either makes the underlying
     * {@code substring} call throw. A page without a
     * {@code <text xml:space="preserve">} element is accepted and reported as
     * empty and not a redirect.
     *
     * @param wikipediaPage page object to fill in
     * @param str raw page XML
     */
    public static void readPage(WikipediaPage wikipediaPage, String str) {
        wikipediaPage.mPage = str;

        int titleStart = str.indexOf("<title>");
        wikipediaPage.mTitle = str.substring(titleStart + 7, str.indexOf("</title>", titleStart));
        wikipediaPage.mId = str.substring(str.indexOf("<id>") + 4, str.indexOf("</id>"));

        int textStart = str.indexOf(TEXT_OPEN_TAG);
        wikipediaPage.mTextStart = textStart;
        wikipediaPage.mTextEnd = str.indexOf(TEXT_CLOSE_TAG, textStart);

        wikipediaPage.mIsDisambig = str.indexOf("{{disambig}}", textStart) != -1;
        // startsWith with an offset is bounds-safe, unlike slicing a fixed 9 characters,
        // which threw on short texts and read unrelated data when the tag was missing.
        wikipediaPage.mIsRedirect = textStart != -1
                && str.startsWith(REDIRECT_MARKER, textStart + TEXT_OPEN_TAG.length());
        wikipediaPage.mIsStub = str.indexOf("stub}}", textStart) != -1;
    }

    /**
     * Second cleaning pass: drops every line that opens or lies inside a
     * {@code {|} ... {@code |}} block, strips quote/markup remnants from the
     * remaining lines, and discards lines matching any reject pattern.
     *
     * @param str newline-separated text
     * @return the kept lines, each terminated by a newline
     */
    private static String parseAndCleanPage2(String str) {
        StringBuilder kept = new StringBuilder();
        int depth = 0;
        for (String line : str.split("\n")) {
            int opened = getCount(TABLE_OPEN.matcher(line));
            if (depth != 0 || opened > 0) {
                // Inside (or entering) a block: track nesting and drop the line.
                depth += opened - getCount(TABLE_CLOSE.matcher(line));
                continue;
            }

            String cleaned = line;
            for (Pattern strip : STRIP_PATTERNS) {
                cleaned = strip.matcher(cleaned).replaceAll("");
            }

            // NOTE(review): matches() requires the whole line to have this shape;
            // find() may have been intended. Kept as-is to preserve output.
            Matcher piped = PIPED_TOKEN.matcher(cleaned);
            if (piped.matches()) {
                cleaned = piped.replaceAll(" $2 ");
            }

            // NOTE(review): the previous code called cleaned.replaceAll("\\|", " | ")
            // here and discarded the result, so it had no effect. The dead call is
            // removed rather than "fixed" to keep the output unchanged.

            if (!matchesAny(LINE_REJECT_PATTERNS, cleaned)) {
                kept.append(cleaned).append("\n");
            }
        }
        return kept.toString();
    }

    /**
     * Cleans wiki text line by line: drops every line that opens or lies inside
     * a {@code {{} ... {@code }}} block, removes {@code [[} and {@code ]]} from
     * the remaining lines, drops lines consisting of a single {@code *}, and
     * then applies the second cleaning pass.
     *
     * @param str newline-separated wiki text
     * @return the cleaned text, each kept line terminated by a newline
     */
    public static String parseAndCleanPage(String str) {
        StringBuilder kept = new StringBuilder();
        int depth = 0;
        for (String line : str.split("\n")) {
            int opened = getCount(TEMPLATE_OPEN.matcher(line));
            if (depth != 0 || opened > 0) {
                // Inside (or entering) a block: track nesting and drop the line.
                depth += opened - getCount(TEMPLATE_CLOSE.matcher(line));
                continue;
            }

            String unlinked = LINK_CLOSE.matcher(LINK_OPEN.matcher(line).replaceAll("")).replaceAll("");
            if (!LONE_BULLET.matcher(unlinked).matches()) {
                kept.append(unlinked).append("\n");
            }
        }
        return parseAndCleanPage2(kept.toString());
    }

    /** Returns whether any of {@code patterns} matches the entire {@code line}. */
    private static boolean matchesAny(Pattern[] patterns, String line) {
        for (Pattern pattern : patterns) {
            if (pattern.matcher(line).matches()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Counts the remaining matches of {@code matcher}, advancing it to the end
     * of its input.
     *
     * @param matcher matcher to consume
     * @return number of successful {@code find()} calls
     */
    static int getCount(Matcher matcher) {
        int count = 0;
        while (matcher.find()) {
            count++;
        }
        return count;
    }
}
