package org.aksw.gerbil.dataset.impl.aida;

import au.com.bytecode.opencsv.CSVReader;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.aksw.gerbil.dataset.InitializableDataset;
import org.aksw.gerbil.dataset.impl.AbstractDataset;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.NamedEntity;
import org.aksw.gerbil.utils.WikipediaHelper;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/aksw/gerbil/dataset/impl/aida/AIDACoNLLDataset.class */
/**
 * Dataset that reads documents from a tab separated file in which every row
 * describes one token. A row whose first column starts with
 * {@value #DOCUMENT_START_TAG} opens a new document, the first column of every
 * other non-empty row is a token of the current document and the following
 * columns optionally carry a named entity annotation for that token.
 *
 * <p>
 * {@link #init()} has to be called before {@link #size()} or
 * {@link #getInstances()} are used since the documents are loaded there.
 * </p>
 */
public class AIDACoNLLDataset extends AbstractDataset implements InitializableDataset {
    private static final Logger LOGGER = LoggerFactory.getLogger(AIDACoNLLDataset.class);
    /** Column separator of the dataset file. */
    private static final char SEPARATION_CHAR = '\t';
    /**
     * Quotation character handed to the CSV reader. NOTE(review): the NUL
     * character is presumably chosen to switch off quote handling so that
     * quotation mark tokens are read literally - confirm against the opencsv
     * version in use.
     */
    private static final char QUOTATION_CHAR = 0;
    /** Column containing the token text. */
    private static final int TEXT_INDEX = 0;
    /** Column containing the annotation tag of the token. */
    private static final int NE_TYPE_INDEX = 1;
    /** Column containing the surface form of the annotated mention. */
    private static final int ANNOTATION_SURFACE_FORM_INDEX = 2;
    /** Column containing the entity title or {@link #ANNOTATION_NOT_IN_WIKI_TAG}. */
    private static final int ANNOTATION_TITLE_INDEX = 3;
    /** Column containing the URI of the annotated entity. */
    private static final int ANNOTATION_URI_INDEX = 4;
    private static final String DOCUMENT_START_TAG = "-DOCSTART-";
    /** Tag marking the first token of an annotated mention. */
    private static final String ANNOTATION_FIRST_WORD_TAG = "B";
    /** Title used for mentions for which an artificial URI is generated. */
    private static final String ANNOTATION_NOT_IN_WIKI_TAG = "--NME--";
    private static final String WIKIPEDIA_URI_START = "http://en.wikipedia.org/wiki/";

    /** Path of the dataset file. */
    private final String file;
    /** Documents of this dataset; {@code null} until {@link #init()} has been called. */
    private List<Document> documents;
    /** 1-based id of the first document that is kept (values &lt;= 0 disable the selection). */
    private final int firstDocId;
    /** 1-based id of the last document that is kept (values &lt;= 0 disable the selection). */
    private final int lastDocId;

    /**
     * Creates a dataset that contains all documents of the given file.
     *
     * @param str
     *            path of the dataset file
     */
    public AIDACoNLLDataset(String str) {
        this(str, -1, -1);
    }

    /**
     * Creates a dataset restricted to a range of documents given as Strings.
     *
     * @param str
     *            path of the dataset file
     * @param str2
     *            1-based id of the first document (inclusive)
     * @param str3
     *            1-based id of the last document (inclusive)
     * @throws NumberFormatException
     *             if one of the ids can not be parsed as an integer
     */
    public AIDACoNLLDataset(String str, String str2, String str3) {
        this(str, Integer.parseInt(str2), Integer.parseInt(str3));
    }

    /**
     * Creates a dataset restricted to a range of documents. The restriction is
     * only applied if both ids are larger than 0.
     *
     * @param str
     *            path of the dataset file
     * @param i
     *            1-based id of the first document (inclusive)
     * @param i2
     *            1-based id of the last document (inclusive)
     */
    public AIDACoNLLDataset(String str, int i, int i2) {
        this.file = str;
        this.firstDocId = i;
        this.lastDocId = i2;
    }

    /**
     * @return the number of documents; must not be called before {@link #init()}
     */
    @Override // org.aksw.gerbil.dataset.Dataset
    public int size() {
        return this.documents.size();
    }

    /**
     * @return the documents of this dataset or {@code null} if {@link #init()}
     *         has not been called
     */
    @Override // org.aksw.gerbil.dataset.Dataset
    public List<Document> getInstances() {
        return this.documents;
    }

    /**
     * Loads the documents from the dataset file and, if both document ids are
     * larger than 0, keeps only the documents inside the configured range.
     *
     * @throws GerbilException
     *             if the file can not be read, is malformed or the configured
     *             range does not fit the number of loaded documents
     */
    @Override // org.aksw.gerbil.dataset.InitializableDataset
    public void init() throws GerbilException {
        List<Document> loaded = loadDocuments(new File(this.file));
        if (this.firstDocId > 0 && this.lastDocId > 0) {
            // subList() would fail with an unchecked exception for these cases; report
            // them as a dataset loading problem instead.
            int fromIndex = this.firstDocId - 1;
            if (fromIndex > this.lastDocId || this.lastDocId > loaded.size()) {
                throw loadingError("The document range [" + this.firstDocId + ", " + this.lastDocId
                        + "] does not fit the " + loaded.size() + " documents of the dataset file \"" + this.file
                        + "\".");
            }
            loaded = loaded.subList(fromIndex, this.lastDocId);
        }
        this.documents = loaded;
    }

    /**
     * Reads the given file row by row and assembles the documents.
     *
     * <p>
     * The text of a document is rebuilt from its tokens. A single space is
     * inserted in front of a token unless
     * </p>
     * <ul>
     * <li>it is the first token in the text buffer,</li>
     * <li>it is one of the single characters {@code ! ) , . ? ] }},</li>
     * <li>it is a {@code "} closing a quotation,</li>
     * <li>it is longer than one character and does not start with a letter or
     * digit, or</li>
     * <li>the previous token was one of {@code ( [ {} or a {@code "} opening a
     * quotation.</li>
     * </ul>
     *
     * @param file
     *            the dataset file
     * @return the documents in the order in which they occur in the file
     * @throws GerbilException
     *             if the file can not be read or contains a malformed annotation
     */
    // The markings list has to stay a raw List: the element type expected by the
    // DocumentImpl constructor is not imported into this file.
    @SuppressWarnings({ "rawtypes", "unchecked" })
    protected List<Document> loadDocuments(File file) throws GerbilException {
        String uriPrefix = "http://" + getName() + "/";
        BufferedReader reader = null;
        CSVReader csvReader = null;
        List<Document> result = new ArrayList<Document>();
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), Charset.forName("UTF-8")));
            csvReader = new CSVReader(reader, SEPARATION_CHAR, QUOTATION_CHAR);
            Document currentDocument = null;
            StringBuilder text = new StringBuilder();
            List markings = null;
            NamedEntity currentEntity = null;
            boolean insideQuotation = false;
            boolean spaceAllowedAfterPrevious = true;
            int rowNumber = 0;
            for (String[] row = csvReader.readNext(); row != null; row = csvReader.readNext()) {
                ++rowNumber;
                if (row.length == 0) {
                    continue;
                }
                String token = row[TEXT_INDEX];
                if (token.startsWith(DOCUMENT_START_TAG)) {
                    // finish the previous document before the next one is started
                    if (currentDocument != null) {
                        currentDocument.setText(text.toString().trim());
                        text.setLength(0);
                        insideQuotation = false;
                    }
                    // an entity must never be continued across a document border
                    currentEntity = null;
                    markings = new ArrayList();
                    currentDocument = new DocumentImpl((String) null, uriPrefix + result.size(), markings);
                    result.add(currentDocument);
                    continue;
                }
                if (token.isEmpty()) {
                    continue;
                }

                boolean insertSpace = spaceAllowedAfterPrevious;
                spaceAllowedAfterPrevious = true;
                // NOTE(review): quotation marks are only tracked while the buffer is not
                // empty, i.e., a '"' as very first token is not counted as opening a
                // quotation. This is kept as it is because changing it would change the
                // generated texts and entity positions.
                if (text.length() > 0) {
                    if (token.length() == 1) {
                        switch (token.charAt(0)) {
                        case '!':
                        case ')':
                        case ',':
                        case '.':
                        case '?':
                        case ']':
                        case '}':
                            insertSpace = false;
                            break;
                        case '\"':
                            if (insideQuotation) {
                                // closing quotation mark: attach it to the previous token
                                insertSpace = false;
                            } else {
                                // opening quotation mark: attach the next token to it
                                spaceAllowedAfterPrevious = false;
                            }
                            insideQuotation = !insideQuotation;
                            break;
                        case '(':
                        case '[':
                        case '{':
                            spaceAllowedAfterPrevious = false;
                            break;
                        default:
                            break;
                        }
                    } else if (!Character.isLetterOrDigit(token.charAt(0))) {
                        insertSpace = false;
                    }
                    if (insertSpace) {
                        text.append(' ');
                    }
                }

                if (row.length <= NE_TYPE_INDEX || row[NE_TYPE_INDEX].isEmpty()) {
                    // token without annotation ends the current entity (if there is one)
                    currentEntity = null;
                } else if (row[NE_TYPE_INDEX].equals(ANNOTATION_FIRST_WORD_TAG)) {
                    if (markings == null) {
                        throw loadingError("Row " + rowNumber + " of \"" + file
                                + "\" contains an annotation before the first \"" + DOCUMENT_START_TAG + "\" row.");
                    }
                    Set<String> uris = readEntityUris(uriPrefix, row, rowNumber, file);
                    // the length is set below after the token has been appended
                    currentEntity = new NamedEntity(text.length(), 0, uris);
                    markings.add(currentEntity);
                }
                // every other tag value continues the current entity

                text.append(token);
                if (currentEntity != null) {
                    currentEntity.setLength(text.length() - currentEntity.getStartPosition());
                }
            }
            if (currentDocument != null) {
                currentDocument.setText(text.toString().trim());
            }
            return result;
        } catch (IOException e) {
            throw new GerbilException("Couldn't read dataset file.", e, ErrorTypes.DATASET_LOADING_ERROR);
        } finally {
            IOUtils.closeQuietly(csvReader);
            IOUtils.closeQuietly(reader);
        }
    }

    /**
     * Determines the URIs of the entity annotated in the given row, which has to
     * be the first row of a mention.
     *
     * @param uriPrefix
     *            prefix used for artificial URIs
     * @param row
     *            the columns of the row
     * @param rowNumber
     *            1-based number of the row (used for error messages only)
     * @param file
     *            the dataset file (used for error messages only)
     * @return a modifiable set containing the URIs of the entity
     * @throws GerbilException
     *             if the row has less columns than the annotation needs
     */
    private Set<String> readEntityUris(String uriPrefix, String[] row, int rowNumber, File file)
            throws GerbilException {
        if (row.length <= ANNOTATION_TITLE_INDEX) {
            throw loadingError("Row " + rowNumber + " of \"" + file + "\" starts an annotation but has only "
                    + row.length + " columns.");
        }
        if (row[ANNOTATION_TITLE_INDEX].equals(ANNOTATION_NOT_IN_WIKI_TAG)) {
            return generateArtificialUri(uriPrefix, row[ANNOTATION_SURFACE_FORM_INDEX]);
        }
        if (row.length <= ANNOTATION_URI_INDEX) {
            throw loadingError("Row " + rowNumber + " of \"" + file + "\" starts an annotation but has no URI column.");
        }
        String uri = row[ANNOTATION_URI_INDEX];
        Set<String> uris;
        if (uri.startsWith(WIKIPEDIA_URI_START)) {
            uris = WikipediaHelper.generateUriSet(uri.substring(WIKIPEDIA_URI_START.length()));
        } else {
            LOGGER.warn("Found a URI that is not part of the English Wikipedia \"{}\". This was not expected.", uri);
            uris = new HashSet<String>();
        }
        // the URI given in the file is always part of the result
        uris.add(uri);
        return uris;
    }

    /**
     * Creates a dataset loading exception for a problem that has no underlying
     * exception. Only the (message, cause, error type) constructor of
     * {@link GerbilException} is used by this class, so a cause describing the
     * problem is created explicitly.
     *
     * @param message
     *            description of the problem
     * @return the exception that should be thrown by the caller
     */
    private static GerbilException loadingError(String message) {
        return new GerbilException(message, new IllegalArgumentException(message), ErrorTypes.DATASET_LOADING_ERROR);
    }

    /**
     * Generates a URI of the form {@code <prefix>notInWiki/<encoded surface form>}
     * for a mention that has no entity title.
     *
     * @param str
     *            the URI prefix
     * @param str2
     *            the surface form of the mention, which is URL encoded as UTF-8
     * @return a modifiable set containing only the generated URI
     * @throws GerbilException
     *             if the UTF-8 encoding is not supported
     */
    protected Set<String> generateArtificialUri(String str, String str2) throws GerbilException {
        StringBuilder uriBuilder = new StringBuilder();
        uriBuilder.append(str);
        uriBuilder.append("notInWiki/");
        try {
            uriBuilder.append(URLEncoder.encode(str2, "UTF-8"));
            Set<String> uris = new HashSet<String>(2);
            uris.add(uriBuilder.toString());
            return uris;
        } catch (UnsupportedEncodingException e) {
            LOGGER.error("Couldn't encode surface form data.", e);
            throw new GerbilException("Couldn't encode surface form data.", e, ErrorTypes.DATASET_LOADING_ERROR);
        }
    }
}
