package org.aksw.gerbil.dataset.impl.micro;

import au.com.bytecode.opencsv.CSVReader;
import com.carrotsearch.hppc.IntArrayList;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.aksw.gerbil.dataset.InitializableDataset;
import org.aksw.gerbil.dataset.impl.AbstractDataset;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Marking;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.NamedEntity;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/aksw/gerbil/dataset/impl/micro/Microposts2014Dataset.class */
/**
 * Dataset that is read from a single tab separated file.
 *
 * <p>Every line of the file is split into cells. The cell at {@link #TWEET_ID_INDEX} is appended
 * to the document URI prefix, the cell at {@link #TWEET_TEXT_INDEX} is used as document text and
 * all cells starting at {@link #FIRST_ANNOTATION_INDEX} are read pairwise: the first cell of a pair
 * is a mention that is searched inside the text, the second cell is handed to
 * {@link NamedEntity} as its third constructor argument (presumably the URI of the entity).
 *
 * <p>The documents are not loaded by the constructor. {@link #init()} has to be called before
 * {@link #size()} or {@link #getInstances()} return something useful; until then
 * {@link #documents} is {@code null}.
 */
public class Microposts2014Dataset extends AbstractDataset implements InitializableDataset {
    private static final Logger LOGGER = LoggerFactory.getLogger(Microposts2014Dataset.class);
    /** Character separating the cells of a line. */
    private static final char SEPARATION_CHAR = '\t';
    /** Index of the cell containing the id of the tweet. */
    private static final int TWEET_ID_INDEX = 0;
    /** Index of the cell containing the text of the tweet. */
    private static final int TWEET_TEXT_INDEX = 1;
    /** Index of the first cell belonging to a (mention, second cell) annotation pair. */
    private static final int FIRST_ANNOTATION_INDEX = 2;

    /** The loaded documents; {@code null} until {@link #init()} has been called. */
    protected List<Document> documents;
    /** Path of the file the documents are read from. */
    private final String tweetsFile;

    /**
     * Creates the dataset without reading the file.
     *
     * @param str path of the tab separated file that is read by {@link #init()}
     */
    public Microposts2014Dataset(String str) {
        this.tweetsFile = str;
    }

    /**
     * Returns the number of loaded documents.
     *
     * @return the size of the document list
     * @throws NullPointerException if {@link #init()} has not been called before
     */
    @Override // org.aksw.gerbil.dataset.Dataset
    public int size() {
        return this.documents.size();
    }

    /**
     * Returns the internal list of loaded documents (no copy is created).
     *
     * @return the loaded documents or {@code null} if {@link #init()} has not been called
     */
    @Override // org.aksw.gerbil.dataset.Dataset
    public List<Document> getInstances() {
        return this.documents;
    }

    /**
     * Loads the documents from the file given to the constructor.
     *
     * @throws GerbilException if the file can not be read or contains a malformed line
     */
    @Override // org.aksw.gerbil.dataset.InitializableDataset
    public void init() throws GerbilException {
        this.documents = loadDocuments(new File(this.tweetsFile));
    }

    /**
     * Reads the given file as UTF-8 and creates one document per line.
     *
     * @param file the tab separated file that should be read
     * @return the documents in the order of the lines of the file
     * @throws GerbilException if an {@link IOException} occurs while reading or if a line has
     *         fewer than two cells or an odd number of cells
     */
    protected List<Document> loadDocuments(File file) throws GerbilException {
        BufferedReader reader = null;
        CSVReader csvReader = null;
        List<Document> loadedDocuments = new ArrayList<Document>();
        String documentUriPrefix = "http://" + getName() + "/";
        try {
            reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), Charset.forName("UTF-8")));
            csvReader = new CSVReader(reader, SEPARATION_CHAR);
            for (String[] line = csvReader.readNext(); line != null; line = csvReader.readNext()) {
                // id and text are mandatory, the remaining cells have to form pairs
                if (line.length < FIRST_ANNOTATION_INDEX || (line.length & 1) != 0) {
                    throw new GerbilException("Dataset is malformed. Each line shoud have an even number of cells. Malformed line = " + Arrays.toString(line), ErrorTypes.DATASET_LOADING_ERROR);
                }
                String text = stripEnclosingQuotes(line[TWEET_TEXT_INDEX]).trim();
                loadedDocuments.add(new DocumentImpl(text, documentUriPrefix + line[TWEET_ID_INDEX],
                        findMarkings(line, text)));
            }
            return loadedDocuments;
        } catch (IOException e) {
            throw new GerbilException("Exception while reading dataset.", e, ErrorTypes.DATASET_LOADING_ERROR);
        } finally {
            // executed on success as well as on every exception
            IOUtils.closeQuietly(csvReader);
            IOUtils.closeQuietly(reader);
        }
    }

    /**
     * Removes one leading and one trailing double quote if they are present.
     *
     * @param cell the raw content of a cell
     * @return the cell content without the enclosing quotes
     */
    private static String stripEnclosingQuotes(String cell) {
        int begin = cell.startsWith("\"") ? 1 : 0;
        int end = cell.length();
        // a cell consisting of a single quote must not be stripped twice
        if (end > begin && cell.endsWith("\"")) {
            --end;
        }
        return cell.substring(begin, end);
    }

    /**
     * Searches the mentions of a line inside the given text.
     *
     * <p>The mentions are searched one after the other; each search starts behind the end of the
     * mention found before. If a mention can not be found it is searched a second time inside a
     * copy of the text from which all {@code '#'} characters have been removed. A position found
     * in this copy is translated back, so that the created marking refers to the original text
     * and covers the hashes located inside the mention. Mentions that can not be found at all
     * are logged and skipped.
     *
     * @param strArr the cells of a line; the cells starting at index 2 are read pairwise, hence
     *        the array is expected to have an even length
     * @param str the text in which the mentions are searched
     * @return the markings of all mentions that have been found
     */
    protected static List<Marking> findMarkings(String[] strArr, String str) {
        List<Marking> markings = new ArrayList<Marking>(strArr.length / 2);
        // both are created lazily when the first mention can not be found directly
        String textWithoutHashes = null;
        IntArrayList hashPositions = new IntArrayList();
        // exclusive end of the last mention found; always an offset of the ORIGINAL text
        int end = 0;
        for (int i = FIRST_ANNOTATION_INDEX; i < strArr.length; i += 2) {
            String mention = strArr[i];
            int start = str.indexOf(mention, end);
            if (start >= 0) {
                end = start + mention.length();
            } else {
                if (textWithoutHashes == null) {
                    for (int pos = str.indexOf('#'); pos >= 0; pos = str.indexOf('#', pos + 1)) {
                        hashPositions.add(pos);
                    }
                    textWithoutHashes = str.replace("#", "");
                }
                // Every hash in front of the offset moved the offset by one position. The
                // translated offset is kept local, so that 'end' stays valid for the original
                // text if this second search fails as well.
                int hashesBeforeEnd = 0;
                while (hashesBeforeEnd < hashPositions.elementsCount
                        && hashPositions.buffer[hashesBeforeEnd] < end) {
                    ++hashesBeforeEnd;
                }
                start = textWithoutHashes.indexOf(mention, end - hashesBeforeEnd);
                if (start >= 0) {
                    // translate the positions back by re-inserting the hashes one by one
                    end = start + mention.length();
                    for (int j = 0; j < hashPositions.elementsCount && hashPositions.buffer[j] < end; ++j) {
                        ++end;
                        if (hashPositions.buffer[j] < start) {
                            ++start;
                        }
                    }
                }
            }
            if (start < 0) {
                LOGGER.warn("Couldn't find \"{}\" inside \"{}\". This annotation will be ignored.", mention, str);
            } else {
                markings.add(new NamedEntity(start, end - start, strArr[i + 1]));
            }
        }
        return markings;
    }
}
