package org.aksw.gerbil.dataset.impl.msnbc;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.aksw.gerbil.dataset.InitializableDataset;
import org.aksw.gerbil.dataset.impl.AbstractDataset;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Span;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.NamedEntity;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/aksw/gerbil/dataset/impl/msnbc/MSNBCDataset.class */
public class MSNBCDataset extends AbstractDataset implements InitializableDataset, Comparator<Span> {
    private static final Logger LOGGER = LoggerFactory.getLogger(MSNBCDataset.class);
    protected List<Document> documents;
    protected String textsDirectory;
    protected String annotationsDirectory;

    public MSNBCDataset(String str, String str2) throws GerbilException {
        this.textsDirectory = str;
        this.annotationsDirectory = str2;
    }

    @Override // org.aksw.gerbil.dataset.Dataset
    public int size() {
        return this.documents.size();
    }

    @Override // org.aksw.gerbil.dataset.Dataset
    public List<Document> getInstances() {
        return this.documents;
    }

    @Override // org.aksw.gerbil.dataset.InitializableDataset
    public void init() throws GerbilException {
        this.documents = loadDocuments(new File(this.textsDirectory), new File(this.annotationsDirectory));
    }

    protected List<Document> loadDocuments(File file, File file2) throws GerbilException {
        if (!file.exists() || !file.isDirectory()) {
            throw new GerbilException("The given text directory (" + file.getAbsolutePath() + ") is not existing or not a directory.", ErrorTypes.DATASET_LOADING_ERROR);
        }
        String absolutePath = file.getAbsolutePath();
        if (!absolutePath.endsWith(File.separator)) {
            absolutePath = absolutePath + File.separator;
        }
        if (!file2.exists() || !file2.isDirectory()) {
            throw new GerbilException("The given annotation directory (" + file2.getAbsolutePath() + ") is not existing or not a directory.", ErrorTypes.DATASET_LOADING_ERROR);
        }
        MSNBC_XMLParser mSNBC_XMLParser = new MSNBC_XMLParser();
        ArrayList arrayList = new ArrayList();
        for (File file3 : file2.listFiles()) {
            try {
                MSNBC_Result parseAnnotationsFile = mSNBC_XMLParser.parseAnnotationsFile(file3);
                if (parseAnnotationsFile.getTextFileName() == null) {
                    throw new GerbilException("The parsed annotation file (\"" + file3.getAbsolutePath() + "\" did not define a text file name.", ErrorTypes.DATASET_LOADING_ERROR);
                }
                try {
                    arrayList.add(createDocument(parseAnnotationsFile.getTextFileName(), FileUtils.readFileToString(new File(absolutePath + parseAnnotationsFile.getTextFileName())), parseAnnotationsFile));
                } catch (IOException e) {
                    throw new GerbilException("Couldn't read text file \"" + absolutePath + parseAnnotationsFile.getTextFileName() + "\" mentioned in the annotations file \"" + file3.getAbsolutePath() + "\".", e, ErrorTypes.DATASET_LOADING_ERROR);
                }
            } catch (Exception e2) {
                throw new GerbilException("Couldn't parse given annotation file (\"" + file3.getAbsolutePath() + "\".", e2, ErrorTypes.DATASET_LOADING_ERROR);
            }
        }
        return arrayList;
    }

    protected Document createDocument(String str, String str2, MSNBC_Result mSNBC_Result) {
        String generateDocumentUri = generateDocumentUri(str);
        ArrayList arrayList = new ArrayList(mSNBC_Result.getMarkings().size());
        for (MSNBC_NamedEntity mSNBC_NamedEntity : mSNBC_Result.getMarkings()) {
            String substring = str2.substring(mSNBC_NamedEntity.getStartPosition(), mSNBC_NamedEntity.getStartPosition() + mSNBC_NamedEntity.getLength());
            if (!substring.equals(mSNBC_NamedEntity.getSurfaceForm())) {
                LOGGER.warn("In document " + generateDocumentUri + ", the expected surface form of the named entity " + mSNBC_NamedEntity + " does not fit the surface form derived from the text \"" + substring + "\".");
            }
            addDBpediaUris(mSNBC_NamedEntity.getUris());
            arrayList.add(mSNBC_NamedEntity.toNamedEntity());
        }
        DocumentImpl documentImpl = new DocumentImpl(str2, generateDocumentUri, arrayList);
        mergeSubNamedEntity(documentImpl);
        return documentImpl;
    }

    private void mergeSubNamedEntity(Document document) {
        List markings = document.getMarkings(NamedEntity.class);
        NamedEntity[] namedEntityArr = (NamedEntity[]) markings.toArray(new NamedEntity[markings.size()]);
        Arrays.sort(namedEntityArr, this);
        HashSet hashSet = new HashSet();
        for (int i = 0; i < namedEntityArr.length; i++) {
            boolean z = false;
            for (int i2 = i + 1; i2 < namedEntityArr.length && !z; i2++) {
                if (namedEntityArr[i].getStartPosition() >= namedEntityArr[i2].getStartPosition() && namedEntityArr[i].getStartPosition() + namedEntityArr[i].getLength() <= namedEntityArr[i2].getStartPosition() + namedEntityArr[i2].getLength()) {
                    z = false;
                    Iterator it = namedEntityArr[i].getUris().iterator();
                    while (!z && it.hasNext()) {
                        z = namedEntityArr[i2].containsUri((String) it.next());
                    }
                    if (z) {
                        namedEntityArr[i2].getUris().addAll(namedEntityArr[i2].getUris());
                        hashSet.add(namedEntityArr[i]);
                    } else {
                        LOGGER.debug("There are two overlapping named entities with different URI sets. {}, {}", namedEntityArr[i], namedEntityArr[i2]);
                    }
                }
            }
        }
        document.getMarkings().removeAll(hashSet);
    }

    protected String generateDocumentUri(String str) {
        return "http://" + this.name + '/' + str;
    }

    protected static void addDBpediaUris(Set<String> set) {
        ArrayList arrayList = new ArrayList(set.size());
        for (String str : set) {
            if (str.contains("en.wikipedia.org/wiki")) {
                arrayList.add(str.replace("en.wikipedia.org/wiki", "dbpedia.org/resource"));
            } else {
                arrayList.add(str.replace("wikipedia.org/wiki", "dbpedia.org/resource"));
            }
        }
        set.addAll(arrayList);
    }

    @Override // java.util.Comparator
    public int compare(Span span, Span span2) {
        int length = span.getLength() - span2.getLength();
        if (length == 0) {
            return 0;
        }
        return length < 0 ? -1 : 1;
    }
}
