package org.aksw.gerbil.dataset.impl.iitb;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.aksw.gerbil.dataset.InitializableDataset;
import org.aksw.gerbil.dataset.impl.AbstractDataset;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.NamedEntity;
import org.aksw.gerbil.utils.WikipediaHelper;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/aksw/gerbil/dataset/impl/iitb/IITBDataset.class */
/**
 * Dataset that reads plain-text documents from a directory and the matching
 * annotations from a single annotation file parsed by {@link IITB_XMLParser}.
 *
 * <p>
 * The instance is only usable after {@link #init()} has been called; until
 * then {@link #documents} is {@code null}.
 * </p>
 */
public class IITBDataset extends AbstractDataset implements InitializableDataset {
    private static final Logger LOGGER = LoggerFactory.getLogger(IITBDataset.class);

    /** Documents loaded by {@link #init()}; {@code null} before that call. */
    protected List<Document> documents;
    /** Path of the directory containing the text files. */
    protected String textsDirectory;
    /**
     * Path handed to the annotation parser. Despite the field name, the path
     * is passed on as the annotation <em>file</em> (see
     * {@link #loadAnnotations(File)}).
     */
    protected String annotationsDirectory;
    /** Counter used to create unique URIs for entities without a generated URI. */
    protected int unknownEntitiesCount = 0;

    /**
     * Stores the two paths. Nothing is read from disk until {@link #init()}.
     *
     * @param str
     *            path of the directory containing the text files
     * @param str2
     *            path of the annotation file
     * @throws GerbilException
     *             declared for interface compatibility; not thrown by this
     *             constructor
     */
    public IITBDataset(String str, String str2) throws GerbilException {
        this.textsDirectory = str;
        this.annotationsDirectory = str2;
    }

    /**
     * Returns the number of loaded documents. Must not be called before
     * {@link #init()} because the document list is {@code null} until then.
     */
    @Override // org.aksw.gerbil.dataset.Dataset
    public int size() {
        return this.documents.size();
    }

    /**
     * Returns the internal list of loaded documents (not a copy), or
     * {@code null} if {@link #init()} has not been called yet.
     */
    @Override // org.aksw.gerbil.dataset.Dataset
    public List<Document> getInstances() {
        return this.documents;
    }

    /**
     * Loads all documents from the configured text directory and annotation
     * file.
     *
     * @throws GerbilException
     *             if the paths are invalid or a file cannot be read or parsed
     */
    @Override // org.aksw.gerbil.dataset.InitializableDataset
    public void init() throws GerbilException {
        this.documents = loadDocuments(new File(this.textsDirectory), new File(this.annotationsDirectory));
    }

    /**
     * Parses the annotation file and creates one document per annotated file
     * name by reading the file with that name from the text directory.
     *
     * @param file
     *            the directory containing the text files
     * @param file2
     *            the annotation file
     * @return the loaded documents, in the iteration order of the parsed
     *         annotation map
     * @throws GerbilException
     *             if the text directory is missing or not a directory, if the
     *             annotation file is missing or cannot be parsed, or if a text
     *             file cannot be read
     */
    protected List<Document> loadDocuments(File file, File file2) throws GerbilException {
        // isDirectory() is already false for a non-existing path, so no
        // separate exists() check is needed.
        if (!file.isDirectory()) {
            throw new GerbilException("The given text directory (" + file.getAbsolutePath() + ") is not existing or not a directory.", ErrorTypes.DATASET_LOADING_ERROR);
        }
        String textDirectoryPrefix = file.getAbsolutePath();
        if (!textDirectoryPrefix.endsWith(File.separator)) {
            textDirectoryPrefix = textDirectoryPrefix + File.separator;
        }
        if (!file2.exists()) {
            throw new GerbilException("The given annotation file (" + file2.getAbsolutePath() + ") does not exist.", ErrorTypes.DATASET_LOADING_ERROR);
        }
        Map<String, Set<IITB_Annotation>> annotationsPerFile = loadAnnotations(file2);
        List<Document> loadedDocuments = new ArrayList<Document>(annotationsPerFile.size());
        for (Map.Entry<String, Set<IITB_Annotation>> entry : annotationsPerFile.entrySet()) {
            String textFileName = entry.getKey();
            String text;
            try {
                // NOTE(review): the single-argument overload does not name a
                // charset. Since the annotation offsets index into this text,
                // consider passing the corpus encoding explicitly - confirm
                // the encoding of the text files first.
                text = FileUtils.readFileToString(new File(textDirectoryPrefix + textFileName));
            } catch (IOException e) {
                throw new GerbilException("Couldn't read text file \"" + textDirectoryPrefix + textFileName + "\".", e, ErrorTypes.DATASET_LOADING_ERROR);
            }
            loadedDocuments.add(createDocument(textFileName, text, entry.getValue()));
        }
        return loadedDocuments;
    }

    /**
     * Parses the given annotation file.
     *
     * @param file
     *            the annotation file
     * @return a mapping from text file name to the annotations of that file
     * @throws GerbilException
     *             wrapping any exception raised by the parser
     */
    protected Map<String, Set<IITB_Annotation>> loadAnnotations(File file) throws GerbilException {
        try {
            return new IITB_XMLParser().parseAnnotationsFile(file);
        } catch (Exception e) {
            throw new GerbilException("Couldn't parse given annotation file (\"" + file.getAbsolutePath() + "\").", e, ErrorTypes.DATASET_LOADING_ERROR);
        }
    }

    /**
     * Creates a document from the given text and converts every annotation
     * into a {@link NamedEntity}.
     *
     * <p>
     * Annotations whose span is empty or does not lie completely inside the
     * text are logged and skipped instead of aborting the load with a
     * {@link StringIndexOutOfBoundsException}. For all other annotations,
     * suspicious span boundaries (adjacent alphabetic characters, leading or
     * trailing whitespace) are logged as warnings but the annotation is kept.
     * If no URI can be generated from the annotation's wiki title, an
     * artificial URI is generated instead.
     * </p>
     *
     * @param str
     *            name of the text file, used to build the document URI
     * @param str2
     *            the text of the document
     * @param set
     *            the annotations belonging to the document
     * @return the created document
     */
    protected Document createDocument(String str, String str2, Set<IITB_Annotation> set) {
        String documentUri = generateDocumentUri(str);
        // NOTE(review): left as a raw list because the element type expected
        // by the DocumentImpl constructor is not visible in this file - TODO
        // confirm and parameterize.
        ArrayList markings = new ArrayList(set.size());
        for (IITB_Annotation annotation : set) {
            if (!isSpanInsideText(annotation, str2)) {
                LOGGER.warn("In document " + documentUri + ", the annotation with offset " + annotation.offset + " and length " + annotation.length + " does not describe a non-empty span inside the text (text length " + str2.length() + "). It will be ignored.");
                continue;
            }
            logBoundaryWarnings(documentUri, str2, annotation);
            Set<String> uris = WikipediaHelper.generateUriSet(annotation.wikiTitle);
            if (uris.isEmpty()) {
                uris.add(generateEntityUri());
            }
            markings.add(new NamedEntity(annotation.offset, annotation.length, uris));
        }
        return new DocumentImpl(str2, documentUri, markings);
    }

    /**
     * Checks that the annotation covers at least one character and that the
     * whole span lies inside the text. Written without computing
     * {@code offset + length} so that large values cannot overflow.
     */
    private static boolean isSpanInsideText(IITB_Annotation annotation, String text) {
        return annotation.offset >= 0 && annotation.length > 0
                && annotation.length <= text.length() - annotation.offset;
    }

    /**
     * Logs a warning for every suspicious boundary of the annotated span. The
     * caller must have verified that the span is non-empty and inside the
     * text.
     */
    private void logBoundaryWarnings(String documentUri, String text, IITB_Annotation annotation) {
        int start = annotation.offset;
        int end = start + annotation.length;
        String prefix = "In document " + documentUri + ", the named entity \"" + text.substring(start, end) + "\" ";
        if (start > 0 && Character.isAlphabetic(text.charAt(start - 1))) {
            LOGGER.warn(prefix + "has an alphabetic character in front of it (\"" + text.charAt(start - 1) + "\").");
        }
        if (Character.isWhitespace(text.charAt(start))) {
            LOGGER.warn(prefix + "starts with a whitespace.");
        }
        if (end < text.length() && Character.isAlphabetic(text.charAt(end))) {
            LOGGER.warn(prefix + "has an alphabetic character directly behind it (\"" + text.charAt(end) + "\").");
        }
        if (Character.isWhitespace(text.charAt(end - 1))) {
            LOGGER.warn(prefix + "ends with a whitespace.");
        }
    }

    /**
     * Creates a new artificial entity URI by incrementing
     * {@link #unknownEntitiesCount}.
     */
    private String generateEntityUri() {
        this.unknownEntitiesCount++;
        return "http://" + this.name + "/notInWiki/entity_" + this.unknownEntitiesCount;
    }

    /**
     * Builds the URI of a document from the dataset name and the given file
     * name.
     *
     * @param str
     *            name of the text file
     * @return the document URI
     */
    protected String generateDocumentUri(String str) {
        return "http://" + this.name + '/' + str;
    }
}
