package org.aksw.gerbil.tools;

import java.io.IOException;
import java.io.PrintStream;
import java.io.StringReader;
import java.util.Iterator;
import java.util.List;
import org.aksw.gerbil.dataset.Dataset;
import org.aksw.gerbil.dataset.DatasetConfiguration;
import org.aksw.gerbil.datatypes.ExperimentType;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.web.config.DatasetsConfig;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/aksw/gerbil/tools/DatasetAnalyzer.class */
/**
 * Command-line tool that writes simple per-dataset statistics as CSV rows.
 *
 * <p>For every dataset configuration returned by
 * {@code DatasetsConfig.datasets(null, null)} one row is written containing the
 * dataset name, the average number of markings per document, the average number
 * of markings per whitespace-separated token, the average number of tokens per
 * document, the number of documents and the total number of markings.
 *
 * <p>NOTE(review): the header line written by {@link #main(String[])} lists ten
 * columns, while {@code analyze} emits six values followed by a trailing comma.
 * The remaining columns (persons, organizations, locations, others) are not
 * computed anywhere in this class.
 */
public class DatasetAnalyzer {
    private static final Logger LOGGER = LoggerFactory.getLogger(DatasetAnalyzer.class);

    /** Sink that receives the CSV rows; owned and closed by the caller. */
    private final PrintStream output;

    /**
     * Analyzes all configured datasets and writes the result to the file
     * {@code datasetAnalyzation.log} in the working directory.
     *
     * @param strArr command-line arguments (ignored)
     */
    public static void main(String[] strArr) {
        List<DatasetConfiguration> configurations = DatasetsConfig.datasets(null, null).getConfigurations();
        // try-with-resources guarantees the stream is closed exactly once on every path.
        // The charset is given explicitly so the output does not depend on the platform default.
        try (PrintStream printStream = new PrintStream("datasetAnalyzation.log", "UTF-8")) {
            printStream.println("name,entitiesPerDoc, entitiesPerToken, avgDocumentLength,numberOfDocuments,numberOfEntities, amountOfPersons, amountOfOrganizations, amountOfLocations, amountOfOthers");
            DatasetAnalyzer datasetAnalyzer = new DatasetAnalyzer(printStream);
            for (DatasetConfiguration configuration : configurations) {
                try {
                    datasetAnalyzer.analyzeDataset(configuration);
                } catch (GerbilException e) {
                    // A dataset that fails must not prevent the remaining ones from being analyzed.
                    LOGGER.error("Error while analyzing dataset " + configuration.getName() + ".", e);
                }
            }
        } catch (Exception e) {
            // Boundary of the tool: report anything unexpected instead of terminating with a raw exception.
            LOGGER.error("Error while analyzing the datasets. Aborting.", e);
        }
    }

    /**
     * Creates an analyzer that prints its results to the given stream.
     *
     * @param printStream the stream the CSV rows are written to
     */
    public DatasetAnalyzer(PrintStream printStream) {
        this.output = printStream;
    }

    /**
     * Analyzes the dataset using the first experiment type it is applicable for,
     * checked in the order {@code D2KB}, {@code OKE_Task2}, {@code C2KB}. If none
     * of them applies, an error is logged and nothing is written.
     *
     * @param datasetConfiguration the configuration of the dataset to analyze
     * @throws GerbilException if the dataset can not be loaded
     */
    public void analyzeDataset(DatasetConfiguration datasetConfiguration) throws GerbilException {
        if (datasetConfiguration.isApplicableForExperiment(ExperimentType.D2KB)) {
            analyze(datasetConfiguration, ExperimentType.D2KB);
        } else if (datasetConfiguration.isApplicableForExperiment(ExperimentType.OKE_Task2)) {
            analyze(datasetConfiguration, ExperimentType.OKE_Task2);
        } else if (datasetConfiguration.isApplicableForExperiment(ExperimentType.C2KB)) {
            analyze(datasetConfiguration, ExperimentType.C2KB);
        } else {
            LOGGER.error("Can not analyze the dataset with the following config: {}", datasetConfiguration);
        }
    }

    /**
     * Counts the whitespace-separated tokens of the given text.
     *
     * @param str the text to tokenize
     * @return the number of tokens; if tokenizing fails with an {@link IOException}
     *         the error is logged and the number of tokens counted so far is returned
     */
    private int countTokensInText(String str) {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        int tokenCount = 0;
        try {
            tokenizer.setReader(new StringReader(str));
            // The Lucene TokenStream workflow requires reset() before the first incrementToken()
            // and end() after the last one.
            tokenizer.reset();
            while (tokenizer.incrementToken()) {
                tokenCount++;
            }
            tokenizer.end();
        } catch (IOException e) {
            LOGGER.error("Error while tokenizing text. Returning.", e);
        } finally {
            // Close once, after the whole text has been consumed.
            IOUtils.closeQuietly(tokenizer);
        }
        return tokenCount;
    }

    /**
     * Loads the dataset for the given experiment type and prints one CSV row with
     * its statistics. Nothing is printed if the configuration yields no dataset.
     *
     * <p>The three averages are computed with floating-point division. For a
     * dataset without documents or without tokens the affected values are
     * therefore printed as {@code NaN} (or {@code Infinity}) instead of raising
     * an {@link ArithmeticException}.
     *
     * @param datasetConfiguration the configuration of the dataset to analyze
     * @param experimentType the experiment type used to load the dataset
     * @throws GerbilException if the dataset can not be loaded
     */
    private void analyze(DatasetConfiguration datasetConfiguration, ExperimentType experimentType) throws GerbilException {
        Dataset dataset = datasetConfiguration.getDataset(experimentType);
        if (dataset == null) {
            return;
        }
        this.output.print(datasetConfiguration.getName());
        this.output.print(',');
        List<Document> documents = dataset.getInstances();
        int markingSum = 0;
        int tokenSum = 0;
        for (Document document : documents) {
            markingSum += document.getMarkings().size();
            tokenSum += countTokensInText(document.getText());
        }
        int documentCount = documents.size();
        // Cast before dividing: integer division would truncate the averages (entities per token to 0 in most cases).
        this.output.print((double) markingSum / documentCount);
        this.output.print(',');
        this.output.print((double) markingSum / tokenSum);
        this.output.print(',');
        this.output.print((double) tokenSum / documentCount);
        this.output.print(',');
        this.output.print(documentCount);
        this.output.print(',');
        this.output.print(markingSum);
        this.output.print(',');
        this.output.println();
    }
}
