/*
 * Decompiled with CFR 0.152.
 */
package org.aksw.simba.tapioca.gen;

import com.carrotsearch.hppc.ObjectOpenHashSet;
import java.io.File;
import java.util.ArrayList;
import org.aksw.simba.tapioca.data.DatasetClassInfo;
import org.aksw.simba.tapioca.data.DatasetPropertyInfo;
import org.aksw.simba.tapioca.data.DatasetSpecialClassesInfo;
import org.aksw.simba.tapioca.data.DatasetVocabularies;
import org.aksw.simba.tapioca.data.SimpleTokenizedText;
import org.aksw.simba.tapioca.data.StringCountMapping;
import org.aksw.simba.tapioca.gen.MetaDataInformationCollector;
import org.aksw.simba.tapioca.gen.data.DatasetURIs;
import org.aksw.simba.tapioca.gen.preprocessing.DatasetURIsSummarizingSupplierDecorator;
import org.aksw.simba.tapioca.preprocessing.UriCountMappingCreatingDocumentSupplierDecorator;
import org.dice_research.topicmodeling.io.gzip.GZipCorpusObjectWriter;
import org.dice_research.topicmodeling.io.xml.stream.StreamBasedXmlDocumentSupplier;
import org.dice_research.topicmodeling.preprocessing.ListCorpusCreator;
import org.dice_research.topicmodeling.preprocessing.docsupplier.DocumentSupplier;
import org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.DocumentFilteringSupplierDecorator;
import org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.PropertyRemovingSupplierDecorator;
import org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.filter.DocumentFilter;
import org.dice_research.topicmodeling.utils.corpus.DocumentListCorpus;
import org.dice_research.topicmodeling.utils.doc.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class URIBasedIndexGenerator {
    private static final Logger LOGGER = LoggerFactory.getLogger(URIBasedIndexGenerator.class);
    public static final String BL_CORPUS_FILE = "/Daten/tapioca/lodStats_BL.object";
    public static final String FINAL_CORPUS_FILE = "lodStats_BL_final.corpus";

    public static void main(String[] args) {
        URIBasedIndexGenerator generator = new URIBasedIndexGenerator();
        generator.run();
    }

    public void run() {
        File datasetDescriptionsFile;
        File outputFolder = new File("/Daten/tapioca/lodStats_model");
        if (!outputFolder.exists()) {
            outputFolder.mkdirs();
        }
        if ((datasetDescriptionsFile = new File("/Daten/tapioca/lodStats_model" + File.separator + FINAL_CORPUS_FILE)).exists()) {
            LOGGER.info("The final corpus file is already existing.");
        } else {
            this.generateFinalCorpusFile();
        }
    }

    protected void generateFinalCorpusFile() {
        if (this.checkBLCorpusExistence()) {
            MetaDataInformationCollector collector = new MetaDataInformationCollector();
            LOGGER.info("Generating final corpus file...");
            collector.run("/Daten/tapioca/lodStats/datasets.nt", BL_CORPUS_FILE, "/Daten/tapioca/lodStats/statresult.nt", "/Daten/tapioca/lodStats_model" + File.separator + FINAL_CORPUS_FILE, "/Daten/tapioca/lodStats_model" + File.separator + "lodstats.nt");
        }
    }

    protected boolean checkBLCorpusExistence() {
        File blCorpusFile = new File(BL_CORPUS_FILE);
        if (!blCorpusFile.exists()) {
            LOGGER.warn("The BL corpus file is not existing. Trying to generate it...");
            this.generateBLCorpusFile();
            if (!blCorpusFile.exists()) {
                LOGGER.error("The BL corpus file is not existing and couldn't be generated.");
                return false;
            }
        }
        return true;
    }

    public static DocumentSupplier createBLPreprocessing(File inputFile) {
        Object supplier = StreamBasedXmlDocumentSupplier.createReader((File)inputFile, (boolean)true);
        StreamBasedXmlDocumentSupplier.registerParseableDocumentProperty(DatasetClassInfo.class);
        StreamBasedXmlDocumentSupplier.registerParseableDocumentProperty(DatasetSpecialClassesInfo.class);
        StreamBasedXmlDocumentSupplier.registerParseableDocumentProperty(DatasetPropertyInfo.class);
        StreamBasedXmlDocumentSupplier.registerParseableDocumentProperty(DatasetVocabularies.class);
        supplier = new UriCountMappingCreatingDocumentSupplierDecorator((DocumentSupplier)supplier, UriCountMappingCreatingDocumentSupplierDecorator.UriUsage.CLASSES_AND_PROPERTIES);
        supplier = new DatasetURIsSummarizingSupplierDecorator((DocumentSupplier)supplier);
        supplier = new DocumentFilteringSupplierDecorator((DocumentSupplier)supplier, new DocumentFilter(){

            public boolean isDocumentGood(Document document) {
                DatasetURIs uris = (DatasetURIs)document.getProperty(DatasetURIs.class);
                return uris != null && ((ObjectOpenHashSet)uris.get()).size() > 0;
            }
        });
        return supplier;
    }

    protected void generateBLCorpusFile() {
        DocumentSupplier supplier = URIBasedIndexGenerator.createBLPreprocessing(new File("/Daten/tapioca/lodStats.corpus"));
        ArrayList<Class> propertiesToRemove = new ArrayList<Class>();
        propertiesToRemove.add(DatasetVocabularies.class);
        propertiesToRemove.add(DatasetPropertyInfo.class);
        propertiesToRemove.add(DatasetSpecialClassesInfo.class);
        propertiesToRemove.add(DatasetClassInfo.class);
        propertiesToRemove.add(StringCountMapping.class);
        propertiesToRemove.add(SimpleTokenizedText.class);
        supplier = new PropertyRemovingSupplierDecorator(supplier, propertiesToRemove);
        ListCorpusCreator preprocessor = new ListCorpusCreator(supplier, new DocumentListCorpus(new ArrayList()));
        GZipCorpusObjectWriter writer = new GZipCorpusObjectWriter(new File(BL_CORPUS_FILE));
        writer.writeCorpus(preprocessor.getCorpus());
    }
}

