/*
 * Decompiled with CFR 0.152.
 */
package org.aksw.simba.tapioca.gen;

import java.io.BufferedOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.aksw.simba.tapioca.data.DatasetClassInfo;
import org.aksw.simba.tapioca.data.DatasetPropertyInfo;
import org.aksw.simba.tapioca.data.DatasetSpecialClassesInfo;
import org.aksw.simba.tapioca.data.DatasetVocabularies;
import org.aksw.simba.tapioca.data.SimpleTokenizedText;
import org.aksw.simba.tapioca.data.StringCountMapping;
import org.aksw.simba.tapioca.data.VocabularyBlacklist;
import org.aksw.simba.tapioca.preprocessing.SimpleBlankNodeRemovingDocumentSupplierDecorator;
import org.aksw.simba.tapioca.preprocessing.SimpleTokenizedTextTermFilter;
import org.aksw.simba.tapioca.preprocessing.SimpleWordIndexingSupplierDecorator;
import org.aksw.simba.tapioca.preprocessing.StringCountToSimpleTokenizedTextConvertingDocumentSupplierDecorator;
import org.aksw.simba.tapioca.preprocessing.UriCountMappingCreatingDocumentSupplierDecorator;
import org.aksw.simba.tapioca.preprocessing.UriFilteringDocumentSupplierDecorator;
import org.aksw.simba.tapioca.preprocessing.labelretrieving.FileBasedTokenizedLabelRetriever;
import org.aksw.simba.tapioca.preprocessing.labelretrieving.LODCatLabelServiceBasedRetriever;
import org.aksw.simba.tapioca.preprocessing.labelretrieving.MongoDBBasedTokenizedLabelRetriever;
import org.aksw.simba.tapioca.preprocessing.labelretrieving.TokenizedLabelRetriever;
import org.aksw.simba.tapioca.preprocessing.labelretrieving.WorkerBasedLabelRetrievingDocumentSupplierDecorator;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.dice_research.topicmodeling.io.CorpusWriter;
import org.dice_research.topicmodeling.io.gzip.GZipCorpusWriterDecorator;
import org.dice_research.topicmodeling.io.java.CorpusObjectWriter;
import org.dice_research.topicmodeling.io.xml.XmlWritingDocumentConsumer;
import org.dice_research.topicmodeling.io.xml.stream.StreamBasedXmlDocumentSupplier;
import org.dice_research.topicmodeling.lang.postagging.PosTaggingTermFilter;
import org.dice_research.topicmodeling.lang.postagging.StandardEnglishPosTaggingTermFilter;
import org.dice_research.topicmodeling.preprocessing.ListCorpusCreator;
import org.dice_research.topicmodeling.preprocessing.docconsumer.DocumentConsumer;
import org.dice_research.topicmodeling.preprocessing.docsupplier.DocumentSupplier;
import org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.DocumentConsumerAdaptingSupplierDecorator;
import org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.DocumentFilteringSupplierDecorator;
import org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.DocumentWordCountingSupplierDecorator;
import org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.PropertyRemovingSupplierDecorator;
import org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.filter.DocumentFilter;
import org.dice_research.topicmodeling.utils.corpus.Corpus;
import org.dice_research.topicmodeling.utils.corpus.DocumentListCorpus;
import org.dice_research.topicmodeling.utils.corpus.properties.CorpusProperty;
import org.dice_research.topicmodeling.utils.corpus.properties.CorpusVocabulary;
import org.dice_research.topicmodeling.utils.doc.Document;
import org.dice_research.topicmodeling.utils.doc.DocumentName;
import org.dice_research.topicmodeling.utils.doc.DocumentURI;
import org.dice_research.topicmodeling.utils.vocabulary.SimpleVocabulary;
import org.dice_research.topicmodeling.utils.vocabulary.Vocabulary;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Creates a corpus from an XML corpus file and writes it to an output file.
 *
 * <p>The class can be used programmatically (construct an instance and call
 * {@link #run(WorkerBasedLabelRetrievingDocumentSupplierDecorator)}) or from the command line via
 * {@link #main(String[])}. The processing pipeline is assembled from document supplier decorators:
 * the input is read, optionally filtered by a whitelist, URIs are filtered and counted, labels are
 * retrieved, the result is converted to tokenized text, filtered, indexed against a vocabulary and
 * finally collected into a {@link Corpus}.
 */
public class LDACorpusCreation {
    private static final Logger LOGGER = LoggerFactory.getLogger(LDACorpusCreation.class);

    /**
     * Hard-coded cache file locations.
     *
     * @deprecated not used by this class; cache files are passed via the {@code -c} option.
     */
    @Deprecated
    public static final File[] CACHE_FILES = new File[]{new File("C:/Daten/tapioca/cache/uriToLabelCache_1.object"), new File("C:/Daten/tapioca/cache/uriToLabelCache_2.object"), new File("C:/Daten/tapioca/cache/uriToLabelCache_3.object")};

    /**
     * Hard-coded corpus name.
     *
     * @deprecated not used by this class.
     */
    @Deprecated
    public static final String CORPUS_NAME = "lodDiagram";

    /**
     * Hard-coded corpus file location.
     *
     * @deprecated not used by this class; the input file is passed via the {@code -n} option.
     */
    @Deprecated
    public static final String CORPUS_FILE = "/Daten/tapioca/lodDiagram.corpus";

    /** Path of the XML corpus file that is read. */
    protected final String inputFile;
    /** Path of the file the generated corpus is written to. */
    protected final String outputFile;
    /** Selects which URIs (classes, properties or both) are counted. */
    protected final UriCountMappingCreatingDocumentSupplierDecorator.UriUsage uriUsage;
    /** Selects how URI counts are translated into word occurrences. */
    protected final StringCountToSimpleTokenizedTextConvertingDocumentSupplierDecorator.WordOccurence wordOccurence;
    /** If {@code true}, {@link #run} additionally writes the documents to {@code ./export.xml}. */
    protected final boolean exportCorpusAsXml;

    /**
     * Command line entry point.
     *
     * <p>Required options are {@code -n} (input file) and {@code -o} (output file). Invalid or
     * missing options are reported through the logger and the method returns without doing any
     * work. An unparsable MongoDB port or worker count results in an
     * {@link IllegalArgumentException}.
     *
     * @param args the command line arguments
     * @throws IOException if the corpus cannot be written
     */
    public static void main(String[] args) throws IOException {
        CommandLine cmd;
        try {
            cmd = new DefaultParser().parse(createOptions(), args);
        } catch (ParseException e) {
            LOGGER.error("Couldn't parse commands. Aborting.", e);
            return;
        }
        if (!cmd.hasOption("n")) {
            LOGGER.error("Input file is not defined. Please provide an input file.");
            return;
        }
        String inputFile = cmd.getOptionValue("n");
        if (!cmd.hasOption("o")) {
            LOGGER.error("Output file is not defined. Please provide an Output file.");
            return;
        }
        String outputFile = cmd.getOptionValue("o");

        // Both parse helpers log the problem themselves and signal it by returning null.
        StringCountToSimpleTokenizedTextConvertingDocumentSupplierDecorator.WordOccurence wordOccurence = parseWordOccurence(cmd);
        if (wordOccurence == null) {
            return;
        }
        UriCountMappingCreatingDocumentSupplierDecorator.UriUsage uriUsage = parseUriUsage(cmd);
        if (uriUsage == null) {
            return;
        }

        MongoDBBasedTokenizedLabelRetriever mongoRetriever = null;
        WorkerBasedLabelRetrievingDocumentSupplierDecorator cachingLabelRetriever = null;
        try {
            List<TokenizedLabelRetriever> retrievers = new ArrayList<TokenizedLabelRetriever>();
            if (cmd.hasOption("h") || cmd.hasOption("p")) {
                if (!(cmd.hasOption("h") && cmd.hasOption("p"))) {
                    LOGGER.error("If one of the options h or p is defined, the other option has to be defined as well.");
                    return;
                }
                mongoRetriever = MongoDBBasedTokenizedLabelRetriever.create(cmd.getOptionValue("h"), parseMongoDbPort(cmd));
                retrievers.add((TokenizedLabelRetriever) mongoRetriever);
            }
            if (cmd.hasOption("l")) {
                for (String file : cmd.getOptionValues("l")) {
                    initFileBasedRetriever(retrievers, file);
                }
            }
            if (cmd.hasOption("s")) {
                retrievers.add((TokenizedLabelRetriever) new LODCatLabelServiceBasedRetriever(cmd.getOptionValue("s")));
            }
            boolean useHttpClient = cmd.hasOption('y');
            File[] cacheFiles = toCacheFiles(cmd);
            int numberOfWorkers = parseNumberOfWorkers(cmd);

            // Entries may be null (they were filtered in the original code as well), so drop them
            // before handing the retrievers over.
            TokenizedLabelRetriever[] usableRetrievers = retrievers.stream().filter(r -> r != null).toArray(TokenizedLabelRetriever[]::new);
            if (numberOfWorkers > 0) {
                cachingLabelRetriever = new WorkerBasedLabelRetrievingDocumentSupplierDecorator(null, cacheFiles, usableRetrievers, numberOfWorkers, useHttpClient);
            } else {
                cachingLabelRetriever = new WorkerBasedLabelRetrievingDocumentSupplierDecorator(null, cacheFiles, usableRetrievers, useHttpClient);
            }
            System.out.println("Starting corpus \"" + inputFile + "\" with " + uriUsage + " and " + wordOccurence);
            // The -x flag used to be parsed but never forwarded, so XML export could not be enabled.
            LDACorpusCreation corpusCreation = new LDACorpusCreation(inputFile, uriUsage, wordOccurence, outputFile, cmd.hasOption("x"));
            corpusCreation.run(cachingLabelRetriever);
        } finally {
            // Closing is best effort: a failure must not hide an exception thrown above, but it
            // should still be visible in the log.
            if (mongoRetriever != null) {
                try {
                    mongoRetriever.close();
                } catch (Exception e) {
                    LOGGER.warn("Couldn't close the MongoDB based label retriever.", e);
                }
            }
            if (cachingLabelRetriever != null) {
                try {
                    cachingLabelRetriever.close();
                } catch (Exception e) {
                    LOGGER.warn("Couldn't close the caching label retriever.", e);
                }
            }
        }
    }

    /**
     * Builds the command line options understood by {@link #main(String[])}.
     *
     * @return the option definitions
     */
    private static Options createOptions() {
        Options options = new Options();
        options.addOption("c", "cache-file", true, "a cache file that can be used to cache labels retrieved via HTTP. Only used in combination with -y.");
        // The help text states the default that the code really applies (UNIQUE).
        options.addOption("f", "word-frequency", true, "either \"u\" for unique or \"l\" for log. \"u\" is default.");
        options.addOption("h", "mongo-db-host", true, "the host name of a MongoDB instance containing URI to label mappings");
        options.addOption("l", "label-file", true, "a label file that should be used to retrieve labels");
        options.addOption("n", "input-file", true, "the input corpus file");
        options.addOption("o", "output-file", true, "the output corpus file");
        options.addOption("p", "mongo-db-port", true, "the port of a MongoDB instance containing URI to label mappings");
        options.addOption("s", "label-service", true, "the URL of a label retrieval service");
        options.addOption("u", "uri-type", true, "either \"c\" for classes, \"p\" for properties or \"a\" for all. \"p\" is default.");
        options.addOption("w", "workers", true, "number of workers used for retrieving labels");
        options.addOption("x", "export-xml", false, "export the corpus as XML");
        options.addOption("y", "http-client", false, "if set, labels for URIs are retrieved via HTTP. Note that this make take a lot of time!");
        return options;
    }

    /**
     * Reads the {@code -f} option.
     *
     * @param cmd the parsed command line
     * @return {@code UNIQUE} if the option is absent, the selected value if it is {@code "l"} or
     *         {@code "u"}, or {@code null} (after logging an error) for any other value
     */
    private static StringCountToSimpleTokenizedTextConvertingDocumentSupplierDecorator.WordOccurence parseWordOccurence(CommandLine cmd) {
        if (!cmd.hasOption("f")) {
            return StringCountToSimpleTokenizedTextConvertingDocumentSupplierDecorator.WordOccurence.UNIQUE;
        }
        String value = cmd.getOptionValue("f");
        switch (value) {
            case "l":
                return StringCountToSimpleTokenizedTextConvertingDocumentSupplierDecorator.WordOccurence.LOG;
            case "u":
                return StringCountToSimpleTokenizedTextConvertingDocumentSupplierDecorator.WordOccurence.UNIQUE;
            default:
                LOGGER.error("Got an unkown value for the word-frequency: \"" + value + "\"");
                return null;
        }
    }

    /**
     * Reads the {@code -u} option.
     *
     * @param cmd the parsed command line
     * @return {@code PROPERTIES} if the option is absent, the selected value if it is
     *         {@code "a"}, {@code "c"} or {@code "p"}, or {@code null} (after logging an error)
     *         for any other value
     */
    private static UriCountMappingCreatingDocumentSupplierDecorator.UriUsage parseUriUsage(CommandLine cmd) {
        if (!cmd.hasOption("u")) {
            return UriCountMappingCreatingDocumentSupplierDecorator.UriUsage.PROPERTIES;
        }
        String value = cmd.getOptionValue("u");
        switch (value) {
            case "a":
                return UriCountMappingCreatingDocumentSupplierDecorator.UriUsage.CLASSES_AND_PROPERTIES;
            case "c":
                return UriCountMappingCreatingDocumentSupplierDecorator.UriUsage.CLASSES;
            case "p":
                return UriCountMappingCreatingDocumentSupplierDecorator.UriUsage.PROPERTIES;
            default:
                LOGGER.error("Got an unkown value for the uri-type: \"" + value + "\"");
                return null;
        }
    }

    /**
     * Reads the {@code -p} option as an integer.
     *
     * @param cmd the parsed command line; must contain the {@code -p} option
     * @return the port number
     * @throws IllegalArgumentException if the value is not an integer
     */
    private static int parseMongoDbPort(CommandLine cmd) {
        String value = cmd.getOptionValue("p");
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException("Couldn't parse given MongoDB port \"" + value + "\".", e);
        }
    }

    /**
     * Reads the {@code -w} option.
     *
     * @param cmd the parsed command line
     * @return the number of workers, or {@code -1} if the option is absent
     * @throws IllegalArgumentException if the value is not an integer or is smaller than 1
     */
    private static int parseNumberOfWorkers(CommandLine cmd) {
        if (!cmd.hasOption('w')) {
            return -1;
        }
        int numberOfWorkers;
        try {
            numberOfWorkers = Integer.parseInt(cmd.getOptionValue('w'));
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException("Couldn't parse given number of workers.", e);
        }
        if (numberOfWorkers < 1) {
            throw new IllegalArgumentException("\"" + numberOfWorkers + "\" is not a valid number of workers. The numbers is expected to be >= 1.");
        }
        return numberOfWorkers;
    }

    /**
     * Turns the values of the {@code -c} option into files.
     *
     * @param cmd the parsed command line
     * @return one file per given value; an empty array if the option is absent
     */
    private static File[] toCacheFiles(CommandLine cmd) {
        if (!cmd.hasOption("c")) {
            return new File[]{};
        }
        String[] fileNames = cmd.getOptionValues("c");
        File[] cacheFiles = new File[fileNames.length];
        for (int i = 0; i < fileNames.length; ++i) {
            cacheFiles[i] = new File(fileNames[i]);
        }
        return cacheFiles;
    }

    /**
     * Adds a file based label retriever for the given path. If the path is a directory, it is
     * traversed recursively and a retriever is added for every contained file.
     *
     * @param retrievers the list the created retrievers are appended to
     * @param file path of a label file or of a directory containing label files
     */
    private static void initFileBasedRetriever(List<TokenizedLabelRetriever> retrievers, String file) {
        File f = new File(file);
        if (!f.isDirectory()) {
            retrievers.add((TokenizedLabelRetriever) FileBasedTokenizedLabelRetriever.create(file));
            return;
        }
        File[] children = f.listFiles();
        if (children == null) {
            // File.listFiles() returns null if the directory cannot be read.
            LOGGER.warn("Couldn't list the content of directory \"{}\". It will be ignored.", f);
            return;
        }
        for (File child : children) {
            initFileBasedRetriever(retrievers, child.getAbsolutePath());
        }
    }

    /**
     * Creates an instance that does not export the corpus as XML.
     *
     * @param inputFile path of the XML corpus file to read
     * @param uriUsage which URIs are counted
     * @param wordOccurence how counts are translated into word occurrences
     * @param outputFile path of the file the corpus is written to
     */
    public LDACorpusCreation(String inputFile, UriCountMappingCreatingDocumentSupplierDecorator.UriUsage uriUsage, StringCountToSimpleTokenizedTextConvertingDocumentSupplierDecorator.WordOccurence wordOccurence, String outputFile) {
        this(inputFile, uriUsage, wordOccurence, outputFile, false);
    }

    /**
     * Creates an instance.
     *
     * @param inputFile path of the XML corpus file to read
     * @param uriUsage which URIs are counted
     * @param wordOccurence how counts are translated into word occurrences
     * @param outputFile path of the file the corpus is written to
     * @param exportCorpusAsXml if {@code true}, {@link #run} additionally writes the documents to
     *        {@code ./export.xml}
     */
    public LDACorpusCreation(String inputFile, UriCountMappingCreatingDocumentSupplierDecorator.UriUsage uriUsage, StringCountToSimpleTokenizedTextConvertingDocumentSupplierDecorator.WordOccurence wordOccurence, String outputFile, boolean exportCorpusAsXml) {
        this.inputFile = inputFile;
        this.outputFile = outputFile;
        this.uriUsage = uriUsage;
        this.wordOccurence = wordOccurence;
        this.exportCorpusAsXml = exportCorpusAsXml;
    }

    /**
     * Generates the corpus, stores the label cache and writes the corpus to the output file.
     *
     * @param cachingLabelRetriever the label retrieving decorator that is inserted into the
     *        pipeline; its cache is stored after the corpus has been generated
     * @throws IOException if the output file cannot be written
     */
    public void run(WorkerBasedLabelRetrievingDocumentSupplierDecorator cachingLabelRetriever) throws IOException {
        XmlWritingDocumentConsumer consumer = null;
        if (this.exportCorpusAsXml) {
            consumer = XmlWritingDocumentConsumer.createXmlWritingDocumentConsumer(new File("./export.xml"));
        }
        Corpus corpus;
        try {
            corpus = this.generateCorpusAndIndexWords(cachingLabelRetriever, consumer);
            cachingLabelRetriever.storeCache();
        } finally {
            // Close the XML export even if the corpus generation failed.
            if (consumer != null) {
                IOUtils.closeQuietly((Closeable) consumer);
            }
        }
        try (OutputStream out = new BufferedOutputStream(new FileOutputStream(new File(this.outputFile)))) {
            GZipCorpusWriterDecorator writer = new GZipCorpusWriterDecorator((CorpusWriter) new CorpusObjectWriter());
            writer.writeCorpus(corpus, out);
        }
    }

    /**
     * Creates the supplier that reads the documents of the input file and logs the start of the
     * processing of every document.
     *
     * @return the document supplier at the head of the pipeline
     */
    protected DocumentSupplier readCorpus() {
        DocumentSupplier supplier = StreamBasedXmlDocumentSupplier.createReader(new File(this.inputFile), true);
        StreamBasedXmlDocumentSupplier.registerParseableDocumentProperty(DatasetClassInfo.class);
        StreamBasedXmlDocumentSupplier.registerParseableDocumentProperty(DatasetSpecialClassesInfo.class);
        StreamBasedXmlDocumentSupplier.registerParseableDocumentProperty(DatasetPropertyInfo.class);
        StreamBasedXmlDocumentSupplier.registerParseableDocumentProperty(DatasetVocabularies.class);
        // This "filter" accepts every document; it is only used to log the progress.
        return new DocumentFilteringSupplierDecorator(supplier, new DocumentFilter() {

            @Override
            public boolean isDocumentGood(Document document) {
                DocumentName name = (DocumentName) document.getProperty(DocumentName.class);
                DocumentURI uri = (DocumentURI) document.getProperty(DocumentURI.class);
                LOGGER.info("Processing of {} ({}) starts", name != null ? name.get() : "null", uri != null ? uri.get() : "null");
                return true;
            }
        });
    }

    /**
     * Adds a whitelist filter if a whitelist file exists for the input file.
     *
     * <p>The whitelist file name is derived from the input file name by replacing
     * {@code ".corpus"} with {@code "_whitelist.txt"}. If the file exists, only documents are kept
     * whose name (without a trailing {@code ".ttl"}) or URI is one of its lines; documents without
     * a name are rejected. If the file is missing or cannot be read, the given supplier is
     * returned unchanged.
     *
     * @param supplier the supplier to decorate
     * @return the decorated supplier, or {@code supplier} itself if no whitelist is used
     */
    protected DocumentSupplier useWhiteListFilter(DocumentSupplier supplier) {
        if (!this.inputFile.contains(".corpus")) {
            LOGGER.info("No whitelistfile given");
            return supplier;
        }
        File whitelistFile = new File(this.inputFile.replace(".corpus", "_whitelist.txt"));
        if (!whitelistFile.exists()) {
            LOGGER.info("Can't use whitelistfile \"{}\".", whitelistFile);
            return supplier;
        }
        try {
            final Set<String> whitelist = new HashSet<String>(FileUtils.readLines(whitelistFile));
            DocumentSupplier filtered = new DocumentFilteringSupplierDecorator(supplier, new DocumentFilter() {

                @Override
                public boolean isDocumentGood(Document document) {
                    DocumentName docName = (DocumentName) document.getProperty(DocumentName.class);
                    if (docName == null) {
                        return false;
                    }
                    String name = (String) docName.get();
                    if (name.endsWith(".ttl")) {
                        name = name.substring(0, name.length() - 4);
                    }
                    DocumentURI uri = (DocumentURI) document.getProperty(DocumentURI.class);
                    return whitelist.contains(name) || uri != null && whitelist.contains(uri.get());
                }
            });
            LOGGER.info("Using whitelistfile \"{}\".", whitelistFile);
            return filtered;
        } catch (IOException e) {
            LOGGER.error("Error while reading whitelist \"" + whitelistFile + "\".", e);
            return supplier;
        }
    }

    /**
     * Appends URI filtering, URI counting, label retrieval and the conversion to tokenized text to
     * the pipeline.
     *
     * @param supplier the supplier to decorate
     * @param cachingLabelRetriever the label retrieving decorator; its decorated supplier is set
     *        by this method
     * @return the decorated supplier
     */
    protected DocumentSupplier generateDocuments(DocumentSupplier supplier, WorkerBasedLabelRetrievingDocumentSupplierDecorator cachingLabelRetriever) {
        DocumentSupplier filtered = this.filterUris(supplier);
        DocumentSupplier counted = new UriCountMappingCreatingDocumentSupplierDecorator(filtered, this.uriUsage);
        // The retriever was created without a decorated supplier, so it is attached here.
        cachingLabelRetriever.setDecoratedDocumentSupplier(counted);
        return new StringCountToSimpleTokenizedTextConvertingDocumentSupplierDecorator(cachingLabelRetriever, this.wordOccurence);
    }

    /**
     * Appends the blacklist based URI filters and the blank node removing decorators for the
     * class, property and special class information to the pipeline.
     *
     * @param supplier the supplier to decorate
     * @return the decorated supplier
     */
    protected DocumentSupplier filterUris(DocumentSupplier supplier) {
        // NOTE(review): the element type of the blacklist is not visible in this file, so the
        // raw type of the decompiled code is kept.
        Set blacklist = VocabularyBlacklist.getInstance();
        supplier = new UriFilteringDocumentSupplierDecorator(supplier, blacklist, DatasetClassInfo.class);
        supplier = new SimpleBlankNodeRemovingDocumentSupplierDecorator(supplier, DatasetClassInfo.class);
        supplier = new UriFilteringDocumentSupplierDecorator(supplier, blacklist, DatasetPropertyInfo.class);
        supplier = new SimpleBlankNodeRemovingDocumentSupplierDecorator(supplier, DatasetPropertyInfo.class);
        supplier = new SimpleBlankNodeRemovingDocumentSupplierDecorator(supplier, DatasetSpecialClassesInfo.class);
        return supplier;
    }

    /**
     * Appends a term filter and a filter that removes documents without tokens to the pipeline.
     * The decision for every document is logged.
     *
     * @param supplier the supplier to decorate
     * @return the decorated supplier
     */
    protected DocumentSupplier filterStopWordsAndEmptyDocs(DocumentSupplier supplier) {
        DocumentSupplier termFiltered = new SimpleTokenizedTextTermFilter(supplier, (PosTaggingTermFilter) StandardEnglishPosTaggingTermFilter.getInstance());
        return new DocumentFilteringSupplierDecorator(termFiltered, new DocumentFilter() {

            @Override
            public boolean isDocumentGood(Document document) {
                SimpleTokenizedText text = (SimpleTokenizedText) document.getProperty(SimpleTokenizedText.class);
                DocumentName name = (DocumentName) document.getProperty(DocumentName.class);
                DocumentURI uri = (DocumentURI) document.getProperty(DocumentURI.class);
                if (text != null && text.getTokens().length > 0) {
                    LOGGER.info("{} ({}) is accepted as part of the corpus", name != null ? name.get() : "null", uri != null ? uri.get() : "null");
                    return true;
                }
                LOGGER.info("{} ({}) is sorted out and won't be part of the corpus", name != null ? name.get() : "null", uri != null ? uri.get() : "null");
                return false;
            }
        });
    }

    /**
     * Generates the corpus without passing the documents to an additional consumer.
     *
     * @param cachingLabelRetriever the label retrieving decorator used in the pipeline
     * @return the generated corpus including its vocabulary
     */
    public Corpus generateCorpusAndIndexWords(WorkerBasedLabelRetrievingDocumentSupplierDecorator cachingLabelRetriever) {
        return this.generateCorpusAndIndexWords(cachingLabelRetriever, null);
    }

    /**
     * Generates the corpus, indexes its words against a new vocabulary and counts the words of
     * every document. Intermediate document properties are removed before the documents are
     * collected.
     *
     * @param cachingLabelRetriever the label retrieving decorator used in the pipeline
     * @param consumer an optional consumer that receives every document before the intermediate
     *        properties are removed; may be {@code null}
     * @return the generated corpus with the vocabulary attached as a corpus property
     */
    public Corpus generateCorpusAndIndexWords(WorkerBasedLabelRetrievingDocumentSupplierDecorator cachingLabelRetriever, XmlWritingDocumentConsumer consumer) {
        DocumentSupplier supplier = this.generateCorpus(cachingLabelRetriever);
        SimpleVocabulary vocabulary = new SimpleVocabulary();
        supplier = new SimpleWordIndexingSupplierDecorator(supplier, (Vocabulary) vocabulary);
        supplier = new DocumentWordCountingSupplierDecorator(supplier);
        if (consumer != null) {
            supplier = new DocumentConsumerAdaptingSupplierDecorator(supplier, (DocumentConsumer) consumer);
        }
        // NOTE(review): the generic types expected by the constructors below are not visible in
        // this file, so the erased types of the decompiled code are kept.
        ArrayList<Class> propertiesToRemove = new ArrayList<Class>();
        propertiesToRemove.add(DatasetVocabularies.class);
        propertiesToRemove.add(DatasetPropertyInfo.class);
        propertiesToRemove.add(DatasetSpecialClassesInfo.class);
        propertiesToRemove.add(DatasetClassInfo.class);
        propertiesToRemove.add(StringCountMapping.class);
        propertiesToRemove.add(SimpleTokenizedText.class);
        supplier = new PropertyRemovingSupplierDecorator(supplier, propertiesToRemove);
        ListCorpusCreator preprocessor = new ListCorpusCreator(supplier, new DocumentListCorpus(new ArrayList()));
        Corpus corpus = preprocessor.getCorpus();
        corpus.addProperty((CorpusProperty) new CorpusVocabulary((Vocabulary) vocabulary));
        return corpus;
    }

    /**
     * Assembles the pipeline up to and including the removal of empty documents: reading,
     * whitelist filtering, document generation and stop word filtering.
     *
     * @param cachingLabelRetriever the label retrieving decorator used in the pipeline
     * @return the supplier at the end of the assembled pipeline
     */
    public DocumentSupplier generateCorpus(WorkerBasedLabelRetrievingDocumentSupplierDecorator cachingLabelRetriever) {
        DocumentSupplier supplier = this.readCorpus();
        supplier = this.useWhiteListFilter(supplier);
        supplier = this.generateDocuments(supplier, cachingLabelRetriever);
        return this.filterStopWordsAndEmptyDocs(supplier);
    }
}

