/*
 * Decompiled with CFR 0.152.
 */
package org.aksw.simba.tapioca.gen;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Set;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.aksw.simba.tapioca.data.DatasetClassInfo;
import org.aksw.simba.tapioca.data.DatasetPropertyInfo;
import org.aksw.simba.tapioca.data.DatasetSpecialClassesInfo;
import org.aksw.simba.tapioca.data.DatasetVocabularies;
import org.aksw.simba.tapioca.data.SimpleTokenizedText;
import org.aksw.simba.tapioca.data.StringCountMapping;
import org.aksw.simba.tapioca.data.VocabularyBlacklist;
import org.aksw.simba.tapioca.preprocessing.SimpleBlankNodeRemovingDocumentSupplierDecorator;
import org.aksw.simba.tapioca.preprocessing.SimpleTokenizedTextTermFilter;
import org.aksw.simba.tapioca.preprocessing.SimpleWordIndexingSupplierDecorator;
import org.aksw.simba.tapioca.preprocessing.StringCountToSimpleTokenizedTextConvertingDocumentSupplierDecorator;
import org.aksw.simba.tapioca.preprocessing.UriCountMappingCreatingDocumentSupplierDecorator;
import org.aksw.simba.tapioca.preprocessing.UriFilteringDocumentSupplierDecorator;
import org.aksw.simba.tapioca.preprocessing.labelretrieving.FileBasedTokenizedLabelRetriever;
import org.aksw.simba.tapioca.preprocessing.labelretrieving.MongoDBBasedTokenizedLabelRetriever;
import org.aksw.simba.tapioca.preprocessing.labelretrieving.TokenizedLabelRetriever;
import org.aksw.simba.tapioca.preprocessing.labelretrieving.WorkerBasedLabelRetrievingDocumentSupplierDecorator;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.dice_research.topicmodeling.io.CorpusWriter;
import org.dice_research.topicmodeling.io.gzip.GZipCorpusWriterDecorator;
import org.dice_research.topicmodeling.io.java.CorpusObjectWriter;
import org.dice_research.topicmodeling.io.xml.XmlWritingDocumentConsumer;
import org.dice_research.topicmodeling.io.xml.stream.StreamBasedXmlDocumentSupplier;
import org.dice_research.topicmodeling.lang.postagging.PosTaggingTermFilter;
import org.dice_research.topicmodeling.lang.postagging.StandardEnglishPosTaggingTermFilter;
import org.dice_research.topicmodeling.preprocessing.docconsumer.DocumentConsumer;
import org.dice_research.topicmodeling.preprocessing.docsupplier.DocumentSupplier;
import org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.DocumentConsumerAdaptingSupplierDecorator;
import org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.DocumentFilteringSupplierDecorator;
import org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.DocumentWordCountingSupplierDecorator;
import org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.PropertyRemovingSupplierDecorator;
import org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.filter.DocumentFilter;
import org.dice_research.topicmodeling.utils.corpus.Corpus;
import org.dice_research.topicmodeling.utils.corpus.DocumentListCorpus;
import org.dice_research.topicmodeling.utils.corpus.properties.CorpusProperty;
import org.dice_research.topicmodeling.utils.corpus.properties.CorpusVocabulary;
import org.dice_research.topicmodeling.utils.doc.Document;
import org.dice_research.topicmodeling.utils.doc.DocumentName;
import org.dice_research.topicmodeling.utils.doc.DocumentURI;
import org.dice_research.topicmodeling.utils.vocabulary.SimpleVocabulary;
import org.dice_research.topicmodeling.utils.vocabulary.Vocabulary;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Command line tool that reads an XML corpus file, runs every document through
 * the URI filtering / label retrieving / tokenizing pipeline defined in this
 * class, indexes the resulting words in a vocabulary and writes the corpus to
 * the given output file using a {@link GZipCorpusWriterDecorator} wrapped
 * around a {@link CorpusObjectWriter}.
 *
 * <p>The pipeline stages are exposed as protected methods so that subclasses
 * can replace single steps.</p>
 */
public class StreamingLDACorpusCreation {
    private static final Logger LOGGER = LoggerFactory.getLogger(StreamingLDACorpusCreation.class);

    /** Hard coded label cache files. Not referenced by any code inside this class. */
    public static final File[] CACHE_FILES = new File[]{new File("C:/Daten/tapioca/cache/uriToLabelCache_1.object"), new File("C:/Daten/tapioca/cache/uriToLabelCache_2.object"), new File("C:/Daten/tapioca/cache/uriToLabelCache_3.object")};

    /** Not used inside this class; the corpus files are given via the constructor instead. */
    @Deprecated
    public static final String CORPUS_NAME = "lodDiagram";

    /** Not used inside this class; the corpus files are given via the constructor instead. */
    @Deprecated
    public static final String CORPUS_FILE = "/Daten/tapioca/lodDiagram.corpus";

    /** Path of the XML corpus file that is read by {@link #readCorpus()}. */
    protected final String inputFile;
    /** Path of the file the generated corpus is written to by {@link #run}. */
    protected final String outputFile;
    /** Handed to the {@link UriCountMappingCreatingDocumentSupplierDecorator} in {@link #generateDocuments}. */
    protected final UriCountMappingCreatingDocumentSupplierDecorator.UriUsage uriUsage;
    /** Handed to the {@link StringCountToSimpleTokenizedTextConvertingDocumentSupplierDecorator} in {@link #generateDocuments}. */
    protected final StringCountToSimpleTokenizedTextConvertingDocumentSupplierDecorator.WordOccurence wordOccurence;
    /** If {@code true}, {@link #run} additionally writes the documents to {@code ./export.xml}. */
    protected final boolean exportCorpusAsXml;

    /**
     * Parses the command line, sets up the label retrievers and runs the corpus
     * creation. Every configuration problem is logged and ends the method
     * without creating a corpus.
     *
     * @param args command line arguments; {@code -n} (input file) and
     *             {@code -o} (output file) are mandatory, {@code -h} and
     *             {@code -p} have to be given together
     */
    public static void main(String[] args) {
        Options options = new Options();
        options.addOption("n", "input-file", true, "the input corpus file");
        options.addOption("o", "output-file", true, "the output corpus file");
        options.addOption("l", "label-file", true, "a label file that should be used to retrieve labels");
        options.addOption("c", "cache-file", true, "a cache file that can be used to cache labels retrieved via HTTP");
        options.addOption("h", "mongo-db-host", true, "the host name of a MongoDB instance containing URI to label mappings");
        options.addOption("p", "mongo-db-port", true, "the port of a MongoDB instance containing URI to label mappings");
        options.addOption("f", "fast", false, "the document processing is done in parallel");
        options.addOption("x", "export-xml", false, "export the corpus as XML");

        CommandLine cmd;
        try {
            cmd = new DefaultParser().parse(options, args);
        } catch (ParseException e) {
            LOGGER.error("Couldn't parse commands. Aborting.", e);
            return;
        }
        if (!cmd.hasOption("n")) {
            LOGGER.error("Input file is not defined. Please provide an input file.");
            return;
        }
        String inputFile = cmd.getOptionValue("n");
        if (!cmd.hasOption("o")) {
            LOGGER.error("Output file is not defined. Please provide an Output file.");
            return;
        }
        String outputFile = cmd.getOptionValue("o");

        // Host and port are only useful as a pair.
        boolean useMongoDb = cmd.hasOption("h");
        if (useMongoDb != cmd.hasOption("p")) {
            LOGGER.error("If one of the options h or p is defined, the other option has to be defined as well.");
            return;
        }
        int mongoDbPort = -1;
        if (useMongoDb) {
            String portValue = cmd.getOptionValue("p");
            try {
                mongoDbPort = Integer.parseInt(portValue);
            } catch (NumberFormatException e) {
                // Report a malformed port like every other configuration error
                // instead of letting the exception escape from main.
                LOGGER.error("The MongoDB port \"" + portValue + "\" is not a valid number. Aborting.", e);
                return;
            }
        }
        // The flags were declared above; "x" has to be read here, otherwise the
        // export option would have no effect.
        boolean exportCorpusAsXml = cmd.hasOption("x");
        boolean isParallel = cmd.hasOption("f");

        UriCountMappingCreatingDocumentSupplierDecorator.UriUsage[] uriUsages = new UriCountMappingCreatingDocumentSupplierDecorator.UriUsage[]{UriCountMappingCreatingDocumentSupplierDecorator.UriUsage.CLASSES_AND_PROPERTIES};
        StringCountToSimpleTokenizedTextConvertingDocumentSupplierDecorator.WordOccurence[] wordOccurences = new StringCountToSimpleTokenizedTextConvertingDocumentSupplierDecorator.WordOccurence[]{StringCountToSimpleTokenizedTextConvertingDocumentSupplierDecorator.WordOccurence.LOG};

        MongoDBBasedTokenizedLabelRetriever mongoRetriever = null;
        WorkerBasedLabelRetrievingDocumentSupplierDecorator cachingLabelRetriever = null;
        try {
            // The list is typed by the common interface: it holds the MongoDB
            // based retriever as well as the file based ones.
            ArrayList<TokenizedLabelRetriever> retrievers = new ArrayList<TokenizedLabelRetriever>();
            if (useMongoDb) {
                mongoRetriever = MongoDBBasedTokenizedLabelRetriever.create(cmd.getOptionValue("h"), mongoDbPort);
                retrievers.add(mongoRetriever);
            }
            if (cmd.hasOption("l")) {
                for (String labelFile : cmd.getOptionValues("l")) {
                    retrievers.add(FileBasedTokenizedLabelRetriever.create(labelFile));
                }
            }

            File[] cacheFiles;
            if (cmd.hasOption("c")) {
                String[] fileNames = cmd.getOptionValues("c");
                cacheFiles = new File[fileNames.length];
                for (int i = 0; i < fileNames.length; ++i) {
                    cacheFiles[i] = new File(fileNames[i]);
                }
            } else {
                cacheFiles = new File[]{};
            }

            // Retrievers that could not be created (null) are left out.
            TokenizedLabelRetriever[] usableRetrievers = retrievers.stream().filter(r -> r != null).toArray(TokenizedLabelRetriever[]::new);
            cachingLabelRetriever = new WorkerBasedLabelRetrievingDocumentSupplierDecorator(null, cacheFiles, usableRetrievers);

            for (UriCountMappingCreatingDocumentSupplierDecorator.UriUsage uriUsage : uriUsages) {
                for (StringCountToSimpleTokenizedTextConvertingDocumentSupplierDecorator.WordOccurence wordOccurence : wordOccurences) {
                    System.out.println("Starting corpus \"" + inputFile + "\" with " + uriUsage + " and " + wordOccurence);
                    StreamingLDACorpusCreation corpusCreation = new StreamingLDACorpusCreation(inputFile, uriUsage, wordOccurence, outputFile, exportCorpusAsXml);
                    corpusCreation.run(cachingLabelRetriever, isParallel);
                }
            }
        } finally {
            // Closing is best effort, but a failure should at least be visible.
            if (mongoRetriever != null) {
                try {
                    mongoRetriever.close();
                } catch (Exception e) {
                    LOGGER.warn("Couldn't close the MongoDB based label retriever.", e);
                }
            }
            if (cachingLabelRetriever != null) {
                try {
                    cachingLabelRetriever.close();
                } catch (Exception e) {
                    LOGGER.warn("Couldn't close the caching label retriever.", e);
                }
            }
        }
    }

    /**
     * Creates a corpus creation that does not export the corpus as XML.
     *
     * @param inputFile     path of the XML corpus file to read
     * @param uriUsage      URI usage handed to the URI count mapping step
     * @param wordOccurence word occurrence mode handed to the tokenized text conversion step
     * @param outputFile    path of the file the corpus is written to
     */
    public StreamingLDACorpusCreation(String inputFile, UriCountMappingCreatingDocumentSupplierDecorator.UriUsage uriUsage, StringCountToSimpleTokenizedTextConvertingDocumentSupplierDecorator.WordOccurence wordOccurence, String outputFile) {
        this(inputFile, uriUsage, wordOccurence, outputFile, false);
    }

    /**
     * Creates a corpus creation.
     *
     * @param inputFile         path of the XML corpus file to read
     * @param uriUsage          URI usage handed to the URI count mapping step
     * @param wordOccurence     word occurrence mode handed to the tokenized text conversion step
     * @param outputFile        path of the file the corpus is written to
     * @param exportCorpusAsXml if {@code true}, {@link #run} additionally writes the documents to {@code ./export.xml}
     */
    public StreamingLDACorpusCreation(String inputFile, UriCountMappingCreatingDocumentSupplierDecorator.UriUsage uriUsage, StringCountToSimpleTokenizedTextConvertingDocumentSupplierDecorator.WordOccurence wordOccurence, String outputFile, boolean exportCorpusAsXml) {
        this.inputFile = inputFile;
        this.outputFile = outputFile;
        this.uriUsage = uriUsage;
        this.wordOccurence = wordOccurence;
        this.exportCorpusAsXml = exportCorpusAsXml;
    }

    /**
     * Generates the corpus, stores the label cache of the given retriever and
     * writes the corpus to {@link #outputFile}. An {@link IOException} raised
     * while writing is logged and not rethrown.
     *
     * @param cachingLabelRetriever label retrieving step of the pipeline; its
     *                              {@code storeCache()} method is called after
     *                              the corpus has been generated
     * @param isParallel            if {@code true}, the document stream is
     *                              switched to parallel processing
     */
    public void run(WorkerBasedLabelRetrievingDocumentSupplierDecorator cachingLabelRetriever, boolean isParallel) {
        XmlWritingDocumentConsumer consumer = null;
        if (this.exportCorpusAsXml) {
            consumer = XmlWritingDocumentConsumer.createXmlWritingDocumentConsumer(new File("./export.xml"));
        }
        Corpus corpus;
        try {
            corpus = this.generateCorpusAndIndexWords(cachingLabelRetriever, consumer, isParallel);
            cachingLabelRetriever.storeCache();
        } finally {
            // Release the XML export even if the generation fails.
            if (consumer != null) {
                IOUtils.closeQuietly((Closeable) consumer);
            }
        }
        GZipCorpusWriterDecorator writer = new GZipCorpusWriterDecorator((CorpusWriter) new CorpusObjectWriter());
        try {
            writer.writeCorpus(corpus, new File(this.outputFile));
        } catch (IOException e) {
            LOGGER.error("Error while writing corpus to \"" + this.outputFile + "\".", e);
        }
    }

    /**
     * Opens {@link #inputFile} as a stream of documents. Every document that
     * passes through the stream is announced with a log message; no document
     * is dropped by this method.
     *
     * @return the documents of the input file
     */
    protected Stream<Document> readCorpus() {
        DocumentSupplier supplier = StreamBasedXmlDocumentSupplier.createReader(new File(this.inputFile), true);
        StreamBasedXmlDocumentSupplier.registerParseableDocumentProperty(DatasetClassInfo.class);
        StreamBasedXmlDocumentSupplier.registerParseableDocumentProperty(DatasetSpecialClassesInfo.class);
        StreamBasedXmlDocumentSupplier.registerParseableDocumentProperty(DatasetPropertyInfo.class);
        StreamBasedXmlDocumentSupplier.registerParseableDocumentProperty(DatasetVocabularies.class);
        // This "filter" accepts everything; it only exists for the log message.
        supplier = new DocumentFilteringSupplierDecorator(supplier, new DocumentFilter() {
            @Override
            public boolean isDocumentGood(Document document) {
                LOGGER.info("Processing of {} ({}) starts", nameForLog(document), uriForLog(document));
                return true;
            }
        });
        return DocumentSupplier.convertToStream(supplier);
    }

    /**
     * Restricts the stream to whitelisted documents if a whitelist file exists.
     * The whitelist file name is derived from {@link #inputFile} by replacing
     * {@code ".corpus"} with {@code "_whitelist.txt"}. A document is kept if it
     * has a name and either that name (without a trailing {@code ".ttl"}) or
     * its URI is a line of the whitelist. If the whitelist does not exist or
     * can not be read, the stream is returned unchanged.
     *
     * @param docStream the documents to filter
     * @return the filtered stream, or {@code docStream} itself if no whitelist is used
     */
    protected Stream<Document> useWhiteListFilter(Stream<Document> docStream) {
        String whitelistFileName = this.inputFile.replace(".corpus", "_whitelist.txt");
        File whitelistFile = new File(whitelistFileName);
        // If the input name contains no ".corpus", the derived name is the input
        // file itself. It must not be read as its own whitelist.
        if (whitelistFileName.equals(this.inputFile) || !whitelistFile.exists()) {
            LOGGER.info("Can't use whitelistfile \"{}\".", whitelistFile);
            return docStream;
        }
        try {
            final Set<String> whitelist = new HashSet<String>(FileUtils.readLines(whitelistFile));
            docStream = docStream.filter((Predicate<Document>) new DocumentFilter() {
                @Override
                public boolean isDocumentGood(Document document) {
                    DocumentName docName = (DocumentName) document.getProperty(DocumentName.class);
                    if (docName == null) {
                        return false;
                    }
                    String name = (String) docName.get();
                    if (name.endsWith(".ttl")) {
                        name = name.substring(0, name.length() - 4);
                    }
                    if (whitelist.contains(name)) {
                        return true;
                    }
                    DocumentURI uri = (DocumentURI) document.getProperty(DocumentURI.class);
                    return (uri != null) && whitelist.contains(uri.get());
                }
            });
            LOGGER.info("Using whitelistfile \"{}\".", whitelistFile);
        } catch (IOException e) {
            LOGGER.error("Error while reading whitelist \"" + whitelistFile + "\".", e);
        }
        return docStream;
    }

    /**
     * Applies the main transformation steps in this order: {@link #filterUris},
     * URI count mapping creation, label retrieval and conversion into a
     * tokenized text.
     *
     * @param docStream             the documents to transform
     * @param cachingLabelRetriever the label retrieving step
     * @return the transformed stream
     */
    protected Stream<Document> generateDocuments(Stream<Document> docStream, WorkerBasedLabelRetrievingDocumentSupplierDecorator cachingLabelRetriever) {
        return this.filterUris(docStream)
                .map(new UriCountMappingCreatingDocumentSupplierDecorator(null, this.uriUsage))
                .map(cachingLabelRetriever)
                .map(new StringCountToSimpleTokenizedTextConvertingDocumentSupplierDecorator(null, this.wordOccurence));
    }

    /**
     * Applies the {@link VocabularyBlacklist} based URI filter to the class and
     * property information of every document and the blank node removal to
     * the class, property and special class information.
     *
     * @param docStream the documents to process
     * @return the processed stream
     */
    protected Stream<Document> filterUris(Stream<Document> docStream) {
        Set<String> blacklist = VocabularyBlacklist.getInstance();
        return docStream
                .map(new UriFilteringDocumentSupplierDecorator(null, blacklist, DatasetClassInfo.class))
                .map(new SimpleBlankNodeRemovingDocumentSupplierDecorator(null, DatasetClassInfo.class))
                .map(new UriFilteringDocumentSupplierDecorator(null, blacklist, DatasetPropertyInfo.class))
                .map(new SimpleBlankNodeRemovingDocumentSupplierDecorator(null, DatasetPropertyInfo.class))
                .map(new SimpleBlankNodeRemovingDocumentSupplierDecorator(null, DatasetSpecialClassesInfo.class));
    }

    /**
     * Applies the {@link StandardEnglishPosTaggingTermFilter} to the tokenized
     * text of every document and drops all documents that have no
     * {@link SimpleTokenizedText} or no token left afterwards. The decision is
     * logged for every document.
     *
     * @param docStream the documents to filter
     * @return the stream of the remaining documents
     */
    protected Stream<Document> filterStopWordsAndEmptyDocs(Stream<Document> docStream) {
        docStream = docStream.map(new SimpleTokenizedTextTermFilter(null, (PosTaggingTermFilter) StandardEnglishPosTaggingTermFilter.getInstance()));
        return docStream.filter((Predicate<Document>) new DocumentFilter() {
            @Override
            public boolean isDocumentGood(Document document) {
                SimpleTokenizedText text = (SimpleTokenizedText) document.getProperty(SimpleTokenizedText.class);
                boolean hasTokens = (text != null) && (text.getTokens().length > 0);
                if (hasTokens) {
                    LOGGER.info("{} ({}) is accepted as part of the corpus", nameForLog(document), uriForLog(document));
                } else {
                    LOGGER.info("{} ({}) is sorted out and won't be part of the corpus", nameForLog(document), uriForLog(document));
                }
                return hasTokens;
            }
        });
    }

    /**
     * Same as {@link #generateCorpusAndIndexWords(WorkerBasedLabelRetrievingDocumentSupplierDecorator, XmlWritingDocumentConsumer, boolean)}
     * without an XML consumer.
     *
     * @param cachingLabelRetriever the label retrieving step
     * @param isParallel            if {@code true}, the document stream is processed in parallel
     * @return the generated corpus
     */
    public Corpus generateCorpusAndIndexWords(WorkerBasedLabelRetrievingDocumentSupplierDecorator cachingLabelRetriever, boolean isParallel) {
        return this.generateCorpusAndIndexWords(cachingLabelRetriever, null, isParallel);
    }

    /**
     * Generates the documents, indexes their words in a new
     * {@link SimpleVocabulary}, counts the words, optionally hands every
     * document to the given consumer, removes the intermediate document
     * properties and collects the result into a corpus that carries the
     * vocabulary as {@link CorpusVocabulary} property.
     *
     * @param cachingLabelRetriever the label retrieving step
     * @param consumer              consumer that receives every document before
     *                              the intermediate properties are removed; may
     *                              be {@code null}
     * @param isParallel            if {@code true}, the document stream is processed in parallel
     * @return the generated corpus
     */
    // Raw types are kept exactly as in the original block.
    // NOTE(review): the generic signatures of PropertyRemovingSupplierDecorator
    // and DocumentListCorpus are not visible here - confirm and add the type
    // arguments.
    @SuppressWarnings({"rawtypes", "unchecked"})
    public Corpus generateCorpusAndIndexWords(WorkerBasedLabelRetrievingDocumentSupplierDecorator cachingLabelRetriever, XmlWritingDocumentConsumer consumer, boolean isParallel) {
        Stream<Document> docStream = this.generateCorpus(cachingLabelRetriever, isParallel);
        // NOTE(review): with isParallel == true the vocabulary is shared by all
        // worker threads - confirm that the indexing step copes with that.
        SimpleVocabulary vocabulary = new SimpleVocabulary();
        docStream = docStream
                .map(new SimpleWordIndexingSupplierDecorator(null, (Vocabulary) vocabulary))
                .map(new DocumentWordCountingSupplierDecorator(null));
        if (consumer != null) {
            docStream = docStream.map(new DocumentConsumerAdaptingSupplierDecorator(null, (DocumentConsumer) consumer, true));
        }

        // Properties that were only needed to build the word counts.
        ArrayList<Class> propertiesToRemove = new ArrayList<Class>();
        propertiesToRemove.add(DatasetVocabularies.class);
        propertiesToRemove.add(DatasetPropertyInfo.class);
        propertiesToRemove.add(DatasetSpecialClassesInfo.class);
        propertiesToRemove.add(DatasetClassInfo.class);
        propertiesToRemove.add(StringCountMapping.class);
        propertiesToRemove.add(SimpleTokenizedText.class);
        docStream = docStream.map(new PropertyRemovingSupplierDecorator(null, propertiesToRemove));

        DocumentListCorpus corpus = new DocumentListCorpus(docStream.collect(Collectors.toList()));
        corpus.addProperty((CorpusProperty) new CorpusVocabulary((Vocabulary) vocabulary));
        return corpus;
    }

    /**
     * Builds the document stream: read the input file, optionally switch to
     * parallel processing, apply the whitelist, transform the documents and
     * drop stop words and empty documents.
     *
     * @param cachingLabelRetriever the label retrieving step
     * @param isParallel            if {@code true}, the stream is switched to parallel mode
     * @return the stream of documents that will form the corpus
     */
    public Stream<Document> generateCorpus(WorkerBasedLabelRetrievingDocumentSupplierDecorator cachingLabelRetriever, boolean isParallel) {
        Stream<Document> docStream = this.readCorpus();
        if (isParallel) {
            docStream = docStream.parallel();
        }
        docStream = this.useWhiteListFilter(docStream);
        docStream = this.generateDocuments(docStream, cachingLabelRetriever);
        return this.filterStopWordsAndEmptyDocs(docStream);
    }

    /** Returns the name of the document for log messages, or the text "null" if it has none. */
    private static Object nameForLog(Document document) {
        DocumentName name = (DocumentName) document.getProperty(DocumentName.class);
        return name != null ? name.get() : "null";
    }

    /** Returns the URI of the document for log messages, or the text "null" if it has none. */
    private static Object uriForLog(Document document) {
        DocumentURI uri = (DocumentURI) document.getProperty(DocumentURI.class);
        return uri != null ? uri.get() : "null";
    }
}

