/**
 * tapioca.modelgen - ${project.description}
 * Copyright © 2015 Data Science Group (DICE) (michael.roeder@uni-paderborn.de)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
/**
 * This file is part of tapioca.modelgen.
 *
 * tapioca.modelgen is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * tapioca.modelgen is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with tapioca.modelgen.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.aksw.simba.tapioca.gen;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import org.aksw.simba.tapioca.data.DatasetClassInfo;
import org.aksw.simba.tapioca.data.DatasetPropertyInfo;
import org.aksw.simba.tapioca.data.DatasetSpecialClassesInfo;
import org.aksw.simba.tapioca.data.DatasetVocabularies;
import org.aksw.simba.tapioca.data.SimpleTokenizedText;
import org.aksw.simba.tapioca.data.StringCountMapping;
import org.aksw.simba.tapioca.gen.data.DatasetURIs;
import org.aksw.simba.tapioca.gen.preprocessing.DatasetURIsSummarizingSupplierDecorator;
import org.aksw.simba.tapioca.preprocessing.UriCountMappingCreatingDocumentSupplierDecorator;
import org.aksw.simba.tapioca.preprocessing.UriCountMappingCreatingDocumentSupplierDecorator.UriUsage;
import org.dice_research.topicmodeling.io.gzip.GZipCorpusObjectWriter;
import org.dice_research.topicmodeling.io.java.CorpusObjectWriter;
import org.dice_research.topicmodeling.io.xml.stream.StreamBasedXmlDocumentSupplier;
import org.dice_research.topicmodeling.preprocessing.ListCorpusCreator;
import org.dice_research.topicmodeling.preprocessing.docsupplier.DocumentSupplier;
import org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.DocumentFilteringSupplierDecorator;
import org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.PropertyRemovingSupplierDecorator;
import org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.filter.DocumentFilter;
import org.dice_research.topicmodeling.utils.corpus.DocumentListCorpus;
import org.dice_research.topicmodeling.utils.doc.Document;
import org.dice_research.topicmodeling.utils.doc.DocumentProperty;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class URIBasedIndexGenerator {

	private static final Logger LOGGER = LoggerFactory.getLogger(URIBasedIndexGenerator.class);

	public static final String BL_CORPUS_FILE = TMBasedIndexGenerator.TAPIOCA_FOLDER + TMBasedIndexGenerator.CORPUS_NAME
			+ "_BL.object";

	public static final String FINAL_CORPUS_FILE = TMBasedIndexGenerator.CORPUS_NAME + "_BL_final.corpus";

	public static void main(String[] args) {
		URIBasedIndexGenerator generator = new URIBasedIndexGenerator();
		generator.run();
	}

	public void run() {
		File outputFolder = new File(TMBasedIndexGenerator.OUTPUT_FOLDER);
		if (!outputFolder.exists()) {
			outputFolder.mkdirs();
		}

		File datasetDescriptionsFile = new File(
				TMBasedIndexGenerator.OUTPUT_FOLDER + File.separator + FINAL_CORPUS_FILE);
		if (datasetDescriptionsFile.exists()) {
			LOGGER.info("The final corpus file is already existing.");
		} else {
			generateFinalCorpusFile();
		}
	}

	protected void generateFinalCorpusFile() {
		if (checkBLCorpusExistence()) {
			MetaDataInformationCollector collector = new MetaDataInformationCollector();
			LOGGER.info("Generating final corpus file...");
			collector.run(TMBasedIndexGenerator.META_DATA_FILE, BL_CORPUS_FILE, TMBasedIndexGenerator.STAT_RESULT_FILE,
					TMBasedIndexGenerator.OUTPUT_FOLDER + File.separator + FINAL_CORPUS_FILE,
					TMBasedIndexGenerator.OUTPUT_FOLDER + File.separator + TMBasedIndexGenerator.MODEL_META_DATA_FILE);
		}
	}

	protected boolean checkBLCorpusExistence() {
		File blCorpusFile = new File(BL_CORPUS_FILE);
		if (!blCorpusFile.exists()) {
			LOGGER.warn("The BL corpus file is not existing. Trying to generate it...");
			generateBLCorpusFile();
			if (!blCorpusFile.exists()) {
				LOGGER.error("The BL corpus file is not existing and couldn't be generated.");
				return false;
			}
		}
		return true;
	}

	public static DocumentSupplier createBLPreprocessing(File inputFile) {
		DocumentSupplier supplier = StreamBasedXmlDocumentSupplier.createReader(inputFile, true);
		StreamBasedXmlDocumentSupplier.registerParseableDocumentProperty(DatasetClassInfo.class);
		StreamBasedXmlDocumentSupplier.registerParseableDocumentProperty(DatasetSpecialClassesInfo.class);
		StreamBasedXmlDocumentSupplier.registerParseableDocumentProperty(DatasetPropertyInfo.class);
		StreamBasedXmlDocumentSupplier.registerParseableDocumentProperty(DatasetVocabularies.class);
		// Count the URIs
		supplier = new UriCountMappingCreatingDocumentSupplierDecorator(supplier, UriUsage.CLASSES_AND_PROPERTIES);

		supplier = new DatasetURIsSummarizingSupplierDecorator(supplier);

		supplier = new DocumentFilteringSupplierDecorator(supplier, new DocumentFilter() {
			public boolean isDocumentGood(Document document) {
				DatasetURIs uris = document.getProperty(DatasetURIs.class);
				return (uris != null) && (uris.get().size() > 0);
			}
		});

		// final Set<String> whiteList = generateDocumentNameWhiteList();
		// if (whiteList != null) {
		// supplier = new DocumentFilteringSupplierDecorator(supplier, new
		// DocumentFilter() {
		// public boolean isDocumentGood(Document document) {
		// DocumentName name = document.getProperty(DocumentName.class);
		// return (name != null) && (whiteList.contains(name.get()));
		// }
		// });
		// }

		return supplier;
	}

	protected void generateBLCorpusFile() {
		DocumentSupplier supplier = createBLPreprocessing(new File(TMBasedIndexGenerator.CORPUS_FILE));

		// Since this property is not serializeable we have to remove it
		List<Class<? extends DocumentProperty>> propertiesToRemove = new ArrayList<Class<? extends DocumentProperty>>();
		propertiesToRemove.add(DatasetVocabularies.class);
		propertiesToRemove.add(DatasetPropertyInfo.class);
		propertiesToRemove.add(DatasetSpecialClassesInfo.class);
		propertiesToRemove.add(DatasetClassInfo.class);
		propertiesToRemove.add(StringCountMapping.class);
		propertiesToRemove.add(SimpleTokenizedText.class);
		supplier = new PropertyRemovingSupplierDecorator(supplier, propertiesToRemove);

		ListCorpusCreator<List<Document>> preprocessor = new ListCorpusCreator<List<Document>>(supplier,
				new DocumentListCorpus<List<Document>>(new ArrayList<Document>()));

		CorpusObjectWriter writer = new GZipCorpusObjectWriter(new File(BL_CORPUS_FILE));
		writer.writeCorpus(preprocessor.getCorpus());
	}

//    protected Set<String> generateDocumentNameWhiteList() {
//        File finalLDACorpusFile = new File(TMBasedIndexGenerator.LDA_CORPUS_FILE);
//        if (!finalLDACorpusFile.exists()) {
//            LOGGER.info("The LDA corpus file is not existing. Can't use it as white list.");
//            return null;
//        }
//        // GZipCorpusObjectReader reader = new
//        // GZipCorpusObjectReader(finalLDACorpusFile);
//        // Corpus finalLDACorpus = reader.getCorpus();
//        Set<String> whiteList = new HashSet<String>();
//        DocumentName name;
//        DocumentSupplier supplier = StreamBasedXmlDocumentSupplier.createReader(new File(
//                TMBasedIndexGenerator.CORPUS_FILE), true);
//        Document document = supplier.getNextDocument();
//        while (document != null) {
//            name = document.getProperty(DocumentName.class);
//            if (name != null) {
//                whiteList.add(name.get());
//            }
//            document = supplier.getNextDocument();
//        }
//        return whiteList;
//    }
}
