package org.dice_research.glisten;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.function.Consumer;
import java.util.function.Predicate;
import java.util.zip.GZIPInputStream;

import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.jena.graph.Node;
import org.apache.jena.graph.Triple;
import org.apache.jena.riot.Lang;
import org.apache.jena.riot.RDFDataMgr;
import org.apache.jena.riot.system.StreamRDF;
import org.apache.jena.riot.system.StreamRDF2;
import org.apache.jena.riot.system.StreamRDFBase;
import org.apache.jena.riot.system.StreamRDFLib;
import org.apache.jena.sparql.core.Quad;
import org.apache.jena.vocabulary.RDF;
import org.apache.jena.vocabulary.RDFS;

/**
 * A class that generates a dump file for a given DBpedia class.
 * 
 * @author Michael R&ouml;der (michael.roeder@uni-paderborn.de)
 *
 */
public class DBpediaDumpCreator {

//    private static final String INPUT_DIRECTORY = "/home/micha/data/dbpedia-dump-08-2021/";
//    private static final String OUTPUT_DIRECTORY = "/home/micha/data/glisten/dbpedia/";
//    private static final String TYPE_FILE = "/home/micha/data/dbpedia-dump-08-2021/instance-types.combined.ttl";
//    private static final String ONTOLOGY_FILE = "/home/micha/data/dbpedia-dump-08-2021/ontology--DEV_type=parsed.nt";

    /** Namespace of the DBpedia ontology (classes and mapped properties). */
    private static final String DBO_NAMEPSACE = "http://dbpedia.org/ontology/";
    /**
     * Namespace of the raw DBpedia infobox properties. DBpedia publishes these
     * under ".../property/" (singular); the former ".../properties/" value could
     * never match an IRI of the dump.
     */
    private static final String DBP_NAMEPSACE = "http://dbpedia.org/property/";
    /** Namespace of the DBpedia resources (instances). */
    private static final String DBR_NAMEPSACE = "http://dbpedia.org/resource/";
    private static final String RDF_TYPE_URI = RDF.type.getURI();
    private static final String RDFS_LABEL_URI = RDFS.label.getURI();

    // Cached namespace lengths; used to cut the namespace off an IRI
    private static final int DBO_NAMEPSACE_LENGTH = DBO_NAMEPSACE.length();
    private static final int DBP_NAMEPSACE_LENGTH = DBP_NAMEPSACE.length();
    private static final int DBR_NAMEPSACE_LENGTH = DBR_NAMEPSACE.length();

    /** Literals with a language tag are only kept if the tag equals this value. */
    private static final String EXPECTED_LANG_TAG = "en";

    /**
     * Command line entry point. Creates one dump file per given class name.
     *
     * @param args input directory, type file, ontology file, properties file,
     *             output directory, followed by at least one class name
     * @throws FileNotFoundException if one of the files cannot be opened
     * @throws IOException           if reading or writing fails
     */
    public static void main(String[] args) throws FileNotFoundException, IOException {
        // Five path arguments plus at least one class name are needed. Stop here
        // instead of running into an ArrayIndexOutOfBoundsException below.
        if (args.length < 6) {
            System.err.println(
                    "Argument missing! Usage:\nDBpediaDumpCreator <input-dir> <type-file> <ontology-file> <properties-file> <output-dir> <class-name> [<class-name>]");
            return;
        }
        System.out.println(Arrays.toString(args));
        String inputDir = args[0].endsWith("/") ? args[0] : args[0] + '/';
        String typeFile = args[1];
        String ontologyFile = args[2];
        String propertiesFile = args[3];
        String outputDir = args[4].endsWith("/") ? args[4] : args[4] + '/';
        // All remaining arguments are class names
        for (int i = 5; i < args.length; ++i) {
            createDump(inputDir, typeFile, ontologyFile, propertiesFile, outputDir, args[i]);
        }
    }

    /**
     * Creates the dump file {@code <outputDir><clazz>.nt} for the given class. No
     * file is written if the type file contains no instance of the class.
     *
     * @param inputDir       directory containing the dump files to read from
     * @param typeFile       file containing the rdf:type triples
     * @param ontologyFile   N-Triples file with the labels of the dbo IRIs
     * @param propertiesFile N-Triples file with the labels of the dbp IRIs
     * @param outputDir      directory (with trailing slash) to write the dump to
     * @param clazz          local name of the class within the dbo namespace
     * @throws FileNotFoundException if one of the files cannot be opened
     * @throws IOException           if reading or writing fails
     */
    private static void createDump(String inputDir, String typeFile, String ontologyFile, String propertiesFile,
            String outputDir, String clazz) throws FileNotFoundException, IOException {
        System.out.println("Starting with class " + clazz + " ...");
        // Get all instances of the given class
        Set<String> instances = collectInstances(typeFile, clazz);
        if (instances.isEmpty()) {
            System.out.println("Found no instances. Aborting.");
            return;
        }
        System.out.println("Found " + instances.size() + " instances.");
        // Start writing the output
        File outputFile = new File(outputDir + clazz + ".nt");
        Set<String> dboInstances = new HashSet<>();
        Set<String> dbpInstances = new HashSet<>();
        Set<String> dbrInstances = new HashSet<>();
        // Write UTF-8 explicitly; a FileWriter would use the platform default
        // charset and could mangle non-ASCII characters of literals and IRIs.
        try (Writer out = Files.newBufferedWriter(outputFile.toPath(), StandardCharsets.UTF_8)) {
            StreamRDF writeStream = StreamRDFLib.writer(out);
            writeStream.start();
            // Write the CBDs of the instances and collect related dbo, dbp and dbr URIs
            writeCBDs(inputDir, clazz, instances, dboInstances, dbpInstances, dbrInstances, writeStream);
            // Add classes of connected DBR resources
            addClasses(typeFile, dboInstances, dbrInstances, writeStream);
            // Add labels of the classes and properties
            writeOntologyLabels(ontologyFile, dboInstances, DBO_NAMEPSACE, writeStream);
            writeOntologyLabels(propertiesFile, dbpInstances, DBP_NAMEPSACE, writeStream);
            writeStream.finish();
        }
    }

    /**
     * Collects the local names (IRI without the dbr namespace) of all resources
     * that are typed with the given class in the given type file.
     *
     * @param typeFile file containing the rdf:type triples
     * @param clazz    local name of the class within the dbo namespace
     * @return the local names of the instances; empty if there are none
     * @throws IOException           if the grep process cannot be started or its
     *                               output cannot be read
     * @throws IllegalStateException if the grep process reports an error
     */
    private static Set<String> collectInstances(String typeFile, String clazz)
            throws FileNotFoundException, IOException {
        final String classUri = DBO_NAMEPSACE + clazz;
        final Set<String> instances = new HashSet<>();
        // grep only pre-selects lines that contain the class IRI as a substring;
        // the exact comparison is done by the predicate below
        ProcessHandler ph = grepFromFile(typeFile, classUri);
        try (InputStream is = ph.getInputStream()) {
            StreamRDF instanceCollector = new SimpleTripleHandler(
                    // getURI() may only be called on IRI nodes, and the namespace is cut
                    // off by length, so the subject has to be a dbr IRI
                    (t) -> RDF_TYPE_URI.equals(t.getPredicate().getURI())
                            && t.getObject().isURI() && classUri.equals(t.getObject().getURI())
                            && t.getSubject().isURI() && t.getSubject().getURI().startsWith(DBR_NAMEPSACE),
                    (t) -> instances.add(t.getSubject().getURI().substring(DBR_NAMEPSACE_LENGTH)));
            RDFDataMgr.parse(instanceCollector, is, Lang.TURTLE);
        }
        if (!ph.exitedCleanly()) {
            throw new IllegalStateException("Error while reading types of class " + clazz);
        }
        return instances;
    }

    /**
     * Reads all {@code .ttl}, {@code .ttl.gz} and {@code .ttl.bz2} files of the
     * input directory and writes every triple that has one of the given instances
     * as subject and passes the language check. While writing, the local names of
     * the dbo and dbp IRIs used as predicate or object, and of the dbr objects
     * that are not part of the instance set, are added to the given sets.
     *
     * @param inputDir     directory containing the dump files
     * @param clazz        local name of the class (not used by this method)
     * @param instances    local names of the instances whose triples are written
     * @param dboInstances set to which local names of dbo IRIs are added
     * @param dbpInstances set to which local names of dbp IRIs are added
     * @param dbrInstances set to which local names of further dbr IRIs are added
     * @param writeStream  stream the selected triples are written to
     * @throws FileNotFoundException if the input directory cannot be listed
     * @throws IOException           if reading fails
     * @throws IllegalStateException if a grep process reports an error
     */
    private static void writeCBDs(String inputDir, String clazz, Set<String> instances, Set<String> dboInstances,
            Set<String> dbpInstances, Set<String> dbrInstances, StreamRDF writeStream)
            throws FileNotFoundException, IOException {
        File[] inputFiles = (new File(inputDir)).listFiles((f) -> f.getName().endsWith(".ttl.bz2")
                || f.getName().endsWith(".ttl.gz") || f.getName().endsWith(".ttl"));
        // listFiles returns null (not an empty array) if the path is not a directory
        // or cannot be read
        if (inputFiles == null) {
            throw new FileNotFoundException("Cannot list the files of the input directory " + inputDir);
        }

        StreamRDF dboPropCollector = new SimpleTripleHandler((t) -> t.getPredicate().getURI().startsWith(DBO_NAMEPSACE),
                (t) -> dboInstances.add(t.getPredicate().getURI().substring(DBO_NAMEPSACE_LENGTH)));
        StreamRDF dboObjCollector = new SimpleTripleHandler(
                (t) -> t.getObject().isURI() && t.getObject().getURI().startsWith(DBO_NAMEPSACE),
                (t) -> dboInstances.add(t.getObject().getURI().substring(DBO_NAMEPSACE_LENGTH)));

        StreamRDF dbpPropCollector = new SimpleTripleHandler((t) -> t.getPredicate().getURI().startsWith(DBP_NAMEPSACE),
                (t) -> dbpInstances.add(t.getPredicate().getURI().substring(DBP_NAMEPSACE_LENGTH)));
        StreamRDF dbpObjCollector = new SimpleTripleHandler(
                (t) -> t.getObject().isURI() && t.getObject().getURI().startsWith(DBP_NAMEPSACE),
                (t) -> dbpInstances.add(t.getObject().getURI().substring(DBP_NAMEPSACE_LENGTH)));

        // Only dbr objects that are not already part of the instance set are collected
        StreamRDF dbrCollector = new SimpleTripleHandler(
                (t) -> t.getObject().isURI() && t.getObject().getURI().startsWith(DBR_NAMEPSACE)
                        && !instances.contains(t.getObject().getURI().substring(DBR_NAMEPSACE_LENGTH)),
                (t) -> dbrInstances.add(t.getObject().getURI().substring(DBR_NAMEPSACE_LENGTH)));

        StreamRDF stream = writeStream; // Finally, we want to write the triples
        stream = new StreamRDF2(stream, dbrCollector);
        stream = new StreamRDF2(stream, dbpObjCollector);
        stream = new StreamRDF2(stream, dbpPropCollector);
        stream = new StreamRDF2(stream, dboObjCollector);
        stream = new StreamRDF2(stream, dboPropCollector);

        // Define filter (we only want triples which have an instance from the given
        // set as subject). The isURI check protects getURI() against blank node
        // subjects.
        stream = new SimpleStreamFilter((t) -> t.getSubject().isURI()
                && t.getSubject().getURI().startsWith(DBR_NAMEPSACE)
                && instances.contains(t.getSubject().getURI().substring(DBR_NAMEPSACE_LENGTH))
                && checkObjectLang(t, EXPECTED_LANG_TAG), stream);

        for (File inputFile : inputFiles) {
            System.out.print("Reading " + inputFile.getName() + " ...");
            long time = System.currentTimeMillis();
            // Let grep pre-select the lines mentioning one of the instances
            ProcessHandler ph = grepFromFile(inputFile.getAbsolutePath(), DBR_NAMEPSACE, instances);
            try (InputStream is = ph.getInputStream()) {
                RDFDataMgr.parse(stream, is, Lang.TURTLE);
            }
            if (!ph.exitedCleanly()) {
                throw new IllegalStateException("Error while reading file " + inputFile.getName());
            }
            time = System.currentTimeMillis() - time;
            System.out.print(" done (");
            System.out.print(time);
            System.out.println("ms)");
        }
    }

    /**
     * Writes the rdf:type triples of the given additional dbr resources to the
     * given stream and adds the local names of their dbo classes to the given
     * set.
     *
     * @param typeFile     file containing the rdf:type triples
     * @param dboInstances set to which the local names of dbo classes are added
     * @param dbrInstances local names of the resources whose types are written
     * @param writeStream  stream the selected triples are written to
     * @throws IOException           if reading fails
     * @throws IllegalStateException if the grep process reports an error
     */
    private static void addClasses(String typeFile, Set<String> dboInstances, Set<String> dbrInstances,
            StreamRDF writeStream) throws FileNotFoundException, IOException {
        System.out.print("Adding classes for " + dbrInstances.size() + " additional instances ...");
        final long start = System.currentTimeMillis();

        // Remembers the dbo classes of all triples that are written
        Predicate<Triple> hasDboObject = (t) -> t.getObject().getURI().startsWith(DBO_NAMEPSACE);
        Consumer<Triple> rememberClass = (t) -> dboInstances
                .add(t.getObject().getURI().substring(DBO_NAMEPSACE_LENGTH));
        StreamRDF classCollector = new SimpleTripleHandler(hasDboObject, rememberClass);

        // Every accepted triple is written and handed to the collector
        StreamRDF sink = new StreamRDF2(writeStream, classCollector);

        // Accept only rdf:type triples of IRIs in the dbrInstances set
        Predicate<Triple> isTypeOfAdditionalInstance = (t) -> {
            if (!RDF_TYPE_URI.equals(t.getPredicate().getURI())) {
                return false;
            }
            String subjectUri = t.getSubject().getURI();
            return subjectUri.startsWith(DBR_NAMEPSACE)
                    && dbrInstances.contains(subjectUri.substring(DBR_NAMEPSACE_LENGTH));
        };
        StreamRDF filtered = new SimpleStreamFilter(isTypeOfAdditionalInstance, sink);

        ProcessHandler grep = grepFromFile(typeFile, DBR_NAMEPSACE, dbrInstances);
        try (InputStream in = grep.getInputStream()) {
            RDFDataMgr.parse(filtered, in, Lang.TURTLE);
        }
        if (!grep.exitedCleanly()) {
            throw new IllegalStateException("Error while reading file types of additional instances.");
        }
        final long elapsed = System.currentTimeMillis() - start;
        System.out.println(" done (" + elapsed + "ms)");
    }

    /**
     * Writes the rdfs:label triples of the given ontology IRIs to the given
     * stream. Only labels that pass the language check are written.
     *
     * @param ontologyFile      N-Triples file (optionally .gz or .bz2) to read
     * @param ontologyIris      local names of the IRIs whose labels are written
     * @param ontologyNamespace namespace the local names belong to
     * @param writeStream       stream the selected triples are written to
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException           if reading fails
     */
    private static void writeOntologyLabels(String ontologyFile, Set<String> ontologyIris, String ontologyNamespace,
            StreamRDF writeStream) throws FileNotFoundException, IOException {
        System.out.print("Adding labels of " + ontologyIris.size() + " ontology IRIs ...");
        final long start = System.currentTimeMillis();

        final int prefixLength = ontologyNamespace.length();
        // Accept only label triples of the requested IRIs of the given namespace
        Predicate<Triple> isRequestedLabel = (t) -> {
            if (!RDFS_LABEL_URI.equals(t.getPredicate().getURI())) {
                return false;
            }
            String subjectUri = t.getSubject().getURI();
            if (!subjectUri.startsWith(ontologyNamespace)) {
                return false;
            }
            return ontologyIris.contains(subjectUri.substring(prefixLength))
                    && checkObjectLang(t, EXPECTED_LANG_TAG);
        };
        StreamRDF labelFilter = new SimpleStreamFilter(isRequestedLabel, writeStream);

        try (InputStream in = openFile(ontologyFile)) {
            RDFDataMgr.parse(labelFilter, in, Lang.NT);
        }
        final long elapsed = System.currentTimeMillis() - start;
        System.out.println(" done (" + elapsed + "ms)");
    }

    /**
     * Opens the given file for reading. Files ending with {@code .bz2} or
     * {@code .gz} are decompressed on the fly.
     *
     * @param file path of the file to open
     * @return a buffered stream of the (decompressed) content; the caller has to
     *         close it
     * @throws IOException if the file cannot be opened or the compression header
     *                     cannot be read
     */
    protected static InputStream openFile(String file) throws IOException {
        InputStream is = new BufferedInputStream(new FileInputStream(file));
        try {
            if (file.endsWith(".bz2")) {
                return new BZip2CompressorInputStream(is);
            } else if (file.endsWith(".gz")) {
                return new GZIPInputStream(is);
            }
            return is;
        } catch (IOException | RuntimeException e) {
            // The decompressor constructors read the header and may fail; close the
            // file stream that would otherwise leak
            try {
                is.close();
            } catch (IOException closeException) {
                e.addSuppressed(closeException);
            }
            throw e;
        }
    }

//    protected static InputStream grepFromFile(String file, String searchString) throws IOException {
//        String grep = file.endsWith(".bz2") ? "bzfgrep" : "fgrep";
//        Process process = new ProcessBuilder(grep, searchString, file).start();
//        return process.getInputStream();
//    }

    /**
     * Starts a fixed-string grep process that searches the given file for the
     * given string. The grep variant is chosen from the file extension so that
     * compressed files are decompressed before they are searched.
     *
     * @param file         path of the file to search ({@code .bz2}, {@code .gz}
     *                     or uncompressed)
     * @param searchString the fixed string to search for; several strings can be
     *                     given separated by line breaks
     * @return a handler of the started process
     * @throws IOException if the process cannot be started
     */
    protected static ProcessHandler grepFromFile(String file, String searchString) throws IOException {
        final String grep;
        if (file.endsWith(".bz2")) {
            grep = "bzfgrep";
        } else if (file.endsWith(".gz")) {
            // plain fgrep would search the compressed bytes
            grep = "zfgrep";
        } else {
            grep = "fgrep";
        }
        return ProcessHandler.create(grep, searchString, file);
    }

    /**
     * Starts a grep process that searches the given file for the given strings
     * without adding a prefix to them.
     *
     * @param file          path of the file to search
     * @param searchStrings the strings to search for
     * @return a handler of the started process
     * @throws IOException if the process cannot be started
     */
    protected static ProcessHandler grepFromFile(String file, Collection<String> searchStrings) throws IOException {
        final String noPrefix = "";
        return grepFromFile(file, noPrefix, searchStrings);
    }

    /**
     * Starts a grep process that searches the given file for the given strings.
     * Fewer than 1000 strings are handed over on the command line, larger
     * collections via a temporary pattern file.
     *
     * @param file          path of the file to search
     * @param searchPrefix  prefix of the search strings
     * @param searchStrings the strings to search for
     * @return a handler of the started process
     * @throws IOException if the process cannot be started
     */
    protected static ProcessHandler grepFromFile(String file, String searchPrefix, Collection<String> searchStrings)
            throws IOException {
        final boolean fitsOnCommandLine = searchStrings.size() < 1000;
        return fitsOnCommandLine ? grepFromFileWithLongCmd(file, searchPrefix, searchStrings)
                : grepFromFileWithTempFile(file, searchPrefix, searchStrings);
    }

    /**
     * Starts a grep process that searches the given file for the given strings,
     * handing all of them over as a single command line argument with one
     * prefixed string per line.
     *
     * @param file          path of the file to search
     * @param searchPrefix  prefix that is added in front of every search string
     * @param searchStrings the strings to search for
     * @return a handler of the started process
     * @throws IOException if the process cannot be started
     */
    protected static ProcessHandler grepFromFileWithLongCmd(String file, String searchPrefix,
            Collection<String> searchStrings) throws IOException {
        if (searchStrings.isEmpty()) {
            // An empty pattern argument would match every line; an empty pattern
            // file matches nothing, which is what an empty collection should do
            return grepFromFileWithTempFile(file, searchPrefix, searchStrings);
        }
        // The argument is handed to the process without a shell in between, so it
        // must not be wrapped in quotes: they would become part of the first and
        // the last pattern.
        StringBuilder builder = new StringBuilder();
        boolean first = true;
        for (String searchString : searchStrings) {
            if (first) {
                first = false;
            } else {
                builder.append('\n');
            }
            builder.append(searchPrefix);
            builder.append(searchString);
        }
        return grepFromFile(file, builder.toString());
    }

    /**
     * Starts a grep process that searches the given file for the given strings,
     * handing them over via a temporary pattern file (grep option {@code -f})
     * with one prefixed string per line. The temporary file is registered for
     * deletion at JVM exit.
     *
     * @param file          path of the file to search ({@code .bz2}, {@code .gz}
     *                      or uncompressed)
     * @param searchPrefix  prefix that is added in front of every search string
     * @param searchStrings the strings to search for
     * @return a handler of the started process
     * @throws IOException if the pattern file cannot be written or the process
     *                     cannot be started
     */
    protected static ProcessHandler grepFromFileWithTempFile(String file, String searchPrefix,
            Collection<String> searchStrings) throws IOException {
        File tempFile = File.createTempFile("grep", ".txt");
        tempFile.deleteOnExit();
        // Write the patterns as UTF-8 instead of the platform default charset
        try (PrintStream out = new PrintStream(tempFile, StandardCharsets.UTF_8.name())) {
            for (String searchString : searchStrings) {
                // Same patterns as the command line variant: prefix + string
                out.print(searchPrefix);
                out.println(searchString);
            }
            // PrintStream swallows write errors; surface them explicitly
            if (out.checkError()) {
                throw new IOException("Error while writing grep patterns to " + tempFile.getAbsolutePath());
            }
        }
        final String grep;
        if (file.endsWith(".bz2")) {
            grep = "bzfgrep";
        } else if (file.endsWith(".gz")) {
            // plain fgrep would search the compressed bytes
            grep = "zfgrep";
        } else {
            grep = "fgrep";
        }
        return ProcessHandler.create(grep, "-f", tempFile.getAbsolutePath(), file);
    }

    /**
     * Decides whether the object of the given triple is acceptable with respect
     * to its language. Objects that are not literals are always accepted. A
     * literal is accepted if its language tag is empty or equal to the expected
     * tag.
     * 
     * @param t            the triple whose object is checked
     * @param expectedLang the language tag a tagged literal must have
     * @return {@code false} only for literals with a non-empty language tag that
     *         differs from the expected tag
     */
    protected static boolean checkObjectLang(Triple t, String expectedLang) {
        final Node candidate = t.getObject();
        if (candidate.isLiteral()) {
            final String tag = candidate.getLiteralLanguage();
            return tag.isEmpty() || expectedLang.equals(tag);
        }
        return true;
    }

    /**
     * A triple stream handler built from a check and a consumer: triples that
     * pass the check are handed to the consumer, all others are dropped. Quads
     * are handled like their triple.
     * 
     * @author Michael R&ouml;der (michael.roeder@uni-paderborn.de)
     */
    public static class SimpleTripleHandler extends StreamRDFBase {

        /** Decides which triples are handed to the consumer. */
        private final Predicate<Triple> check;
        /** Receives the triples that passed the check. */
        private final Consumer<Triple> consumer;

        public SimpleTripleHandler(Predicate<Triple> check, Consumer<Triple> consumer) {
            this.check = check;
            this.consumer = consumer;
        }

        @Override
        public void triple(Triple triple) {
            if (!check.test(triple)) {
                return;
            }
            consumer.accept(triple);
        }

        @Override
        public void quad(Quad quad) {
            // The graph of the quad is ignored
            Triple asTriple = quad.asTriple();
            triple(asTriple);
        }
    }

    /**
     * A filter in front of another stream: triples that pass the given check are
     * forwarded to the triple method of the given stream, all others are dropped.
     * 
     * @author Michael R&ouml;der (michael.roeder@uni-paderborn.de)
     *
     */
    public static class SimpleStreamFilter extends SimpleTripleHandler {

        public SimpleStreamFilter(Predicate<Triple> check, StreamRDF consumer) {
            super(check, (Triple accepted) -> {
                consumer.triple(accepted);
            });
        }
    }

    /**
     * A simple class that wraps parts of the Process handling: it holds a started
     * process together with the thread that copies the error output of the
     * process to {@link System#err}.
     * 
     * @author Michael R&ouml;der (michael.roeder@uni-paderborn.de)
     *
     */
    public static class ProcessHandler {

        private final Process process;
        private final Thread errorThread;

        public ProcessHandler(Process process, Thread errorThread) {
            this.process = process;
            this.errorThread = errorThread;
        }

        /**
         * Waits for the process to terminate and checks its exit code. If the
         * calling thread is interrupted while waiting, the interrupt flag is
         * restored, the process is destroyed and {@code false} is returned.
         *
         * @return {@code true} if the process terminated with exit code 0 or 1
         */
        public boolean exitedCleanly() {
            final int exitCode;
            try {
                exitCode = process.waitFor();
            } catch (InterruptedException e) {
                // The process may still be running, so there is no exit code to ask
                // for. Keep the interrupt visible for the caller and stop the process.
                Thread.currentThread().interrupt();
                process.destroy();
                return false;
            }
            try {
                // Give the error thread some time to copy the remaining output
                errorThread.join(2000);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
            if (errorThread.isAlive()) {
                System.err.println("Warning, an error thread is still running.");
            }
            // grep terminates with 0 if a line has been found, 1 if no match has been found
            // and 2 if an error occurred
            return exitCode == 0 || exitCode == 1;
        }

        /**
         * @return the stream from which the standard output of the process can be
         *         read
         */
        public InputStream getInputStream() {
            return process.getInputStream();
        }

        /**
         * Starts a process with the given command and arguments.
         *
         * @param commands the program followed by its arguments
         * @return a handler of the started process
         * @throws IOException if the process cannot be started
         */
        public static ProcessHandler create(String... commands) throws IOException {
            return create(Arrays.asList(commands));
        }

        /**
         * Starts a process with the given command and arguments and a thread that
         * copies the error output of the process to {@link System#err}.
         *
         * @param commands the program followed by its arguments
         * @return a handler of the started process
         * @throws IOException if the process cannot be started
         */
        public static ProcessHandler create(List<String> commands) throws IOException {
            final Process process = new ProcessBuilder(commands).start();
            Thread errorThread = new Thread(() -> {
                try {
                    IOUtils.copy(process.getErrorStream(), System.err);
                } catch (IOException e) {
                    e.printStackTrace();
                }
            });
            errorThread.start();
            return new ProcessHandler(process, errorThread);
        }
    }
}
