package org.aksw.simba.tapioca.gen;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

import org.aksw.simba.tapioca.gen.LaundromatCorpusUpdater.LaundromatDocumentUpdater;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Temporary class for creating a seed list based on the laundromat.tsv file.
 *
 * <p>
 * The input is parsed line by line as whitespace-separated columns. The first
 * column is skipped, the second column is treated as the hash and skipped as
 * well, and the third column is treated as the URI. Every URI is written to the
 * output file on a line of its own. Any further columns of a line are ignored.
 * Lines with fewer than three columns do not contribute a URI.
 * </p>
 *
 * @author Michael R&ouml;der (michael.roeder@uni-paderborn.de)
 *
 */
public class TempSeedListCreator {

    private static final Logger LOGGER = LoggerFactory.getLogger(TempSeedListCreator.class);

    /** A new line started and we are waiting for the first whitespace. */
    private static final int STATE_FIRST_COLUMN = 0;
    /** A whitespace has been found; the next non-whitespace starts the hash. */
    private static final int STATE_BEFORE_HASH = 1;
    /** Reading (and skipping) the hash. */
    private static final int STATE_HASH = 2;
    /** The hash ended; the next non-whitespace starts the URI. */
    private static final int STATE_BEFORE_URI = 3;
    /** Reading the URI into the buffer. */
    private static final int STATE_URI = 4;
    /** The URI ended; waiting for the end of the line. */
    private static final int STATE_AFTER_URI = 5;

    /**
     * Reads the given laundromat TSV file and writes the extracted URIs to the
     * given seed file, both encoded as UTF-8. Errors are logged, not rethrown.
     *
     * @param args
     *            {@code args[0]} is the path of the TSV input file,
     *            {@code args[1]} is the path of the seed file to create
     */
    public static void main(String[] args) {
        if (args.length < 2) {
            System.err.println("Not enough arguments. Call the program as:");
            System.err.println("TempSeedListCreator <laundromat-tsv-file> <output-seed-file>");
            return;
        }
        // The parser works character by character, so both ends are buffered.
        try (Writer writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(args[1]), StandardCharsets.UTF_8));
                Reader reader = new BufferedReader(
                        new InputStreamReader(new FileInputStream(args[0]), StandardCharsets.UTF_8))) {
            extractUris(reader, writer);
        } catch (Exception e) {
            LOGGER.error("Exception while creating seed list from laundromat.tsv. Aborting.", e);
        }
    }

    /**
     * Copies the third whitespace-separated column of every input line to the
     * writer, one value per line.
     *
     * @param reader
     *            source of the TSV content; read until end of stream
     * @param writer
     *            sink receiving every extracted URI followed by {@code '\n'}
     * @throws IOException
     *             if reading or writing fails
     */
    private static void extractUris(Reader reader, Writer writer) throws IOException {
        StringBuilder uri = new StringBuilder();
        int state = STATE_FIRST_COLUMN;
        int read;
        // read() returns -1 at end of stream; ready() only tells whether a read
        // would block and must not be used as an end-of-input test.
        while ((read = reader.read()) != -1) {
            char c = (char) read;
            if (c == '\n') {
                // A line break terminates the record in every state, so a short
                // line can never swallow columns of the following line.
                if (state == STATE_URI) {
                    writeUri(uri, writer);
                }
                state = STATE_FIRST_COLUMN;
                continue;
            }
            boolean whitespace = Character.isWhitespace(c);
            switch (state) {
            case STATE_FIRST_COLUMN: {
                if (whitespace) {
                    state = STATE_BEFORE_HASH;
                }
                break;
            }
            case STATE_BEFORE_HASH: {
                if (!whitespace) {
                    state = STATE_HASH;
                }
                break;
            }
            case STATE_HASH: {
                if (whitespace) {
                    state = STATE_BEFORE_URI;
                }
                break;
            }
            case STATE_BEFORE_URI: {
                if (!whitespace) {
                    uri.append(c);
                    state = STATE_URI;
                }
                break;
            }
            case STATE_URI: {
                if (whitespace) {
                    writeUri(uri, writer);
                    state = STATE_AFTER_URI;
                } else {
                    uri.append(c);
                }
                break;
            }
            case STATE_AFTER_URI: {
                // Remaining columns are ignored until the line break above.
                break;
            }
            default: {
                throw new IllegalStateException("Unknown state " + state);
            }
            }
        }
        // The input may end without a trailing line break while a URI is pending.
        if (state == STATE_URI) {
            writeUri(uri, writer);
        }
    }

    /**
     * Writes the buffered URI as one line and empties the buffer.
     *
     * @param uri
     *            buffer holding the URI; cleared afterwards
     * @param writer
     *            sink receiving the URI followed by {@code '\n'}
     * @throws IOException
     *             if writing fails
     */
    private static void writeUri(StringBuilder uri, Writer writer) throws IOException {
        writer.write(uri.toString());
        writer.write('\n');
        uri.setLength(0);
    }
}
