/*******************************************************************************
 * Copyright 2014
 * FG Language Technology
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.tu.darmstadt.lt.ner.reader;

import com.google.common.collect.Lists;
import de.tu.darmstadt.lt.ner.types.FreeBaseMatch;
import de.tu.darmstadt.lt.ner.types.GoldNamedEntity;
import de.tu.darmstadt.lt.ner.types.PositionInSentence;
import de.tu.darmstadt.lt.ner.util.Configuration;
import de.tu.darmstadt.lt.ner.util.GenerateNgram;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import lombok.val;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;

import java.io.*;
import java.net.URL;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

@Slf4j
public class NERReader extends JCasAnnotator_ImplBase {

    public static final String CONLL_VIEW = "ConnlView";
    static final String FREEBASE_FILENAME = "freebase_2502.txt3";

    public static final String CONFIGURATION = "configuration";
    @ConfigurationParameter(name = CONFIGURATION)
    private Configuration configuration = null;

    public static final String DATA_ZIP_URL = "DataZipURL";
    @ConfigurationParameter(name = DATA_ZIP_URL, mandatory = false)
    private static URL dataZipURL = null;

    private Logger logger = null;

    @Override
    public void initialize(UimaContext context)
            throws ResourceInitializationException {
        super.initialize(context);
        logger = context.getLogger();
    }

    @Override
    public void process(JCas jcas)
            throws AnalysisEngineProcessException {
        log.info("NERReader#process");

        JCas docView;
        String tbText;
        try {
            docView = jcas.getView(CAS.NAME_DEFAULT_SOFA);
            tbText = jcas.getView(CONLL_VIEW).getDocumentText();
        } catch (CASException e) {
            throw new AnalysisEngineProcessException(e);
        }
        // a new sentence always starts with a new line
        if (tbText.charAt(0) != '\n') {
            tbText = "\n" + tbText;
        }

        String[] tokens = tbText.split("(\r\n|\n)");
        Sentence sentence = null;
        int idx = 0;
        Token token = null;
        GoldNamedEntity namedEntityTag;
        String namedEntity;
        List<FreeBaseMatch> freeBaseMatches = Lists.newLinkedList();
        int positionIndex = 0;
        boolean initSentence = false;
        StringBuffer docText = new StringBuffer();
        StringBuffer sentenceSb = new StringBuffer();

        for (String line : tokens) {

            // new sentence if there's a new line
            if (line.equals("")) {
                if (sentence != null && token != null) {
                    terminateSentence(sentence, token, docText);
                    docText.append("\n");
                    idx++;
                    if (configuration.useFreeBase()) {
                        saveFreeBaseMatches(sentenceSb, freeBaseMatches);
                        freeBaseMatches.clear();
                    }
                    positionIndex = 0;
                }
                // init new sentence with the next recognized token
                initSentence = true;
                sentenceSb = new StringBuffer();
            } else {
                String[] tag = line.split("\\t");
                String word = tag[0];
                namedEntity = tag[tag.length - 1];



                docText.append(word);
                sentenceSb.append(word + " ");

                // if (!word.matches("^(\\p{Punct}).*")) {
                token = new Token(docView, idx, idx + word.length());
                namedEntityTag = new GoldNamedEntity(docView, idx, idx + word.length());
                // sw=new SimilarWord1(docView, idx, idx + word.length());
                if (configuration.useFreeBase()) {
                    freeBaseMatches.add(new FreeBaseMatch(docView, idx, idx + word.length()));
                }

                if (configuration.usePosition()) {

                    val positition = new PositionInSentence(docView, idx, idx + word.length());

                    positition.setPosition(positionIndex);
                    positition.addToIndexes();

                    positionIndex++;
                }

                docText.append(" ");
                idx++;

                // start new sentence
                if (initSentence) {
                    sentence = new Sentence(docView);
                    sentence.setBegin(token.getBegin());
                    initSentence = false;
                }
                // increment actual index of text
                idx += word.length();
                namedEntityTag.setNamedEntityType(namedEntity);

                // sw.setValue(tag[16]);

                // sw.addToIndexes();

                namedEntityTag.addToIndexes();
                token.addToIndexes();

                logger.log(Level.FINE,
                        "Token: [" + docText.substring(token.getBegin(), token.getEnd()) + "]"
                                + token.getBegin() + "\t" + token.getEnd());
                logger.log(
                        Level.FINE,
                        "NamedEnity: ["
                                + docText.substring(namedEntityTag.getBegin(),
                                namedEntityTag.getEnd()) + "]" + namedEntityTag.getBegin()
                                + "\t" + namedEntityTag.getEnd());
            }
        }

        if (configuration.useFreeBase()) {
            saveFreeBaseMatches(sentenceSb, freeBaseMatches);
        }

        if (sentence != null && token != null) {
            terminateSentence(sentence, token, docText);
        }

        docView.setSofaDataString(docText.toString(), "text/plain");
    }

    @SneakyThrows(ExecutionException.class)
    private List<String> getngramBasedFreebaseList(StringBuffer sentenceSb) {
        List<String> tagsForSentence = Lists.newArrayList();

        val freeBaseMap = NERLookupCaching.twoColumMappingCache().get("freebase_2502.txt3");

        // do 1-5 gram freebase checklists
        outer:
        for (String sentToken : sentenceSb.toString().trim().split(" ")) {
            for (int i = 5; i > 0; i--) {
                try {
                    for (String nGramToken : GenerateNgram.generateNgramsUpto(
                            sentenceSb.toString(), i)) {
                        if (nGramToken.split(" ").length == 0 && !nGramToken.equals(sentToken)) {
                            continue;
                        }
                        if (nGramToken.contains(sentToken) && freeBaseMap.get(nGramToken) != null) {
                            if (nGramToken.startsWith(sentToken)) {
                                String tag = "B-" + freeBaseMap.get(nGramToken);
                                tagsForSentence.add(tag);
                                continue outer;
                            } else {
                                String tag = "I-" + freeBaseMap.get(nGramToken);
                                tagsForSentence.add(tag);
                                continue outer;
                            }
                        }
                    }
                } catch (Exception e) {
                    tagsForSentence.add("none");
                    continue outer;
                }
            }
            tagsForSentence.add("none");
        }

        return tagsForSentence;
    }

    private void saveFreeBaseMatches(StringBuffer sentenceSb, List<FreeBaseMatch> featureStructures) {

        if(sentenceSb.length() == 0) {
            return;
        }

        List<String> freeBaseTags = getngramBasedFreebaseList(sentenceSb);

        if (freeBaseTags.size() != featureStructures.size()) {
            throw new RuntimeException(String.format("length mismatch! %d != %d%n%s%n%s%n\"%s\"%n",
                    freeBaseTags.size(), featureStructures.size(), freeBaseTags,
                    featureStructures, sentenceSb));
        }

        Iterator<String> tagsIter = freeBaseTags.iterator();

        for (FreeBaseMatch freeBaseMatch : featureStructures) {
            freeBaseMatch.setBITag(tagsIter.next());
            freeBaseMatch.addToIndexes();
        }
    }

    private void terminateSentence(Sentence sentence, Token token, StringBuffer docText) {
        sentence.setEnd(token.getEnd());
        sentence.addToIndexes();
        logger.log(Level.FINE,
                "Sentence:[" + docText.substring(sentence.getBegin(), sentence.getEnd()) + "]\t"
                        + sentence.getBegin() + "\t" + sentence.getEnd());
    }

    //TODO: make this method obsoltete (no proper resource management)
    public Reader getReader(String aName)
            throws IOException {
        InputStream is;
        if (dataZipURL != null) {
            is = dataZipURL.openStream();
        } else {
            is = ClassLoader.getSystemResourceAsStream("data.zip");
        }

        ZipInputStream zis = new ZipInputStream(is);

        ZipEntry entry = zis.getNextEntry();
        while (entry != null) {
            if (entry.toString().equals(aName)) {
                return new BufferedReader(new InputStreamReader(zis));

            }
            entry = zis.getNextEntry();
        }
        return null;
    }
}