package cmu.arktweetnlp;

import cmu.arktweetnlp.impl.ModelSentence;
import cmu.arktweetnlp.impl.Sentence;
import cmu.arktweetnlp.impl.features.WordClusterPaths;
import cmu.arktweetnlp.io.CoNLLReader;
import cmu.arktweetnlp.io.JsonTweetReader;
import cmu.arktweetnlp.util.BasicFileIO;
import edu.stanford.nlp.util.StringUtils;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.OutputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashSet;

/* loaded from: input_file:cmu/arktweetnlp/RunTagger.class */
/**
 * Command-line driver for the tagger: reads tweets (one per line, or a CoNLL file),
 * tokenizes them with {@link Twokenize}, optionally tags them with a {@link Tagger},
 * and writes the result to {@link #outputStream}.
 *
 * <p>Typical programmatic use mirrors {@link #main(String[])}: set the option fields,
 * call {@link #finalizeOptions()}, then {@link #runTagger()}.
 */
public class RunTagger {
    /** Help text printed by {@link #usage(String)}. */
    private static final String USAGE_TEXT =
            "RunTagger [options] [ExamplesFilename]\n"
            + "  runs the CMU ARK Twitter tagger on tweets from ExamplesFilename, \n"
            + "  writing taggings to standard output. Listens on stdin if no input filename.\n"
            + "\n"
            + "Options:\n"
            + "  --model <Filename>        Specify model filename. (Else use built-in.)\n"
            + "  --just-tokenize           Only run the tokenizer; no POS tags.\n"
            + "  --quiet                   Quiet: no output\n"
            + "  --input-format <Format>   Default: auto\n"
            + "                            Options: json, text, conll\n"
            + "  --output-format <Format>  Default: automatically decide from input format.\n"
            + "                            Options: pretsv, conll\n"
            + "  --input-field NUM         Default: 1\n"
            + "                            Which tab-separated field contains the input\n"
            + "                            (1-indexed, like unix 'cut')\n"
            + "                            Only for {json, text} input formats.\n"
            + "  --word-clusters <File>    Alternate word clusters file (see FeatureExtractor)\n"
            + "  --no-confidence           Don't output confidence probabilities\n"
            + "  --decoder <Decoder>       Change the decoding algorithm (default: greedy)\n"
            + "\n"
            + "Tweet-per-line input formats:\n"
            + "   json: Every input line has a JSON object containing the tweet,\n"
            + "         as per the Streaming API. (The 'text' field is used.)\n"
            + "   text: Every input line has the text for one tweet.\n"
            + "We actually assume input lines are TSV and the tweet data is one field.\n"
            + "(Therefore tab characters are not allowed in tweets.\n"
            + "Twitter's own JSON formats guarantee this;\n"
            + "if you extract the text yourself, you must remove tabs and newlines.)\n"
            + "Tweet-per-line output format is\n"
            + "   pretsv: Prepend the tokenization and tagging as new TSV fields, \n"
            + "           so the output includes a complete copy of the input.\n"
            + "By default, three TSV fields are prepended:\n"
            + "   Tokenization \\t POSTags \\t Confidences \\t (original data...)\n"
            + "The tokenization and tags are parallel space-separated lists.\n"
            + "The 'conll' format is token-per-line, blank spaces separating tweets.\n";

    /** Tagger instance; (re)created at the start of every {@link #runTagger()} call. */
    Tagger tagger;
    /** Path of the input file; {@link #main(String[])} uses "/dev/stdin" when none is given. */
    String inputFilename;
    /** Lazily built cache backing {@link #wordsInCluster()}. */
    private static HashSet<String> _wordsInCluster;
    /** One of "auto", "json", "text" or "conll"; "auto" is resolved from the first input line. */
    String inputFormat = "auto";
    /** "auto" until {@link #finalizeOptions()} resolves it to "conll" or "pretsv". */
    String outputFormat = "auto";
    /** 1-indexed tab-separated field of each input line that holds the tweet. */
    int inputField = 1;
    /** Model location handed to {@code Tagger.loadModel}. */
    String modelFilename = "/cmu/arktweetnlp/model.20120919";
    /** When true, evaluation mode does not print the per-token tagging. */
    public boolean noOutput = false;
    /** When true, no model is loaded and only tokenization is performed. */
    public boolean justTokenize = false;
    /** Decoding algorithm used by {@link #goDecode(ModelSentence)}. */
    public Decoder decoder = Decoder.GREEDY;
    /** Whether per-token confidences are requested from the decoder and printed. */
    public boolean showConfidence = true;
    /** Set to the sentences read from the CoNLL file in evaluation mode. */
    Iterable<Sentence> inputIterable = null;
    // Running evaluation counters; they accumulate across calls and are never reset here.
    int numTokensCorrect = 0;
    int numTokens = 0;
    int oovTokensCorrect = 0;
    int oovTokens = 0;
    int clusterTokensCorrect = 0;
    int clusterTokens = 0;
    /** Destination for taggings: standard output, auto-flushing, encoded as UTF-8. */
    PrintStream outputStream;

    /** Decoding algorithms selectable with {@code --decoder}. */
    public enum Decoder {
        GREEDY,
        VITERBI
    }

    /**
     * Creates a runner with default options that writes UTF-8 to standard output.
     *
     * <p>The constructor must be explicit: the {@link PrintStream} constructor taking an
     * encoding name declares a checked exception, which an implicit default constructor
     * cannot propagate.
     *
     * @throws java.io.UnsupportedEncodingException if the runtime rejects the "UTF-8" name
     */
    public RunTagger() throws java.io.UnsupportedEncodingException {
        this.outputStream = new PrintStream((OutputStream) System.out, true, "UTF-8");
    }

    /**
     * Prints {@code str} to standard error and terminates the JVM with exit status -1.
     *
     * @param str message to print
     */
    public static void die(String str) {
        System.err.println(str);
        System.exit(-1);
    }

    /**
     * Sets {@link #inputFormat} to "json" or "text" depending on whether
     * {@code JsonTweetReader.isJson} accepts the sample, and reports the choice on stderr.
     *
     * @param str sample input (the selected field of the first line)
     * @throws IOException declared for callers; propagated from the JSON probe if it throws
     */
    public void detectAndSetInputFormat(String str) throws IOException {
        if (new JsonTweetReader().isJson(str)) {
            System.err.println("Detected JSON input format");
            this.inputFormat = "json";
        } else {
            System.err.println("Detected text input format");
            this.inputFormat = "text";
        }
    }

    /**
     * Runs the tagger over {@link #inputFilename}.
     *
     * <p>For "conll" input this delegates to {@link #runTaggerInEvalMode()}. Otherwise each
     * line is split on tabs, field {@link #inputField} is taken as the tweet (or as a JSON
     * object holding it), the text is tokenized and, unless {@link #justTokenize} is set,
     * tagged. Output goes through {@link #outputJustTagging} for "conll" output and
     * {@link #outputPrependedTagging} otherwise. Throughput statistics are printed to stderr.
     *
     * @throws IOException if reading the model or the input fails
     * @throws ClassNotFoundException declared by the model-loading / evaluation path
     */
    public void runTagger() throws IOException, ClassNotFoundException {
        this.tagger = new Tagger();
        if (!this.justTokenize) {
            this.tagger.loadModel(this.modelFilename);
        }
        if (this.inputFormat.equals("conll")) {
            runTaggerInEvalMode();
            return;
        }

        JsonTweetReader jsonTweetReader = new JsonTweetReader();
        // try-with-resources so the reader is closed even when tagging or output throws.
        try (LineNumberReader reader =
                new LineNumberReader(BasicFileIO.openFileToReadUTF8(this.inputFilename))) {
            long startMillis = System.currentTimeMillis();
            int tokenCount = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                String field = line.split("\t")[this.inputField - 1];
                // Auto-detection looks only at the first line of the input.
                if (reader.getLineNumber() == 1 && this.inputFormat.equals("auto")) {
                    detectAndSetInputFormat(field);
                }

                String text = field;
                if (this.inputFormat.equals("json")) {
                    text = jsonTweetReader.getText(field);
                    if (text == null) {
                        System.err.println("Warning, null text (JSON parse error?), using blank string instead");
                        text = "";
                    }
                }

                Sentence sentence = new Sentence();
                sentence.tokens = Twokenize.tokenizeRawTweetText(text);

                // Stays null for empty tweets and in tokenize-only mode.
                ModelSentence modelSentence = null;
                if (sentence.T() > 0 && !this.justTokenize) {
                    modelSentence = new ModelSentence(sentence.T());
                    this.tagger.featureExtractor.computeFeatures(sentence, modelSentence);
                    goDecode(modelSentence);
                }

                if (this.outputFormat.equals("conll")) {
                    outputJustTagging(sentence, modelSentence);
                } else {
                    outputPrependedTagging(sentence, modelSentence, this.justTokenize, line);
                }
                tokenCount += sentence.T();
            }

            int tweetCount = reader.getLineNumber();
            double elapsedSeconds = (System.currentTimeMillis() - startMillis) / 1000.0d;
            System.err.printf(
                    "Tokenized%s %d tweets (%d tokens) in %.1f seconds: %.1f tweets/sec, %.1f tokens/sec\n",
                    this.justTokenize ? "" : " and tagged",
                    Integer.valueOf(tweetCount),
                    Integer.valueOf(tokenCount),
                    Double.valueOf(elapsedSeconds),
                    Double.valueOf(tweetCount / elapsedSeconds),
                    Double.valueOf(tokenCount / elapsedSeconds));
        }
    }

    /**
     * Decodes {@code modelSentence} in place with the algorithm selected by {@link #decoder}.
     *
     * @param modelSentence sentence whose features have already been computed
     */
    public void goDecode(ModelSentence modelSentence) {
        if (this.decoder == Decoder.GREEDY) {
            this.tagger.model.greedyDecode(modelSentence, this.showConfidence);
        } else if (this.decoder == Decoder.VITERBI) {
            this.tagger.model.viterbiDecode(modelSentence);
        }
    }

    /**
     * Tags every sentence of the CoNLL file {@link #inputFilename}, prints the taggings
     * unless {@link #noOutput} is set, and reports accuracy and throughput on stderr.
     *
     * <p>Expects {@link #tagger} to be initialised already (as done by {@link #runTagger()}).
     *
     * @throws IOException if the CoNLL file cannot be read
     * @throws ClassNotFoundException declared for callers of the evaluation path
     */
    public void runTaggerInEvalMode() throws IOException, ClassNotFoundException {
        long startMillis = System.currentTimeMillis();
        int sentenceCount = 0;
        ArrayList<Sentence> sentences = CoNLLReader.readFile(this.inputFilename);
        this.inputIterable = sentences;
        for (Sentence sentence : sentences) {
            sentenceCount++;
            ModelSentence modelSentence = new ModelSentence(sentence.T());
            this.tagger.featureExtractor.computeFeatures(sentence, modelSentence);
            goDecode(modelSentence);
            if (!this.noOutput) {
                outputJustTagging(sentence, modelSentence);
            }
            evaluateSentenceTagging(sentence, modelSentence);
        }

        double accuracy = (this.numTokensCorrect * 1.0d) / this.numTokens;
        System.err.printf("%d / %d correct = %.4f acc, %.4f err\n",
                Integer.valueOf(this.numTokensCorrect),
                Integer.valueOf(this.numTokens),
                Double.valueOf(accuracy),
                Double.valueOf(1.0d - accuracy));

        double elapsedSeconds = (System.currentTimeMillis() - startMillis) / 1000.0d;
        System.err.printf("%d tweets in %.1f seconds, %.1f tweets/sec\n",
                Integer.valueOf(sentenceCount),
                Double.valueOf(elapsedSeconds),
                Double.valueOf((sentenceCount * 1.0d) / elapsedSeconds));
    }

    /**
     * Updates {@link #oovTokens} / {@link #oovTokensCorrect} for one sentence.
     *
     * <p>NOTE(review): despite the "OOV" name, a token is counted when it IS contained in
     * {@link #wordsInCluster()}; the {@code clusterTokens*} fields are never updated.
     * Confirm the intended condition before relying on these counters. Currently unused
     * within this class.
     */
    private void evaluateOOV(Sentence sentence, ModelSentence modelSentence) throws FileNotFoundException, IOException, ClassNotFoundException {
        for (int t = 0; t < modelSentence.T; t++) {
            int goldLabel = this.tagger.model.labelVocab.num(sentence.labels.get(t));
            int predictedLabel = modelSentence.labels[t];
            if (wordsInCluster().contains(sentence.tokens.get(t))) {
                this.oovTokensCorrect += goldLabel == predictedLabel ? 1 : 0;
                this.oovTokens++;
            }
        }
    }

    /**
     * Adds one sentence to a confusion matrix indexed as {@code iArr[gold][predicted]}.
     * Tokens whose gold label maps to -1 in the label vocabulary are skipped.
     * Currently unused within this class.
     */
    private void getconfusion(Sentence sentence, ModelSentence modelSentence, int[][] iArr) {
        for (int t = 0; t < modelSentence.T; t++) {
            int goldLabel = this.tagger.model.labelVocab.num(sentence.labels.get(t));
            int predictedLabel = modelSentence.labels[t];
            if (goldLabel != -1) {
                iArr[goldLabel][predictedLabel]++;
            }
        }
    }

    /**
     * Compares predicted labels against the gold labels of {@code sentence} and adds the
     * result to {@link #numTokensCorrect} and {@link #numTokens}.
     *
     * @param sentence sentence carrying gold labels
     * @param modelSentence decoded sentence carrying predicted label ids
     */
    public void evaluateSentenceTagging(Sentence sentence, ModelSentence modelSentence) {
        for (int t = 0; t < modelSentence.T; t++) {
            int goldLabel = this.tagger.model.labelVocab.num(sentence.labels.get(t));
            if (goldLabel == modelSentence.labels[t]) {
                this.numTokensCorrect++;
            }
            this.numTokens++;
        }
    }

    /** Formats a confidence value with four decimal places. */
    private String formatConfidence(double d) {
        return String.format("%.4f", Double.valueOf(d));
    }

    /**
     * Writes one token per line as {@code token \t tag [\t confidence]}, followed by a
     * blank line. Calls {@link #die(String)} unless {@link #outputFormat} is "conll".
     *
     * @param sentence tokenized sentence
     * @param modelSentence decoded sentence; confidences are printed only when present
     */
    public void outputJustTagging(Sentence sentence, ModelSentence modelSentence) {
        if (!this.outputFormat.equals("conll")) {
            die("bad output format for just tagging: " + this.outputFormat);
            return;
        }
        for (int t = 0; t < sentence.T(); t++) {
            this.outputStream.printf("%s\t%s", sentence.tokens.get(t), this.tagger.model.labelVocab.name(modelSentence.labels[t]));
            if (modelSentence.confidences != null) {
                this.outputStream.printf("\t%s", formatConfidence(modelSentence.confidences[t]));
            }
            this.outputStream.print("\n");
        }
        this.outputStream.println("");
    }

    /**
     * Writes one line consisting of the joined tokens, then (unless {@code z}) the joined
     * tags, then (if {@link #showConfidence}) the joined confidences, then the original
     * input line, each followed by a tab except the last.
     *
     * <p>Lists are joined with {@code StringUtils.join}; the usage text describes them as
     * space-separated.
     *
     * @param sentence tokenized sentence
     * @param modelSentence decoded sentence; only dereferenced when the sentence has tokens
     * @param z true to omit the tag column (tokenize-only mode)
     * @param str the original input line, appended unchanged
     */
    public void outputPrependedTagging(Sentence sentence, ModelSentence modelSentence, boolean z, String str) {
        int T = sentence.T();
        String[] tokens = new String[T];
        String[] tags = new String[T];
        String[] confidences = new String[T];
        for (int t = 0; t < T; t++) {
            tokens[t] = sentence.tokens.get(t);
            if (!z) {
                tags[t] = this.tagger.model.labelVocab.name(modelSentence.labels[t]);
            }
            if (this.showConfidence) {
                confidences[t] = formatConfidence(modelSentence.confidences[t]);
            }
        }

        StringBuilder sb = new StringBuilder();
        sb.append(StringUtils.join(tokens));
        sb.append("\t");
        if (!z) {
            sb.append(StringUtils.join(tags));
            sb.append("\t");
        }
        if (this.showConfidence) {
            sb.append(StringUtils.join(confidences));
            sb.append("\t");
        }
        sb.append(str);
        this.outputStream.println(sb.toString());
    }

    /**
     * Command-line entry point; see {@link #usage()} for the accepted options.
     *
     * @param strArr command-line arguments
     * @throws IOException if the model or the input cannot be read
     * @throws ClassNotFoundException declared by {@link #runTagger()}
     */
    public static void main(String[] strArr) throws IOException, ClassNotFoundException {
        if (strArr.length > 0 && (strArr[0].equals("-h") || strArr[0].equals("--help"))) {
            usage();
        }
        RunTagger runTagger = new RunTagger();
        int i = 0;
        // A bare "-" is the stdin placeholder, not an option, so it ends option parsing.
        while (i < strArr.length && strArr[i].startsWith("-") && !strArr[i].equals("-")) {
            String option = strArr[i];
            switch (option) {
                case "--model":
                    runTagger.modelFilename = optionValue(strArr, i);
                    i += 2;
                    break;
                case "--just-tokenize":
                    runTagger.justTokenize = true;
                    i++;
                    break;
                case "--decoder": {
                    String decoderName = optionValue(strArr, i);
                    if (decoderName.equals("viterbi")) {
                        runTagger.decoder = Decoder.VITERBI;
                    } else if (decoderName.equals("greedy")) {
                        runTagger.decoder = Decoder.GREEDY;
                    } else {
                        die("unknown decoder " + decoderName);
                    }
                    i += 2;
                    break;
                }
                case "--quiet":
                    runTagger.noOutput = true;
                    i++;
                    break;
                case "--input-format": {
                    String format = optionValue(strArr, i);
                    if (!format.equals("json") && !format.equals("text") && !format.equals("conll")) {
                        usage("input format must be: json, text, or conll");
                    }
                    runTagger.inputFormat = format;
                    i += 2;
                    break;
                }
                case "--output-format":
                    runTagger.outputFormat = optionValue(strArr, i);
                    i += 2;
                    break;
                case "--input-field":
                    runTagger.inputField = parseInputField(optionValue(strArr, i));
                    i += 2;
                    break;
                case "--word-clusters":
                    WordClusterPaths.clusterResourceName = optionValue(strArr, i);
                    // Skip the option AND its value; advancing by one left the cluster
                    // file name to be mistaken for the input file name.
                    i += 2;
                    break;
                case "--no-confidence":
                    runTagger.showConfidence = false;
                    i++;
                    break;
                default:
                    System.out.println("bad option " + option);
                    usage();
                    break;
            }
        }
        // At most one positional argument (the input file) may remain.
        if (strArr.length - i > 1) {
            usage();
        }
        if (strArr.length == i || strArr[i].equals("-")) {
            System.err.println("Listening on stdin for input.  (-h for help)");
            runTagger.inputFilename = "/dev/stdin";
        } else {
            runTagger.inputFilename = strArr[i];
        }
        runTagger.finalizeOptions();
        runTagger.runTagger();
    }

    /** Returns the value following option {@code args[index]}, or exits via usage if absent. */
    private static String optionValue(String[] args, int index) {
        if (index + 1 >= args.length) {
            usage("missing value for option " + args[index]);
        }
        return args[index + 1];
    }

    /** Parses a 1-indexed field number, exiting via usage if it is not a positive integer. */
    private static int parseInputField(String value) {
        int field = 0;
        try {
            field = Integer.parseInt(value);
        } catch (NumberFormatException e) {
            usage("input field must be a positive integer, got: " + value);
        }
        if (field < 1) {
            usage("input field must be a positive integer, got: " + value);
        }
        return field;
    }

    /**
     * Resolves option defaults that depend on one another: an "auto" output format becomes
     * "conll" for CoNLL input and "pretsv" otherwise, and confidence output is disabled
     * for the Viterbi decoder and in tokenize-only mode.
     *
     * @throws IOException never thrown by the current implementation; kept for callers
     */
    public void finalizeOptions() throws IOException {
        if (this.outputFormat.equals("auto")) {
            this.outputFormat = this.inputFormat.equals("conll") ? "conll" : "pretsv";
        }
        if (this.showConfidence && this.decoder == Decoder.VITERBI) {
            System.err.println("Confidence output is unimplemented in Viterbi, turning it off.");
            this.showConfidence = false;
        }
        if (this.justTokenize) {
            this.showConfidence = false;
        }
    }

    /** Prints the help text to standard output and exits with status 1. */
    public static void usage() {
        usage(null);
    }

    /**
     * Prints the help text, followed by {@code "ERROR: " + str} when {@code str} is
     * non-null, to standard output and exits with status 1.
     *
     * @param str optional error message; may be null
     */
    public static void usage(String str) {
        System.out.println(USAGE_TEXT);
        if (str != null) {
            System.out.println("ERROR: " + str);
        }
        System.exit(1);
    }

    /**
     * Returns the set of words that have a cluster path, copied from
     * {@code WordClusterPaths.wordToPath} on first use and cached afterwards.
     *
     * @return the cached word set
     */
    public static HashSet<String> wordsInCluster() {
        if (_wordsInCluster == null) {
            _wordsInCluster = new HashSet<>(WordClusterPaths.wordToPath.keySet());
        }
        return _wordsInCluster;
    }
}
