package org.dice_research.topicmodeling.preprocessing.docsupplier.decorator;

import java.util.ArrayList;
import java.util.List;
import org.dice_research.topicmodeling.lang.Term;
import org.dice_research.topicmodeling.preprocessing.docsupplier.DocumentSupplier;
import org.dice_research.topicmodeling.utils.doc.Document;
import org.dice_research.topicmodeling.utils.doc.DocumentText;
import org.dice_research.topicmodeling.utils.doc.DocumentTextWithTermInfo;
import org.dice_research.topicmodeling.utils.doc.TermTokenizedText;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/dice_research/topicmodeling/preprocessing/docsupplier/decorator/DocumentTextWithTermInfoParsingSupplierDecorator.class */
/**
 * Decorator that parses the {@link DocumentTextWithTermInfo} property of a document and adds a
 * {@link DocumentText} (the plain text) and a {@link TermTokenizedText} (the parsed terms) to it.
 *
 * <p>
 * The parsed encoding uses the characters defined in
 * {@link DocumentTextWithTermInfoCreatingSupplierDecorator}: a term starts with
 * {@code TERM_START_CHAR}, ends with {@code TERM_END_CHAR} and consists of four parts separated by
 * {@code SEPARATION_CHAR}. A character that directly follows {@code ESCAPE_CHAR} is always taken
 * literally.
 * </p>
 */
public class DocumentTextWithTermInfoParsingSupplierDecorator extends AbstractDocumentSupplierDecorator {
    private static final Logger LOGGER = LoggerFactory.getLogger(DocumentTextWithTermInfoParsingSupplierDecorator.class);

    /** Marker of the older encoding; its presence only triggers a warning. */
    private static final String DEPRECATED_ENCODING_START = "<term><label>";

    /** Number of separator-terminated parts that precede the last (properties) part of a term. */
    private static final int LEADING_TERM_PARTS = 3;

    public DocumentTextWithTermInfoParsingSupplierDecorator(DocumentSupplier documentSupplier) {
        super(documentSupplier);
    }

    /**
     * Parses the {@link DocumentTextWithTermInfo} property of the given document.
     *
     * @param document the document that should be processed
     * @return the given document with the added properties, or {@code null} if the document has
     *         no {@link DocumentTextWithTermInfo} property
     */
    @Override
    protected Document prepareDocument(Document document) {
        DocumentTextWithTermInfo textWithTermInfo = (DocumentTextWithTermInfo) document
                .getProperty(DocumentTextWithTermInfo.class);
        if (textWithTermInfo == null) {
            LOGGER.error("Got a document without the needed DocumentTextWithTermInfo property. Ignoring this document.");
            return null;
        }
        createTextAndTerms(textWithTermInfo, document);
        return document;
    }

    /** Returns {@code true} if the given text contains the marker of the deprecated encoding. */
    private boolean containsForDeprecatedEncode(String str) {
        return str.contains(DEPRECATED_ENCODING_START);
    }

    /**
     * Runs the parser over the encoded text and adds the resulting {@link DocumentText} and
     * {@link TermTokenizedText} to the document.
     *
     * @param documentTextWithTermInfo the property containing the encoded text
     * @param document                 the document to which the results are added
     */
    private void createTextAndTerms(DocumentTextWithTermInfo documentTextWithTermInfo, Document document) {
        String textWithTerms = documentTextWithTermInfo.getTextWithTerms();
        if (containsForDeprecatedEncode(textWithTerms)) {
            LOGGER.warn("The DocumentTextWithTermInfo value contains a deprecated encoding. It is possible that the parsing won't find any terms.");
        }

        StringBuilder text = new StringBuilder();
        StringBuilder termPart = new StringBuilder();
        // Points to termPart while inside a term and to text otherwise.
        StringBuilder current = text;
        TermTokenizedText termTokenizedText = new TermTokenizedText();
        List<Term> terms = termTokenizedText.getTermTokenizedText();
        List<String> parts = new ArrayList<String>(4);
        boolean escaped = false;
        boolean inTerm = false;

        for (char c : textWithTerms.toCharArray()) {
            if (escaped) {
                // Whatever follows the escape character is literal. Clearing the flag here (for
                // every character, not only the special ones) prevents a stale escape from
                // changing the meaning of a later special character.
                current.append(c);
                escaped = false;
                continue;
            }
            switch (c) {
                case DocumentTextWithTermInfoCreatingSupplierDecorator.ESCAPE_CHAR:
                    escaped = true;
                    break;
                case DocumentTextWithTermInfoCreatingSupplierDecorator.TERM_START_CHAR:
                    if (inTerm) {
                        LOGGER.warn("Found an unescaped term start character '{}' inside of a term. It will be handled like an escaped one.", c);
                        current.append(c);
                    } else {
                        inTerm = true;
                        current = termPart;
                    }
                    break;
                case DocumentTextWithTermInfoCreatingSupplierDecorator.TERM_END_CHAR:
                    if (inTerm) {
                        // termPart holds the last part of the term (its properties).
                        addTerm(parts, termPart.toString(), terms, text);
                        termPart.setLength(0);
                        parts.clear();
                        current = text;
                        inTerm = false;
                    } else {
                        LOGGER.warn("Found an unescaped term end character '{}' outside of a term. It will be handled like an escaped one.", c);
                        current.append(c);
                    }
                    break;
                case DocumentTextWithTermInfoCreatingSupplierDecorator.SEPARATION_CHAR:
                    if (inTerm) {
                        parts.add(termPart.toString());
                        termPart.setLength(0);
                    } else {
                        LOGGER.warn("Found an unescaped separation character '{}' outside of a term. It will be handled like an escaped one.", c);
                        current.append(c);
                    }
                    break;
                default:
                    current.append(c);
                    break;
            }
        }
        // inTerm also covers a term that was opened right before the end of the text and
        // therefore has neither collected parts nor buffered characters.
        if (inTerm) {
            LOGGER.warn("There was an unclosed term at the end of the text. It will be ignored.");
        }
        document.addProperty(new DocumentText(text.toString()));
        document.addProperty(termTokenizedText);
    }

    /**
     * Creates a term from its parsed parts, adds it to the term list and appends its word form to
     * the plain text. A term with a wrong number of parts or with properties that cannot be
     * processed is logged and ignored.
     *
     * @param leadingParts  the parts of the term that were terminated by a separation character;
     *                      the first two are passed to the {@link Term} constructor, the third is
     *                      used as POS tag
     * @param propertyValue the last part of the term, parsed as a {@code long}
     * @param terms         the list to which the term is added
     * @param text          the plain text to which the word form of the term is appended
     */
    private void addTerm(List<String> leadingParts, String propertyValue, List<Term> terms, StringBuilder text) {
        if (leadingParts.size() != LEADING_TERM_PARTS) {
            LOGGER.warn("Got a term with {} instead of the expected 4 parts. The term will be ignored.",
                    leadingParts.size() + 1);
            return;
        }
        Term term = new Term(leadingParts.get(0), leadingParts.get(1));
        term.setPosTag(leadingParts.get(2));
        try {
            term.prop.set(Long.parseLong(propertyValue));
            terms.add(term);
            text.append(term.getWordForm());
        } catch (Exception e) {
            LOGGER.error("Couldn't parse the properties of the term from \"" + propertyValue
                    + "\". The term will be ignored.", e);
        }
    }

    /**
     * Removes the escaping from the given string. Every escape character is dropped and the
     * character that directly follows it is kept literally (including a second escape character).
     * An escape character at the very end of the string is dropped.
     *
     * @param str the escaped string; must not be {@code null}
     * @return the unescaped string
     */
    public static String unescapeString(String str) {
        StringBuilder result = new StringBuilder(str.length());
        boolean escaped = false;
        for (char c : str.toCharArray()) {
            if (escaped) {
                // Clear the flag for every character so that an escape in front of an ordinary
                // character does not leak onto a later escape character.
                result.append(c);
                escaped = false;
            } else if (c == DocumentTextWithTermInfoCreatingSupplierDecorator.ESCAPE_CHAR) {
                escaped = true;
            } else {
                result.append(c);
            }
        }
        return result.toString();
    }
}
