package org.dice_research.topicmodeling.preprocessing.docsupplier.decorator;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import java.nio.charset.Charset;
import org.dice_research.topicmodeling.preprocessing.docsupplier.DocumentSupplier;
import org.dice_research.topicmodeling.utils.doc.Document;
import org.dice_research.topicmodeling.utils.doc.DocumentCharset;
import org.dice_research.topicmodeling.utils.doc.DocumentRawData;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/dice_research/topicmodeling/preprocessing/docsupplier/decorator/CharsetDeterminingSupplierDecorator.class */
/**
 * Decorator that tries to determine the charset of a document's raw bytes and,
 * if successful, attaches the result to the document as a
 * {@link DocumentCharset} property.
 *
 * <p>Documents without a {@link DocumentRawData} property, or whose charset
 * cannot be determined, are passed through without a {@link DocumentCharset}
 * being added.
 */
public class CharsetDeterminingSupplierDecorator extends AbstractDocumentSupplierDecorator {
    private static final Logger LOGGER = LoggerFactory.getLogger(CharsetDeterminingSupplierDecorator.class);

    /**
     * Creates a decorator wrapping the given supplier.
     *
     * @param documentSupplier the decorated document supplier
     */
    public CharsetDeterminingSupplierDecorator(DocumentSupplier documentSupplier) {
        super(documentSupplier);
    }

    /**
     * Adds a {@link DocumentCharset} property to the document if the charset of
     * its {@link DocumentRawData} could be detected.
     *
     * @param document the document to process
     * @return the same document instance, possibly with an added
     *         {@link DocumentCharset} property
     */
    @Override
    protected Document prepareDocument(Document document) {
        DocumentRawData rawData = document.getProperty(DocumentRawData.class);
        if (rawData == null) {
            LOGGER.error("Got a document (#{}) without the needed DocumentRawData property. Ignoring it.",
                    document.getDocumentId());
            return document;
        }
        Charset charset = detectCharset(rawData.getData());
        if (charset != null) {
            document.addProperty(new DocumentCharset(charset));
        }
        return document;
    }

    /**
     * Tries to detect the charset of the given bytes.
     *
     * @param bArr the raw bytes to analyse; may be {@code null}
     * @return the detected charset, or {@code null} if the data is
     *         {@code null}, no charset matched, or the detected charset name
     *         could not be resolved to a {@link Charset}
     */
    private Charset detectCharset(byte[] bArr) {
        // Guard explicitly: the detector is not handed a null array, so a
        // document with missing data stays a best-effort "no charset" case.
        if (bArr == null) {
            LOGGER.warn("Couldn't determine the charset of the given data. Returning null.");
            return null;
        }
        try {
            CharsetDetector charsetDetector = new CharsetDetector();
            charsetDetector.setText(bArr);
            CharsetMatch match = charsetDetector.detect();
            // detect() yields null when no charset matches at all; handle that
            // explicitly instead of relying on a caught NullPointerException.
            if (match == null) {
                LOGGER.warn("Couldn't determine the charset of the given data. Returning null.");
                return null;
            }
            return Charset.forName(match.getName());
        } catch (RuntimeException e) {
            // Keeps the original best-effort behaviour (e.g. a charset name the
            // JVM does not support) but preserves the cause in the log.
            LOGGER.warn("Couldn't determine the charset of the given data. Returning null.", e);
            return null;
        }
    }
}
