package crawlercommons.robots;

import crawlercommons.robots.SimpleRobotRules;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:crawlercommons/robots/SimpleRobotRulesParser.class */
public class SimpleRobotRulesParser extends BaseRobotsParser {
    private static final Logger LOGGER = LoggerFactory.getLogger(SimpleRobotRulesParser.class);

    /**
     * Maps the lower-case textual prefix of a robots.txt line (e.g. "user-agent", "disallow",
     * plus a few common misspellings) to the directive it stands for. Filled once by the static
     * initializer and only read afterwards.
     */
    private static final Map<String, RobotDirective> DIRECTIVE_PREFIX = new HashMap<String, RobotDirective>();

    // The patterns below are assigned in the static initializer at the bottom of the class.

    /** Matches optional blanks, a colon, optional blanks, and captures the rest as group 1. */
    private static final Pattern COLON_DIRECTIVE_DELIMITER;

    /** Matches at least one blank and captures the rest as group 1 (directive written without a colon). */
    private static final Pattern BLANK_DIRECTIVE_DELIMITER;

    /** Skips the variable tail of a prefix-style directive name and captures what follows it. */
    private static final Pattern DIRECTIVE_SUFFIX_PATTERN;

    /** Detects an opening html, head or body tag anywhere in the content. */
    private static final Pattern SIMPLE_HTML_PATTERN;

    /** Detects a "user-agent:" field (case-insensitive) anywhere in the content. */
    private static final Pattern USER_AGENT_PATTERN;

    /** Warning count at which reportWarning stops logging the individual messages. */
    private static final int MAX_WARNINGS = 5;

    /** Largest crawl delay accepted by parseContent; larger values disallow everything. */
    private static final long MAX_CRAWL_DELAY = 300000;

    /** Number of warnings raised so far; parseContent resets it to zero on every call. */
    private int _numWarnings;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:crawlercommons/robots/SimpleRobotRulesParser$ParseState.class */
    /**
     * Mutable bookkeeping for a single pass over a robots.txt file: which kind of user-agent
     * match has been seen so far, whether rules are currently being collected, and the rule set
     * that is being built.
     */
    public static class ParseState {
        /** True once a user-agent line matched one of the target robot names. */
        private boolean _matchedRealName;
        /** True once a "*" user-agent line has been accepted. */
        private boolean _matchedWildcard;
        /** True while rule lines should be added to the rule set. */
        private boolean _addingRules;
        /** True once a non-user-agent line ended the current run of agent fields. */
        private boolean _finishedAgentFields;
        /** True when all remaining rule lines belong to other agents and must be ignored. */
        private boolean _skipAgents;

        private final String _url;
        private final String _targetName;
        private final SimpleRobotRules _curRules = new SimpleRobotRules();

        /**
         * @param str URL of the robots.txt file being parsed
         * @param str2 name(s) of the robot the rules are extracted for
         */
        public ParseState(String str, String str2) {
            _url = str;
            _targetName = str2;
        }

        // ---- immutable inputs ----

        public String getUrl() {
            return _url;
        }

        public String getTargetName() {
            return _targetName;
        }

        // ---- matching flags ----

        public boolean isMatchedRealName() {
            return _matchedRealName;
        }

        public void setMatchedRealName(boolean z) {
            _matchedRealName = z;
        }

        public boolean isMatchedWildcard() {
            return _matchedWildcard;
        }

        public void setMatchedWildcard(boolean z) {
            _matchedWildcard = z;
        }

        public boolean isAddingRules() {
            return _addingRules;
        }

        public void setAddingRules(boolean z) {
            _addingRules = z;
        }

        public boolean isFinishedAgentFields() {
            return _finishedAgentFields;
        }

        public void setFinishedAgentFields(boolean z) {
            _finishedAgentFields = z;
        }

        public boolean isSkipAgents() {
            return _skipAgents;
        }

        public void setSkipAgents(boolean z) {
            _skipAgents = z;
        }

        // ---- pass-throughs to the rule set under construction ----

        public SimpleRobotRules getRobotRules() {
            return _curRules;
        }

        public void clearRules() {
            _curRules.clearRules();
        }

        public void addRule(String str, boolean z) {
            _curRules.addRule(str, z);
        }

        public void setCrawlDelay(long j) {
            _curRules.setCrawlDelay(j);
        }

        public void addSitemap(String str) {
            _curRules.addSitemap(str);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:crawlercommons/robots/SimpleRobotRulesParser$RobotDirective.class */
    /**
     * Kinds of lines recognised in a robots.txt file.
     *
     * <p>A "prefix" directive only fixes the beginning of the directive name (the rest of the
     * name is variable). A "special" directive is never looked up by its textual name; it is
     * produced by the tokenizer when nothing else fits.
     */
    public enum RobotDirective {
        USER_AGENT,
        DISALLOW,
        ALLOW,
        CRAWL_DELAY,
        SITEMAP,
        HOST,
        NO_INDEX,
        ACAP_(true, false),
        REQUEST_RATE,
        VISIT_TIME,
        ROBOT_VERSION,
        COMMENT,
        HTTP,
        UNKNOWN(false, true),
        MISSING(false, true);

        private final boolean _prefix;
        private final boolean _special;

        /** Ordinary directive: neither prefix-style nor special. */
        RobotDirective() {
            this(false, false);
        }

        /**
         * @param z whether the constant is a prefix-style directive
         * @param z2 whether the constant is a special (non-textual) directive
         */
        RobotDirective(boolean z, boolean z2) {
            _prefix = z;
            _special = z2;
        }

        /** @return true for directives that are not matched by name */
        public boolean isSpecial() {
            return _special;
        }

        /** @return true for directives whose name is only a prefix of the written directive */
        public boolean isPrefix() {
            return _prefix;
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:crawlercommons/robots/SimpleRobotRulesParser$RobotToken.class */
    /** Immutable pairing of a recognised directive with the text that followed it on the line. */
    public static class RobotToken {
        private final RobotDirective _directive;
        private final String _data;

        /**
         * @param robotDirective directive the line was classified as
         * @param str text associated with the directive
         */
        public RobotToken(RobotDirective robotDirective, String str) {
            _directive = robotDirective;
            _data = str;
        }

        /** @return the directive this token was classified as */
        public RobotDirective getDirective() {
            return _directive;
        }

        /** @return the text associated with the directive */
        public String getData() {
            return _data;
        }
    }

    /**
     * Classifies a single, already trimmed and non-empty robots.txt line.
     *
     * <p>The line is compared against every known directive prefix, ignoring case. The
     * comparison is done with {@link String#regionMatches(boolean, int, String, int, int)},
     * which does not depend on the JVM default locale, so e.g. "DISALLOW" is still recognised
     * when the default locale is Turkish. After the prefix, either a colon or at least one blank
     * must separate the directive from its value.
     *
     * @param str the line to classify
     * @return a token holding the matched directive and its trimmed value; if no directive
     *         matches, a token holding the whole line, typed {@code UNKNOWN} when the line
     *         contains a colon and {@code MISSING} otherwise
     */
    private static RobotToken tokenize(String str) {
        for (Map.Entry<String, RobotDirective> entry : DIRECTIVE_PREFIX.entrySet()) {
            String prefix = entry.getKey();
            if (!str.regionMatches(true, 0, prefix, 0, prefix.length())) {
                continue;
            }
            RobotDirective directive = entry.getValue();
            String remainder = str.substring(prefix.length());
            if (directive.isPrefix()) {
                // Prefix-style directives carry a variable name tail that has to be skipped
                // before the delimiter can be looked for.
                Matcher suffix = DIRECTIVE_SUFFIX_PATTERN.matcher(remainder);
                if (!suffix.matches()) {
                    continue;
                }
                remainder = suffix.group(1);
            }
            // Prefer the regular "name: value" form, fall back to "name value".
            Matcher delimiter = COLON_DIRECTIVE_DELIMITER.matcher(remainder);
            if (!delimiter.matches()) {
                delimiter = BLANK_DIRECTIVE_DELIMITER.matcher(remainder);
            }
            if (delimiter.matches()) {
                return new RobotToken(directive, delimiter.group(1).trim());
            }
        }
        // Nothing known matched. A colon anywhere in the line suggests an unrecognised
        // "name: value" directive; without one the line is not directive-shaped at all.
        // (Anchoring the colon pattern to the whole trimmed line would only ever accept lines
        // that start with ':'.)
        RobotDirective fallback = str.indexOf(':') >= 0 ? RobotDirective.UNKNOWN : RobotDirective.MISSING;
        return new RobotToken(fallback, str);
    }

    /**
     * Builds the rules to apply when robots.txt could not be fetched successfully.
     *
     * <p>A 4xx status yields allow-all rules. Every other non-2xx status yields allow-none
     * rules flagged to defer visits.
     *
     * @param i HTTP status code of the failed fetch
     * @return the rules to use for the host
     * @throws IllegalStateException if {@code i} is a 2xx status code
     */
    @Override // crawlercommons.robots.BaseRobotsParser
    public BaseRobotRules failedFetch(int i) {
        if (i >= 200 && i < 300) {
            throw new IllegalStateException("Can't use status code constructor with 2xx response");
        }
        boolean clientError = i >= 400 && i < 500;
        if (clientError) {
            return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL);
        }
        SimpleRobotRules deferred = new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_NONE);
        deferred.setDeferVisits(true);
        return deferred;
    }

    /**
     * Parses the raw bytes of a robots.txt response into the rules that apply to the given
     * robot.
     *
     * <p>A leading UTF-8 or UTF-16 byte order mark selects the decoding charset; without one the
     * content is decoded as US-ASCII. Content that looks like HTML but contains no
     * "user-agent:" field is treated as "no robots.txt" (allow all); HTML that does contain one
     * is parsed after stripping tags from every line.
     *
     * @param str URL the content was fetched from; used in log messages and as the base for
     *        resolving sitemap URLs
     * @param bArr raw response body; {@code null} or empty yields allow-all rules
     * @param str2 content type reported for the response, may be {@code null}
     * @param str3 name(s) of the robot to extract rules for, comma-separated; must not be
     *        {@code null}
     * @return the parsed and sorted rules, or allow-none rules when the crawl delay exceeds the
     *         accepted maximum
     */
    @Override // crawlercommons.robots.BaseRobotsParser
    public BaseRobotRules parseContent(String str, byte[] bArr, String str2, String str3) {
        this._numWarnings = 0;
        if (bArr == null || bArr.length == 0) {
            return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL);
        }

        // Sniff a byte order mark to choose the charset and skip over the mark itself.
        // NOTE(review): BOM-less content is decoded as US-ASCII, so non-ASCII bytes in paths are
        // replaced; kept as is to preserve existing behaviour.
        int offset = 0;
        String encoding = "us-ascii";
        if (bArr.length >= 3 && bArr[0] == (byte) 0xEF && bArr[1] == (byte) 0xBB && bArr[2] == (byte) 0xBF) {
            offset = 3;
            encoding = "UTF-8";
        } else if (bArr.length >= 2 && bArr[0] == (byte) 0xFF && bArr[1] == (byte) 0xFE) {
            offset = 2;
            encoding = "UTF-16LE";
        } else if (bArr.length >= 2 && bArr[0] == (byte) 0xFE && bArr[1] == (byte) 0xFF) {
            offset = 2;
            encoding = "UTF-16BE";
        }

        String content;
        try {
            content = new String(bArr, offset, bArr.length - offset, encoding);
        } catch (UnsupportedEncodingException e) {
            // All three charsets are mandatory on every JVM; keep the cause in case that changes.
            throw new RuntimeException("Impossible unsupported encoding exception for " + encoding, e);
        }

        // Locale.ROOT: content types and robot names are protocol tokens, not natural language.
        boolean htmlContentType = str2 != null && str2.toLowerCase(Locale.ROOT).startsWith("text/html");
        boolean stripTags = false;
        if (htmlContentType || SIMPLE_HTML_PATTERN.matcher(content).find()) {
            if (!USER_AGENT_PATTERN.matcher(content).find()) {
                LOGGER.trace("Found non-robots.txt HTML file: " + str);
                return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL);
            }
            if (htmlContentType) {
                LOGGER.debug("HTML content type returned for robots.txt file: " + str);
            } else {
                LOGGER.debug("Found HTML in robots.txt file: " + str);
            }
            stripTags = true;
        }

        StringTokenizer lines = new StringTokenizer(content, "\n\r\u0085\u2028\u2029");
        ParseState parseState = new ParseState(str, str3.toLowerCase(Locale.ROOT));
        while (lines.hasMoreTokens()) {
            String line = lines.nextToken();
            if (stripTags) {
                line = line.replaceAll("<[^>]+>", "");
            }
            // Everything from '#' to the end of the line is a comment.
            int commentStart = line.indexOf("#");
            if (commentStart >= 0) {
                line = line.substring(0, commentStart);
            }
            String trim = line.trim();
            if (trim.length() == 0) {
                continue;
            }

            RobotToken robotToken = tokenize(trim);
            switch (robotToken.getDirective()) {
                case USER_AGENT:
                    handleUserAgent(parseState, robotToken);
                    break;
                case DISALLOW:
                    handleDisallow(parseState, robotToken);
                    break;
                case ALLOW:
                    handleAllow(parseState, robotToken);
                    break;
                case CRAWL_DELAY:
                    handleCrawlDelay(parseState, robotToken);
                    break;
                case SITEMAP:
                    handleSitemap(parseState, robotToken);
                    break;
                case HTTP:
                    handleHttp(parseState, robotToken);
                    break;
                case UNKNOWN:
                    reportWarning("Unknown directive in robots.txt file: " + trim, str);
                    parseState.setFinishedAgentFields(true);
                    break;
                case MISSING:
                    reportWarning(String.format(Locale.getDefault(), "Unknown line in robots.txt file (size %d): %s", Integer.valueOf(bArr.length), trim), str);
                    parseState.setFinishedAgentFields(true);
                    break;
                default:
                    // Remaining directives are recognised but deliberately ignored.
                    break;
            }
        }

        SimpleRobotRules robotRules = parseState.getRobotRules();
        // 300000 is the value of MAX_CRAWL_DELAY.
        if (robotRules.getCrawlDelay() > 300000) {
            LOGGER.debug("Crawl delay exceeds max value - so disallowing all URLs: {}", str);
            return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_NONE);
        }
        robotRules.sortRules();
        return robotRules;
    }

    /**
     * Counts a parse problem and logs it. The first problem also logs a header naming the
     * robots.txt URL; individual messages are only logged while the count is below 5.
     *
     * @param str description of the problem
     * @param str2 URL of the robots.txt file being parsed
     */
    private void reportWarning(String str, String str2) {
        final int count = ++this._numWarnings;
        if (count == 1) {
            LOGGER.warn("Problem processing robots.txt for {}", str2);
        }
        if (count >= 5) {
            return;
        }
        LOGGER.warn("\t {}", str);
    }

    /**
     * Processes a user-agent line and updates the matching state.
     *
     * <p>Once a real robot name has matched, further user-agent lines are ignored; if rule
     * lines were already seen for that match, all following rules are skipped. Otherwise every
     * agent name on the line is compared with every word of every comma-separated target name:
     * "*" enables rule collection as a wildcard match (first time only), and an agent name that
     * is a prefix of a target word counts as a real match, discarding any rules collected so far.
     *
     * <p>Names are lower-cased with {@link Locale#ROOT} so that matching does not depend on the
     * JVM default locale (e.g. "BINGBOT" must match "bingbot" under a Turkish locale too).
     *
     * @param parseState state of the current parse
     * @param robotToken the user-agent token; its data holds one or more agent names separated
     *        by blanks, tabs or commas
     */
    private void handleUserAgent(ParseState parseState, RobotToken robotToken) {
        if (parseState.isMatchedRealName()) {
            if (parseState.isFinishedAgentFields()) {
                parseState.setSkipAgents(true);
            }
            return;
        }
        if (parseState.isFinishedAgentFields()) {
            // A user-agent line after rule lines starts a new group.
            parseState.setFinishedAgentFields(false);
            parseState.setAddingRules(false);
        }

        String[] agentNames = robotToken.getData().split("[ \t,]");
        for (String target : parseState.getTargetName().toLowerCase(Locale.ROOT).split(",")) {
            // Each word of the target name may match, e.g. "botname" in "mozilla botname 1.0".
            String[] targetWords = target.trim().split(StringUtils.SPACE);
            for (String rawAgent : agentNames) {
                String agent = rawAgent.trim().toLowerCase(Locale.ROOT);
                if (agent.isEmpty()) {
                    continue;
                }
                if (agent.equals("*") && !parseState.isMatchedWildcard()) {
                    parseState.setMatchedWildcard(true);
                    parseState.setAddingRules(true);
                    continue;
                }
                for (String word : targetWords) {
                    if (word.startsWith(agent)) {
                        parseState.setMatchedRealName(true);
                        parseState.setAddingRules(true);
                        // Drop rules that may have been collected for an earlier wildcard group.
                        parseState.clearRules();
                        break;
                    }
                }
            }
        }
    }

    /**
     * Processes a disallow line for the currently selected agent group.
     *
     * <p>The path is URL-decoded as UTF-8. An empty path clears all rules collected so far; any
     * other path is added as a disallow rule. If the path cannot be decoded (for instance a
     * stray '%'), a warning is reported and the raw path is used instead, the same fallback
     * handleAllow applies, so that a malformed escape does not silently drop a disallow rule.
     *
     * @param parseState state of the current parse
     * @param robotToken the disallow token; its data is the path
     */
    private void handleDisallow(ParseState parseState, RobotToken robotToken) {
        if (parseState.isSkipAgents()) {
            return;
        }
        parseState.setFinishedAgentFields(true);
        if (!parseState.isAddingRules()) {
            return;
        }
        String path = robotToken.getData();
        try {
            path = URLDecoder.decode(path, "UTF-8");
        } catch (Exception e) {
            // path still holds the raw value here because the assignment did not happen.
            reportWarning("Error parsing robots rules - can't decode path: " + path, parseState.getUrl());
        }
        if (path.length() == 0) {
            parseState.clearRules();
        } else {
            parseState.addRule(path, false);
        }
    }

    /**
     * Processes an allow line for the currently selected agent group.
     *
     * <p>The path is URL-decoded as UTF-8; when decoding fails a warning is reported and the
     * raw path is used. An empty path clears all rules collected so far, any other path is
     * added as an allow rule.
     *
     * @param parseState state of the current parse
     * @param robotToken the allow token; its data is the path
     */
    private void handleAllow(ParseState parseState, RobotToken robotToken) {
        if (parseState.isSkipAgents()) {
            return;
        }
        parseState.setFinishedAgentFields(true);
        if (!parseState.isAddingRules()) {
            return;
        }

        String path = robotToken.getData();
        try {
            path = URLDecoder.decode(path, "UTF-8");
        } catch (Exception e) {
            // The assignment above did not happen, so path is still the raw value.
            reportWarning("Error parsing robots rules - can't decode path: " + path, parseState.getUrl());
        }

        if (path.isEmpty()) {
            parseState.clearRules();
            return;
        }
        parseState.addRule(path, true);
    }

    /**
     * Processes a crawl-delay line for the currently selected agent group.
     *
     * <p>The value is multiplied by 1000 before it is stored. Values containing a '.' are
     * parsed as a double and rounded; all others are parsed as an int. The multiplication is
     * done in {@code long} arithmetic so that large values cannot overflow into a small or
     * negative delay. Unparsable values are reported as a warning and otherwise ignored.
     *
     * @param parseState state of the current parse
     * @param robotToken the crawl-delay token; its data is the delay value
     */
    private void handleCrawlDelay(ParseState parseState, RobotToken robotToken) {
        if (parseState.isSkipAgents()) {
            return;
        }
        parseState.setFinishedAgentFields(true);
        if (!parseState.isAddingRules()) {
            return;
        }
        String data = robotToken.getData();
        if (data.length() == 0) {
            return;
        }
        try {
            long delay;
            if (data.indexOf('.') != -1) {
                delay = Math.round(Double.parseDouble(data) * 1000.0d);
            } else {
                // 1000L, not 1000: an int product overflows for values of 2147484 and above.
                delay = Integer.parseInt(data) * 1000L;
            }
            parseState.setCrawlDelay(delay);
        } catch (Exception e) {
            // Broad catch kept on purpose: a bad value must never abort parsing of the file.
            reportWarning("Error parsing robots rules - can't decode crawl delay: " + data, parseState.getUrl());
        }
    }

    /**
     * Processes a sitemap line. The value is resolved against the robots.txt URL and recorded
     * when the resulting URL has a non-empty host; values that do not form a valid URL are
     * reported as a warning.
     *
     * @param parseState state of the current parse
     * @param robotToken the sitemap token; its data is the (possibly relative) sitemap URL
     */
    private void handleSitemap(ParseState parseState, RobotToken robotToken) {
        final String sitemap = robotToken.getData();
        try {
            URL base = new URL(parseState.getUrl());
            URL resolved = new URL(base, sitemap);
            String host = resolved.getHost();
            if (host == null || host.length() == 0) {
                return;
            }
            parseState.addSitemap(resolved.toExternalForm());
        } catch (Exception e) {
            reportWarning("Invalid URL with sitemap directive: " + sitemap, parseState.getUrl());
        }
    }

    /**
     * Processes a line that starts with a bare "http" URL instead of a directive. When the rest
     * of the URL contains "sitemap" the line is treated as a sitemap directive whose value is
     * the reassembled URL; otherwise a warning is reported.
     *
     * @param parseState state of the current parse
     * @param robotToken the token; its data is the part of the URL that followed "http:"
     */
    private void handleHttp(ParseState parseState, RobotToken robotToken) {
        final String remainder = robotToken.getData();
        if (!remainder.contains("sitemap")) {
            reportWarning("Found raw non-sitemap URL: http:" + remainder, parseState.getUrl());
            return;
        }
        RobotToken sitemapToken = new RobotToken(RobotDirective.SITEMAP, "http:" + remainder);
        handleSitemap(parseState, sitemapToken);
    }

    /**
     * @return the number of warnings counted since the warning counter was last reset
     */
    public int getNumWarnings() {
        return _numWarnings;
    }

    // Builds the directive lookup table and compiles the shared patterns.
    static {
        // Every non-special directive is recognised by its enum name, lower-cased and with '_'
        // written as '-'. Locale.ROOT keeps the keys plain ASCII regardless of the JVM default
        // locale (a Turkish locale would otherwise turn "DISALLOW" into "d\u0131sallow").
        for (RobotDirective robotDirective : RobotDirective.values()) {
            if (!robotDirective.isSpecial()) {
                DIRECTIVE_PREFIX.put(robotDirective.name().toLowerCase(Locale.ROOT).replace('_', '-'), robotDirective);
            }
        }
        // Additional spellings accepted for user-agent, disallow and crawl-delay.
        DIRECTIVE_PREFIX.put("useragent", RobotDirective.USER_AGENT);
        DIRECTIVE_PREFIX.put("useg-agent", RobotDirective.USER_AGENT);
        DIRECTIVE_PREFIX.put("ser-agent", RobotDirective.USER_AGENT);
        DIRECTIVE_PREFIX.put("desallow", RobotDirective.DISALLOW);
        DIRECTIVE_PREFIX.put("dissalow", RobotDirective.DISALLOW);
        DIRECTIVE_PREFIX.put("dssalow", RobotDirective.DISALLOW);
        DIRECTIVE_PREFIX.put("dsallow", RobotDirective.DISALLOW);
        DIRECTIVE_PREFIX.put("crawl delay", RobotDirective.CRAWL_DELAY);
        COLON_DIRECTIVE_DELIMITER = Pattern.compile("[ \t]*:[ \t]*(.*)");
        BLANK_DIRECTIVE_DELIMITER = Pattern.compile("[ \t]+(.*)");
        DIRECTIVE_SUFFIX_PATTERN = Pattern.compile("[^: \t]+(.*)");
        SIMPLE_HTML_PATTERN = Pattern.compile("(?is)<(html|head|body)\\s*>");
        USER_AGENT_PATTERN = Pattern.compile("(?i)user-agent:");
    }
}
