package edu.uci.ics.crawler4j.crawler;

import edu.uci.ics.crawler4j.fetcher.CustomFetchStatus;
import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.frontier.DocIDServer;
import edu.uci.ics.crawler4j.frontier.Frontier;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.ParseData;
import edu.uci.ics.crawler4j.parser.Parser;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;
import java.util.ArrayList;
import org.apache.log4j.Logger;

/* loaded from: input_file:edu/uci/ics/crawler4j/crawler/WebCrawler.class */
/**
 * A crawler worker: repeatedly pulls batches of URLs from the shared {@link Frontier},
 * fetches and parses each page, schedules newly discovered links and hands every
 * successfully parsed page to {@link #visit(Page)}.
 *
 * <p>Subclasses customise behaviour by overriding the hook methods
 * ({@link #shouldVisit(WebURL)}, {@link #visit(Page)}, {@link #onStart()}, ...), all of
 * which have no-op or permissive defaults here.
 */
public class WebCrawler implements Runnable {
    protected static final Logger logger = Logger.getLogger(WebCrawler.class.getName());

    /** Maximum number of URLs requested from the frontier per batch. */
    private static final int URL_BATCH_SIZE = 50;

    /** How long to sleep when the frontier returned no URLs but is not finished yet. */
    private static final long IDLE_SLEEP_MILLIS = 3000L;

    private static final int STATUS_OK = 200;
    private static final int STATUS_MOVED_PERMANENTLY = 301;
    private static final int STATUS_MOVED_TEMPORARILY = 302;

    /** Custom (non-HTTP) status reported for pages bigger than the max allowed size. */
    private static final int STATUS_PAGE_TOO_BIG = 1001;

    protected int myId;
    protected CrawlController myController;
    private Thread myThread;
    private Parser parser;
    private PageFetcher pageFetcher;
    private RobotstxtServer robotstxtServer;
    private DocIDServer docIdServer;
    private Frontier frontier;

    // NOTE(review): written by run() and exposed through isNotWaitingForNewURLs(); if that
    // getter is called from a different thread this field probably needs to be volatile — confirm.
    private boolean isWaitingForNewURLs;

    /**
     * Wires this crawler to the shared services owned by the controller and creates a
     * parser from the controller's configuration.
     *
     * @param i               identifier assigned to this crawler instance
     * @param crawlController controller that supplies the fetcher, robots.txt server,
     *                        doc-id server, frontier and configuration
     */
    public void init(int i, CrawlController crawlController) {
        this.myId = i;
        this.pageFetcher = crawlController.getPageFetcher();
        this.robotstxtServer = crawlController.getRobotstxtServer();
        this.docIdServer = crawlController.getDocIdServer();
        this.frontier = crawlController.getFrontier();
        this.parser = new Parser(crawlController.getConfig());
        this.myController = crawlController;
        this.isWaitingForNewURLs = false;
    }

    /** @return the identifier passed to {@link #init(int, CrawlController)} */
    public int getMyId() {
        return this.myId;
    }

    /** @return the controller passed to {@link #init(int, CrawlController)} */
    public CrawlController getMyController() {
        return this.myController;
    }

    /** Hook invoked once at the very beginning of {@link #run()}. Does nothing by default. */
    public void onStart() {
    }

    /**
     * Hook with an empty default body. It is not invoked from within this class;
     * presumably the controller calls it before the crawler exits — confirm against caller.
     */
    public void onBeforeExit() {
    }

    /**
     * Hook invoked right after the header of a page was fetched, before the status code
     * is acted upon. Does nothing by default.
     *
     * @param webURL the URL that was fetched
     * @param i      the status code reported by the fetch result
     * @param str    the description of that status code
     */
    protected void handlePageStatusCode(WebURL webURL, int i, String str) {
    }

    /**
     * Hook invoked when the content of a page could not be fetched. Does nothing by default.
     *
     * @param webURL the URL whose content fetch failed
     */
    protected void onContentFetchError(WebURL webURL) {
    }

    /**
     * Hook invoked when the parser reported failure for a page. Does nothing by default.
     *
     * @param webURL the URL whose content could not be parsed
     */
    protected void onParseError(WebURL webURL) {
    }

    /**
     * @return crawler-local data; {@code null} unless overridden by a subclass
     */
    public Object getMyLocalData() {
        return null;
    }

    /**
     * Main crawl loop. Fetches batches of up to {@value #URL_BATCH_SIZE} URLs from the
     * frontier and processes them one by one. The loop ends when
     * <ul>
     *   <li>the controller reports that it is shutting down,</li>
     *   <li>the frontier returns no URLs and reports that it is finished, or</li>
     *   <li>the thread is interrupted while idling (the interrupt flag is restored).</li>
     * </ul>
     */
    @Override // java.lang.Runnable
    public void run() {
        onStart();
        while (true) {
            ArrayList<WebURL> assignedUrls = new ArrayList<WebURL>(URL_BATCH_SIZE);
            this.isWaitingForNewURLs = true;
            this.frontier.getNextURLs(URL_BATCH_SIZE, assignedUrls);
            this.isWaitingForNewURLs = false;

            if (assignedUrls.isEmpty()) {
                if (this.frontier.isFinished()) {
                    return;
                }
                try {
                    Thread.sleep(IDLE_SLEEP_MILLIS);
                } catch (InterruptedException e) {
                    // Restore the flag so the owner of this thread can observe the interrupt,
                    // and stop instead of spinning on an already-interrupted sleep.
                    logger.warn("Crawler " + this.myId + " interrupted while waiting for new URLs; exiting.", e);
                    Thread.currentThread().interrupt();
                    return;
                }
                continue;
            }

            for (WebURL url : assignedUrls) {
                if (url != null) {
                    processPage(url);
                    this.frontier.setProcessed(url);
                }
                if (this.myController.isShuttingDown()) {
                    logger.info("Exiting because of controller shutdown.");
                    return;
                }
            }
        }
    }

    /**
     * Decides whether a discovered URL should be scheduled for crawling. Consulted for
     * redirect targets and for outgoing links of parsed HTML pages.
     *
     * @param webURL the candidate URL
     * @return {@code true} by default; subclasses override to filter
     */
    public boolean shouldVisit(WebURL webURL) {
        return true;
    }

    /**
     * Hook invoked for every page that was fetched and parsed successfully. Exceptions
     * thrown from here are caught and logged by the crawler. Does nothing by default.
     *
     * @param page the fetched and parsed page
     */
    public void visit(Page page) {
    }

    /**
     * Fetches, parses and dispatches a single URL.
     *
     * <p>Any exception is logged (with the URL) rather than propagated, and the fetch
     * result is always released in the {@code finally} block so the underlying
     * content is discarded on every exit path, including failures.
     *
     * @param webURL the URL to process; {@code null} is ignored
     */
    private void processPage(WebURL webURL) {
        if (webURL == null) {
            return;
        }
        PageFetchResult fetchResult = null;
        try {
            fetchResult = this.pageFetcher.fetchHeader(webURL);
            int statusCode = fetchResult.getStatusCode();
            handlePageStatusCode(webURL, statusCode, CustomFetchStatus.getStatusDescription(statusCode));
            if (statusCode != STATUS_OK) {
                handleNonOkStatus(webURL, fetchResult, statusCode);
                return;
            }

            // The fetcher may have ended up on a different URL than the one requested.
            String fetchedUrl = fetchResult.getFetchedUrl();
            if (!webURL.getURL().equals(fetchedUrl)) {
                if (this.docIdServer.isSeenBefore(fetchedUrl)) {
                    return;
                }
                webURL.setURL(fetchedUrl);
                webURL.setDocid(this.docIdServer.getNewDocID(fetchedUrl));
            }

            Page page = new Page(webURL);
            int docid = webURL.getDocid();
            if (!fetchResult.fetchContent(page)) {
                onContentFetchError(webURL);
                return;
            }
            if (!this.parser.parse(page, webURL.getURL())) {
                onParseError(webURL);
                return;
            }

            ParseData parseData = page.getParseData();
            if (parseData instanceof HtmlParseData) {
                scheduleOutgoingLinks((HtmlParseData) parseData, webURL, docid);
            }

            try {
                visit(page);
            } catch (Exception e) {
                // visit() is user code: a failure there must not abort the crawl loop.
                logger.error("Exception while running the visit method. Message: '" + e.getMessage() + "' at " + e.getStackTrace()[0], e);
            }
        } catch (Exception e) {
            logger.error(e.getMessage() + ", while processing: " + webURL.getURL(), e);
        } finally {
            if (fetchResult != null) {
                fetchResult.discardContentIfNotConsumed();
            }
        }
    }

    /** Handles a non-200 fetch: follows redirects if enabled, logs oversized pages. */
    private void handleNonOkStatus(WebURL webURL, PageFetchResult fetchResult, int statusCode) {
        if (statusCode == STATUS_MOVED_PERMANENTLY || statusCode == STATUS_MOVED_TEMPORARILY) {
            if (this.myController.getConfig().isFollowRedirects()) {
                scheduleRedirectTarget(webURL, fetchResult.getMovedToUrl());
            }
        } else if (statusCode == STATUS_PAGE_TOO_BIG) {
            logger.info("Skipping a page which was bigger than max allowed size: " + webURL.getURL());
        }
    }

    /** Schedules the target of a redirect unless it is missing, already known or not allowed. */
    private void scheduleRedirectTarget(WebURL source, String movedToUrl) {
        if (movedToUrl == null) {
            return;
        }
        if (this.docIdServer.getDocId(movedToUrl) > 0) {
            // A positive doc id means the target has already been registered.
            return;
        }
        // The redirect target inherits parent, depth and anchor from the redirecting URL.
        WebURL target = new WebURL();
        target.setURL(movedToUrl);
        target.setParentDocid(source.getParentDocid());
        target.setParentUrl(source.getParentUrl());
        target.setDepth(source.getDepth());
        target.setDocid(-1);
        target.setAnchor(source.getAnchor());
        if (shouldVisit(target) && this.robotstxtServer.allows(target)) {
            target.setDocid(this.docIdServer.getNewDocID(movedToUrl));
            this.frontier.schedule(target);
        }
    }

    /** Annotates the outgoing links of a parsed HTML page and schedules the new, allowed ones. */
    private void scheduleOutgoingLinks(HtmlParseData htmlParseData, WebURL parent, int parentDocid) {
        ArrayList<WebURL> toSchedule = new ArrayList<WebURL>();
        int maxDepthOfCrawling = this.myController.getConfig().getMaxDepthOfCrawling();
        for (WebURL link : htmlParseData.getOutgoingUrls()) {
            link.setParentDocid(parentDocid);
            link.setParentUrl(parent.getURL());
            int existingDocId = this.docIdServer.getDocId(link.getURL());
            if (existingDocId > 0) {
                // Already registered: record its id, mark depth as -1, do not re-schedule.
                link.setDepth((short) -1);
                link.setDocid(existingDocId);
                continue;
            }
            link.setDocid(-1);
            link.setDepth((short) (parent.getDepth() + 1));
            // A max depth of -1 disables the depth limit.
            boolean withinDepth = maxDepthOfCrawling == -1 || parent.getDepth() < maxDepthOfCrawling;
            if (withinDepth && shouldVisit(link) && this.robotstxtServer.allows(link)) {
                link.setDocid(this.docIdServer.getNewDocID(link.getURL()));
                toSchedule.add(link);
            }
        }
        this.frontier.scheduleAll(toSchedule);
    }

    /** @return the thread previously stored via {@link #setThread(Thread)}, or {@code null} */
    public Thread getThread() {
        return this.myThread;
    }

    /**
     * Stores a thread reference for later retrieval via {@link #getThread()}.
     *
     * @param thread the thread to associate with this crawler
     */
    public void setThread(Thread thread) {
        this.myThread = thread;
    }

    /**
     * @return {@code false} only while {@link #run()} is inside the frontier call that
     *         requests the next batch of URLs; {@code true} otherwise
     */
    public boolean isNotWaitingForNewURLs() {
        return !this.isWaitingForNewURLs;
    }
}
