/*
 * Decompiled with CFR 0.152.
 */
package com.ibm.es.nuvo.crawler.web.http;

import com.ibm.es.nuvo.crawler.web.bucket.Bucket;
import com.ibm.es.nuvo.crawler.web.bucket.FederatedBucketManager;
import com.ibm.es.nuvo.crawler.web.configuration.space.CrawlSpaceConfig;
import com.ibm.es.nuvo.crawler.web.db.CrawlRec;
import com.ibm.es.nuvo.crawler.web.db.tables.URLTable;
import com.ibm.es.nuvo.crawler.web.http.URLSorter;
import com.ibm.es.nuvo.crawler.web.net.CURL;
import com.ibm.es.nuvo.crawler.web.net.IPProto;
import com.ibm.es.nuvo.crawler.web.rule.WebSpace;
import com.ibm.es.nuvo.crawler.web.thread.WCRunnableImpl;
import com.ibm.es.nuvo.logging.ExtendedLogger;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;

/**
 * Runnable that fills one empty {@link Bucket} per run with crawl records.
 *
 * <p>Each run asks the {@link FederatedBucketManager} for an empty bucket, loads the
 * bucket's crawl records from the {@link URLTable}, hands records that are invalid or
 * fail verification straight to the {@link URLSorter}, queues the remaining records in
 * the bucket, and finally either starts or skips crawling for that bucket.
 */
public class URLScanner
extends WCRunnableImpl {
    private static final String copyright = "IBM Confidential OCO Source Materials 5724-R21 \u00a9 Copyright IBM Corp.  2006, 2007.   All Rights Reserved. The source code for this program is not published or otherwise divested of its trade secrets, irrespective of what has been deposited with the U.S. Copyright Office.";
    private static final ExtendedLogger tracer = ExtendedLogger.getLogger("NuvoTracer." + URLScanner.class.getName());
    public static final String s_agent = "URLScan";

    /** Initial capacity of the two candidate lists. */
    private static final int INITIAL_LIST_CAPACITY = 600;
    /** Pause, in milliseconds, when the manager reports that no more buckets need filling. */
    private static final int NO_WORK_SLEEP_MILLIS = 2000;
    /** Pause, in milliseconds, when the manager has no empty bucket to hand out. */
    private static final int NO_BUCKET_SLEEP_MILLIS = 500;

    /** Candidates whose crawl date is 0; emptied at the end of every run. */
    private final List<CrawlRec> newList = new ArrayList<CrawlRec>(INITIAL_LIST_CAPACITY);
    /** Candidates whose crawl date is non-zero; emptied at the end of every run. */
    private final List<CrawlRec> oldList = new ArrayList<CrawlRec>(INITIAL_LIST_CAPACITY);

    /** Creates a scanner registered under the agent name {@link #s_agent}. */
    public URLScanner() {
        super(s_agent);
    }

    /**
     * Client initialisation hook; delegates to {@code _assignThread("Ant", s_agent)}.
     *
     * @throws Exception if the thread assignment fails
     */
    public void _clientInit() throws Exception {
        this._assignThread("Ant", s_agent);
    }

    /**
     * Performs one scan cycle: obtain an empty bucket, classify its crawl records, fill
     * the bucket and start (or skip) crawling.
     *
     * <p>If no bucket needs filling, or none is available, the method sleeps briefly and
     * returns. A {@link SQLException} raised during the cycle is reported through the
     * tracer and not rethrown. The candidate lists are always cleared on exit.
     *
     * <p>NOTE: this source was recovered by a decompiler, which reported
     * "Removed try catching itself - possible behaviour change" for this method.
     *
     * @throws Exception any non-SQL failure raised by the collaborators, including
     *         {@link InterruptedException} from the idle sleeps
     */
    protected void _clientRunCore() throws Exception {
        FederatedBucketManager fedManager = FederatedBucketManager.instance();
        try {
            this.setSubRunLevel("Requesting");
            if (!fedManager.needsMoreBucketsFilled()) {
                if (tracer.isLoggable(Level.FINE)) {
                    tracer.fine("No more buckets to be filled.");
                }
                this.sleep(NO_WORK_SLEEP_MILLIS);
                return;
            }
            Bucket bucket = fedManager.getEmptyBucket();
            if (bucket == null) {
                if (tracer.isLoggable(Level.FINE)) {
                    tracer.fine("No buckets to be filled.");
                }
                this.sleep(NO_BUCKET_SLEEP_MILLIS);
                return;
            }

            this.setSubRunLevel("Scanning");
            URLTable urlTable = URLTable.getInstance(bucket.getConfig().getId());
            URLSorter sorter = URLSorter.instance();
            List<CrawlRec> crawlRecs = urlTable.getCrawlRecs(bucket);
            if (tracer.isLoggable(Level.FINEST)) {
                tracer.finest(bucket.getHostname() + " size " + crawlRecs.size());
            }

            // The same cap (half the bucket's maximum) applies to both candidate lists.
            int limit = bucket.getMaxURL() / 2;
            // URLs are only verified when neither a robots nor a DNS update is pending.
            boolean canVerify = !bucket.needsRobotsUpdate() && !bucket.needsDNSUpdate();
            CrawlSpaceConfig config = bucket.getConfig();

            this.collectCandidates(bucket, crawlRecs, sorter, limit, canVerify);

            bucket.setVolatile(this.newList.size() + this.oldList.size() > bucket.getMaxURL() / 2);
            boolean bucketFull = this.fillBucket(bucket, limit);
            if (tracer.isLoggable(Level.FINER)) {
                tracer.finer("URLScanner  " + bucket + " has " + bucket.size() + " URLs " + bucketFull);
            }

            if (bucket.size() > 0 && config.isValid()) {
                int timeout = bucket.getTimeout();
                if (bucket.isVolatile()) {
                    // Volatile buckets get twice the configured timeout.
                    timeout *= 2;
                }
                bucket.startCrawling(timeout);
            } else {
                bucket.skipCrawling(crawlRecs.size() == 0);
            }
        }
        catch (SQLException sqle) {
            // NOTE(review): on this path the bucket obtained above is neither started nor
            // skipped; whether the bucket manager reclaims it cannot be determined from
            // this file -- confirm against Bucket/FederatedBucketManager.
            tracer.throwing(this.getClass().getName(), "_clientRunCore()", sqle);
        }
        finally {
            this.newList.clear();
            this.oldList.clear();
        }
    }

    /**
     * Walks the crawl records and sorts them into {@link #newList} / {@link #oldList}.
     *
     * <p>Records with an invalid URL, and (when verification is possible) records whose
     * URL fails {@code bucket.verifyURL}, are marked with the resulting status and passed
     * directly to the sorter instead of being queued. Scanning stops once both lists have
     * reached {@code limit}, or after the first queued record when verification is not
     * possible.
     *
     * @param bucket    bucket being filled
     * @param crawlRecs records loaded for the bucket
     * @param sorter    receiver of records that are rejected without being crawled
     * @param limit     per-list size at which scanning stops
     * @param canVerify whether URLs may be verified against the bucket in this run
     * @throws Exception whatever the collaborators throw; declared broadly because their
     *         checked exceptions are not visible from this file
     */
    private void collectCandidates(Bucket bucket, List<CrawlRec> crawlRecs, URLSorter sorter,
            int limit, boolean canVerify) throws Exception {
        for (CrawlRec cr : crawlRecs) {
            CURL url = cr.getURL();
            if (!url.isValid()) {
                if (tracer.isLoggable(Level.FINE)) {
                    tracer.log(Level.FINE, "An invalid URL \"" + url.badString() + "\" was found in the database for the host name " + bucket.getHostname() + ".");
                }
                // 786 is the status recorded for a URL that is invalid in the database.
                cr.setNewHTTPStatus(786);
                cr.metadataProcessing();
                cr.setWriteRDS(false);
                sorter.processCrawlResults(cr);
                continue;
            }
            if (canVerify) {
                short urlHTTPStatus = bucket.verifyURL(url);
                cr.setVerified(true);
                if (urlHTTPStatus != 0) {
                    // A non-zero status means the URL was rejected; report it without crawling.
                    cr.setNewHTTPStatus(urlHTTPStatus);
                    cr.metadataProcessing();
                    // Only write when the status actually changed.
                    cr.setWriteRDS(cr.getOldHTTPStatus() != urlHTTPStatus);
                    cr.setNoFollow(true);
                    cr.setNoIndex(false);
                    cr.setBucket(bucket);
                    sorter.processCrawlResults(cr);
                    continue;
                }
            }
            if (cr.getCrawlDate() == 0) {
                this.newList.add(cr);
            } else {
                this.oldList.add(cr);
            }
            boolean bothListsAtLimit = this.newList.size() >= limit && this.oldList.size() >= limit;
            if (!canVerify || bothListsAtLimit) {
                break;
            }
        }
    }

    /**
     * Moves the collected candidates into the bucket until it refuses a record.
     *
     * <p>Order: up to {@code newLimit} records from {@link #newList}, then all of
     * {@link #oldList}, then whatever remains of {@link #newList}.
     *
     * @param bucket   bucket to fill
     * @param newLimit maximum number of new records added before the old records
     * @return {@code true} if {@code bucket.addCrawlRec} returned {@code false} for a
     *         record, {@code false} if every candidate was accepted
     * @throws Exception whatever {@code bucket.addCrawlRec} throws
     */
    private boolean fillBucket(Bucket bucket, int newLimit) throws Exception {
        boolean bucketFull = false;
        Iterator<CrawlRec> newItr = this.newList.iterator();
        Iterator<CrawlRec> oldItr = this.oldList.iterator();
        for (int added = 0; newItr.hasNext() && added < newLimit && !bucketFull; ++added) {
            bucketFull = !bucket.addCrawlRec(newItr.next());
        }
        while (oldItr.hasNext() && !bucketFull) {
            bucketFull = !bucket.addCrawlRec(oldItr.next());
        }
        // Use any space left for the new records that exceeded the first-pass limit.
        while (newItr.hasNext() && !bucketFull) {
            bucketFull = !bucket.addCrawlRec(newItr.next());
        }
        return bucketFull;
    }

    /**
     * Sets the sub run level to "Sleeping" and blocks the current thread.
     *
     * @param i time to sleep, in milliseconds
     * @throws InterruptedException if the thread is interrupted while sleeping
     */
    private void sleep(int i) throws InterruptedException {
        this.setSubRunLevel("Sleeping");
        Thread.sleep(i);
    }

    /**
     * Checks whether a URL may be crawled and returns a status code describing the result.
     *
     * <p>The checks run in this order, and the first failing one determines the result:
     * <ol>
     *   <li>770 - protocol number is neither 0 nor 1, or the port is not the protocol's
     *       standard port and is at most 1000;</li>
     *   <li>740 - {@code bucket.robotsExcludes(url)} is true;</li>
     *   <li>780 - {@code webspace.isExcluded(url)} is true;</li>
     *   <li>760 - {@code webspace.isAllowed(url)} is false.</li>
     * </ol>
     *
     * @param url      URL to check
     * @param bucket   bucket consulted for robots exclusion
     * @param webspace crawl-space rules consulted for exclusion / allowance
     * @return 0 if the URL passed every check, otherwise one of the codes listed above
     */
    public static short verifyURL(CURL url, Bucket bucket, WebSpace webspace) {
        int proto = url.getProtocolNumber();
        int port = url.getPort();
        // Only protocol numbers 0 and 1 are accepted (presumably HTTP/HTTPS -- confirm in IPProto).
        if (proto != 0 && proto != 1) {
            return 770;
        }
        // Non-standard ports are accepted only above 1000.
        if (port != IPProto.getStandardPort(proto) && port <= 1000) {
            return 770;
        }
        if (bucket.robotsExcludes(url)) {
            return 740;
        }
        if (webspace.isExcluded(url)) {
            return 780;
        }
        if (!webspace.isAllowed(url)) {
            return 760;
        }
        return 0;
    }
}

