/*
 * Decompiled with CFR 0.152.
 */
package com.ibm.es.nuvo.crawler.web.http;

import com.ibm.es.nuvo.common.ExtendedException;
import com.ibm.es.nuvo.common.Message;
import com.ibm.es.nuvo.crawler.web.bucket.Bucket;
import com.ibm.es.nuvo.crawler.web.bucket.DNSResolver;
import com.ibm.es.nuvo.crawler.web.bucket.FederatedBucketManager;
import com.ibm.es.nuvo.crawler.web.bucket.RobotsResolver;
import com.ibm.es.nuvo.crawler.web.config.FormBaseAuthenticator;
import com.ibm.es.nuvo.crawler.web.configuration.space.SoftErrorConfig;
import com.ibm.es.nuvo.crawler.web.cookie.CookieEntry;
import com.ibm.es.nuvo.crawler.web.db.CrawlRec;
import com.ibm.es.nuvo.crawler.web.db.tables.CookieTable;
import com.ibm.es.nuvo.crawler.web.error.GenericException;
import com.ibm.es.nuvo.crawler.web.error.OperationFailedException;
import com.ibm.es.nuvo.crawler.web.error.WCException;
import com.ibm.es.nuvo.crawler.web.http.Client;
import com.ibm.es.nuvo.crawler.web.http.URLSorter;
import com.ibm.es.nuvo.crawler.web.net.CURL;
import com.ibm.es.nuvo.crawler.web.parser.doc.Field;
import com.ibm.es.nuvo.crawler.web.parser.doc.HTMLParsedDocument;
import com.ibm.es.nuvo.crawler.web.parser.doc.Link;
import com.ibm.es.nuvo.crawler.web.parser.html.stream.HtmlStreamParser;
import com.ibm.es.nuvo.crawler.web.parser.javascript.JavaScriptHandler;
import com.ibm.es.nuvo.crawler.web.thread.WCRunnableImpl;
import com.ibm.es.nuvo.crawler.web.util.StreamUtils;
import com.ibm.es.nuvo.crawler.web.util.UnixTime;
import com.ibm.es.nuvo.logging.ExtendedLogger;
import com.ibm.es.nuvo.logging.Loggers;
import com.ibm.supa.config.AnalysisScopeConfig;
import com.ibm.supa.config.ConfigurationLoader;
import com.ibm.supa.web.RobotsMetaTagHandler;
import com.ibm.supa.web.WebConfig;
import java.io.CharConversionException;
import java.io.InputStream;
import java.net.InetSocketAddress;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.logging.Level;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
public class UserAgent
extends WCRunnableImpl {
    // Copyright notice kept as a string constant; not referenced by any code visible in this class.
    private static final String copyright = "IBM Confidential OCO Source Materials 5724-R21 \u00a9 Copyright IBM Corp.  2006, 2007.   All Rights Reserved. The source code for this program is not published or otherwise divested of its trade secrets, irrespective of what has been deposited with the U.S. Copyright Office.";
    // Shared logger used for message-keyed records (e.g. "C5008I.INVALID_REDIRECT_URL").
    private static ExtendedLogger logger = Loggers.logger;
    // Trace logger named "NuvoTracer." + this class name; every use in this class is guarded by isLoggable().
    private static final ExtendedLogger tracer = ExtendedLogger.getLogger("NuvoTracer." + UserAgent.class.getName());
    // Lookup sets built from the two arrays below in the static initializer; consulted by classifyURLByExtension().
    private static HashSet<String> pureTextExtentions;
    private static HashSet<String> nonHtmlExtentions;
    private static final String[] PURE_TEXT_EXTENSIONS;
    private static final String[] NON_HTML_EXTENSIONS;
    // Per-URL classification flags, reset and recomputed by classifyURLByExtension() on every download.
    private boolean isNotHTML;
    private boolean isText;
    // While false, _clientRun() sleeps instead of requesting crawl records.
    // NOTE(review): written via setActive() and read in the run loop without synchronization — confirm callers.
    private boolean m_active = true;
    // Accumulated milliseconds spent in the IDLE and WORKING states; maintained by changeStatus().
    private long idle;
    private long active;
    // Current state of the agent; see changeStatus().
    private AgentStatus status = AgentStatus.IDLE;
    // Lock handed to the download/resolve/save calls and exposed through getInterruptionLock().
    private Lock lock = new ReentrantLock();
    // Timestamp (System.currentTimeMillis) of the most recent status change.
    private long last = System.currentTimeMillis();
    // URL of the record currently inside _downloadAndParse(); reset to null when that method exits.
    private CURL m_url;

    /**
     * Creates an agent whose runnable name is {@code "UA" + serial} and whose
     * initial sub run level is {@code "WAITING"}.
     *
     * @param serial number appended to the {@code "UA"} prefix to form the name
     */
    public UserAgent(int serial) {
        super("UA" + serial);
        this.setSubRunLevel("WAITING");
    }

    /**
     * Reports whether this agent is allowed to pick up crawl records.
     *
     * @return the current value of the active flag
     */
    public boolean isActive() {
        return this.m_active;
    }

    /**
     * Enables or disables record processing. While disabled, the run loop
     * sleeps in 100 ms steps instead of requesting crawl records.
     *
     * @param active {@code true} to let the agent fetch records, {@code false} to pause it
     */
    public void setActive(boolean active) {
        this.m_active = active;
    }

    /**
     * Initialization hook: assigns the thread for this runnable using the
     * group label {@code "Ant"} and this agent's name.
     * NOTE(review): the exact meaning of the "Ant" argument is defined by the
     * superclass and is not visible here.
     *
     * @throws WCException declared by the overridden method
     */
    @Override
    public void _clientInit() throws WCException {
        this._assignThread("Ant", this.getName());
    }

    /**
     * Computes the fraction of measured time this agent has spent working.
     *
     * @return accumulated active time divided by active plus idle time, or
     *         {@code 0.0} when no time has been accounted yet
     */
    public double getLoad() {
        final long measured = this.idle + this.active;
        return measured == 0L ? 0.0 : (double)this.active / (double)measured;
    }

    /**
     * Main loop of the agent. Until {@code _shouldStopLooping()} reports true,
     * each iteration obtains a crawl record from the bucket manager, makes sure
     * its bucket has DNS and robots data, performs form-based authentication,
     * downloads and parses the document, and finally hands the record to the
     * {@link URLSorter}.
     *
     * <p>The {@code finally} clause is the single place where a record is
     * deregistered; it also forwards the record to the sorter unless the
     * iteration was marked invalid or the thread is flagged as interrupted.
     *
     * <p>Decompiler note (CFR): "Removed try catching itself - possible
     * behaviour change."
     *
     * @throws WCException rethrown unchanged after being traced at FINE
     */
    @Override
    public void _clientRun() throws WCException {
        while (!this._shouldStopLooping()) {
            CrawlRec cr = null;
            // Cleared whenever the record must not be passed to the URL sorter.
            boolean valid = true;
            try {
                this.changeStatus(AgentStatus.IDLE);
                if (this.m_active) {
                    this.setSubRunLevel("WAITING");
                    cr = FederatedBucketManager.instance().getCrawlRec();
                } else {
                    // Paused: leave cr null so the check below skips this round.
                    Thread.sleep(100L);
                }
                if (cr == null || cr.getConfig() == null || !cr.getConfig().isValid()) {
                    // The finally clause deregisters a non-null record; doing it
                    // here as well would deregister the same record twice.
                    valid = false;
                    continue;
                }
                if (tracer.isLoggable(Level.FINE)) {
                    tracer.fine("Start crawling " + cr.getURL().downstreamString());
                }
                this.changeStatus(AgentStatus.WORKING);
                if (!cr.isVerified()) {
                    // Unverified record: refresh DNS / robots data for its bucket
                    // and drop the record for this round without saving results.
                    Bucket bucket = cr.getBucket();
                    if (cr.getConfig().isValid() && bucket.needsDNSUpdate()) {
                        this.setSubRunLevel("DNS");
                        DNSResolver.instance().resolve(bucket, this.lock);
                    }
                    if (cr.getConfig().isValid() && bucket.getDNSFailCount() == 0 && bucket.needsRobotsUpdate()) {
                        this.setSubRunLevel("ROBOTS");
                        RobotsResolver.instance().downloadRobots(bucket, this.lock);
                        if (bucket.getRobotsFailCount() == 0 && cr.getConfig().isValid()) {
                            bucket.getManager().setRobotsTxt(bucket);
                        }
                    }
                    valid = false;
                    continue;
                }
                this.setSubRunLevel("FETCHING");
                // The configuration is re-checked before each expensive step.
                if (!cr.getConfig().isValid()) {
                    valid = false;
                    continue;
                }
                this.doFormeBaseAuthentication(cr);
                if (!cr.getConfig().isValid()) {
                    valid = false;
                    continue;
                }
                this._downloadAndParse(cr);
                if (cr.isTruncated() && !this.isTextFile(cr)) {
                    // Truncated non-text document: do not write it out and
                    // record status 2001 instead.
                    cr.setWriteRDS(false);
                    cr.setNewHTTPStatus(2001);
                }
            }
            catch (WCException wce) {
                if (tracer.isLoggable(Level.FINE)) {
                    tracer.log(Level.FINE, "clientRun()", wce);
                }
                throw wce;
            }
            catch (InterruptedException ie) {
                // NOTE(review): the interrupt is swallowed and the loop goes on;
                // this looks deliberate (interruption used to abort the current
                // operation) — confirm before changing.
                if (tracer.isLoggable(Level.FINER)) {
                    tracer.log(Level.FINER, "clientRun()", ie);
                }
                this.setSubRunLevel("COMPLETED");
            }
            finally {
                if (cr != null) {
                    this.setSubRunLevel("SAVING");
                    if (!Thread.currentThread().isInterrupted() && valid) {
                        URLSorter.instance().processCrawlResults(cr, this.lock);
                    }
                    cr.deregister();
                }
                this.setSubRunLevel("WAITING");
            }
        }
    }

    /**
     * Switches the agent to {@code newStatus} and books the time elapsed since
     * the previous switch. Time ending with a switch to IDLE counts as active
     * time; time ending with any other switch counts as idle time. A call that
     * does not change the status is a no-op.
     *
     * @param newStatus the status to switch to
     */
    private void changeStatus(AgentStatus newStatus) {
        if (this.status != newStatus) {
            final long timestamp = System.currentTimeMillis();
            final long elapsed = timestamp - this.last;
            if (newStatus == AgentStatus.IDLE) {
                this.active += elapsed;
            } else {
                this.idle += elapsed;
            }
            this.last = timestamp;
            this.status = newStatus;
        }
    }

    /**
     * Hands the content held by {@code client} over to the crawl record.
     *
     * @param cr     record that receives the content
     * @param client client that performed the download
     */
    private void _collectDocumentContent(CrawlRec cr, Client client) {
        cr.setContent(client.getContent());
    }

    /**
     * Copies the HTTP response headers from {@code client} to the crawl record
     * and extracts the individual headers the record tracks: content-language,
     * date, expires, last-modified, server and location.
     *
     * <p>A non-blank location header is turned into a redirect URL; when it is
     * not valid on its own but may be relative, it is resolved against the
     * record's URL. Only a valid result is stored.
     *
     * @param cr     record to populate
     * @param client client that performed the download
     */
    private void _collectHTTPHeaderProperties(CrawlRec cr, Client client) {
        Properties headers = client.getHTTPHeaderProperties();
        cr.setHeaderProperties(headers);
        if (headers == null) {
            return;
        }
        String language = headers.getProperty("content-language");
        if (language != null) {
            cr.setContentLanguage(language);
        }
        String serverDate = headers.getProperty("date");
        if (serverDate != null) {
            cr.setServerDate(serverDate);
        }
        String expires = headers.getProperty("expires");
        if (expires != null) {
            cr.setExpirationDate(expires);
        }
        String lastModified = headers.getProperty("last-modified");
        if (lastModified != null) {
            cr.setLastModifiedDate(lastModified);
        }
        String server = headers.getProperty("server");
        if (server != null) {
            cr.setServerID(server);
        }
        String location = headers.getProperty("location");
        if (location == null) {
            return;
        }
        location = location.trim();
        if (location.length() == 0) {
            return;
        }
        CURL redirect = new CURL(location);
        if (!redirect.isValid() && redirect.mayBeRelative()) {
            redirect = new CURL(cr.getURL(), location);
        }
        if (redirect.isValid()) {
            cr.setRedirectURL(redirect);
        }
    }

    /**
     * Recomputes {@code isText} and {@code isNotHTML} from the file extension
     * of the current URL's path. Both flags end up false when the path is
     * null, ends with a slash, or has no non-empty extension. A pure-text
     * extension sets only {@code isText}; otherwise {@code isNotHTML} reflects
     * membership in the non-HTML extension set. The comparison is
     * case-sensitive.
     */
    private void classifyURLByExtension() {
        String urlPath = this.m_url.getPath();
        this.isText = false;
        this.isNotHTML = false;
        if (urlPath == null || urlPath.endsWith("/")) {
            return;
        }
        int dot = urlPath.lastIndexOf('.');
        if (dot < 0 || dot >= urlPath.length() - 1) {
            return;
        }
        String suffix = urlPath.substring(dot + 1);
        if (pureTextExtentions.contains(suffix)) {
            this.isText = true;
        } else {
            this.isNotHTML = nonHtmlExtentions.contains(suffix);
        }
    }

    /**
     * Tells whether the current URL should go through the HTML parser based on
     * its extension classification.
     *
     * @return {@code true} unless the extension was classified as non-HTML
     */
    private boolean _doHTMLParse() {
        return !this.isNotHTML;
    }

    /**
     * Downloads the document for {@code cr}, copies status, headers and content
     * into the record and, when the content qualifies, parses it as HTML to
     * extract encoding, links, meta tags, redirects and JavaScript links.
     *
     * <p>Parsing happens when content is present and either the URL extension
     * is not classified as non-HTML or the reported content type starts with
     * {@code "text/"}. If the parser reports a character conversion problem the
     * stream is reopened and parsed again as {@code "iso8859-1"}.
     *
     * <p>Decompiler note (CFR): "Removed try catching itself - possible
     * behaviour change."
     *
     * @param cr record to download and populate
     * @throws WCException an {@link OperationFailedException} wrapping any
     *         non-{@code WCException} failure; {@code WCException}s raised
     *         while downloading or parsing are handed to
     *         {@code cr.handleDownloadException} instead of being propagated
     */
    private void _downloadAndParse(CrawlRec cr) throws WCException {
        if (tracer.isLoggable(Level.FINER)) {
            tracer.entering(this.getClass().getName(), "_downloadAndParse(CrawlRec)", new Object[]{cr});
        }
        try {
            Client client = new Client(cr);
            client.download(this.lock);
            this.m_url = cr.getURL();
            this.classifyURLByExtension();
            cr.setCrawlDate(UnixTime.now());
            String[] rv = client.getContentTypeAndEncoding();
            // A "text/*" content type forces parsing regardless of the extension.
            boolean forceParse = false;
            if (rv != null) {
                cr.setContentTypeLC(rv[0]);
                if (rv[0] != null && rv[0].startsWith("text/")) {
                    forceParse = true;
                }
                cr.setKnownCharset(rv[1]);
            }
            if (client.getHTTPHeader() == null || client.getHTTPReturnCode() < 0) {
                // No usable response: force return code 615.
                client.setForcedHTTPReturnCode((short)615);
            }
            cr.setNewHTTPStatus(client.getHTTPReturnCode());
            cr.setHTTPBasicAuthACLString(client.getHTTPBasicAuthACLString());
            cr.setTruncated(client.isTruncated() && !forceParse);
            this._collectHTTPHeaderProperties(cr, client);
            this._collectDocumentContent(cr, client);
            this.setSubRunLevel("PARSING");
            HTMLParsedDocument pd = null;
            if (client.getContent() != null && (this._doHTMLParse() || forceParse)) {
                if (tracer.isLoggable(Level.FINER)) {
                    tracer.finer("HTML parsing: " + this.m_url);
                }
                InputStream is = null;
                try {
                    is = client.getContent().getInputStream();
                    try {
                        pd = HtmlStreamParser.parse(is, this.m_url.toString(), cr.getEncoding());
                    }
                    catch (CharConversionException e) {
                        // Retry from the start of the content with a fallback encoding.
                        StreamUtils.safeClose(is);
                        is = client.getContent().getInputStream();
                        pd = HtmlStreamParser.parse(is, this.m_url.toString(), "iso8859-1");
                    }
                }
                finally {
                    StreamUtils.safeClose(is);
                }
                if (pd != null) {
                    // Tracing lives inside the null guard so that a null parse
                    // result cannot raise a NullPointerException when FINEST
                    // tracing is enabled.
                    if (tracer.isLoggable(Level.FINEST)) {
                        tracer.finest("Links " + pd.getLinks());
                        tracer.finest("Frame " + pd.frames());
                        tracer.finest("Redirect " + pd.getRedirectURL());
                    }
                    cr.setEncoding(pd.getEncoding());
                    cr.setRss(pd.isFeed());
                    cr.setHtml(pd.isHtml());
                    cr.setLongHash(pd.getHash());
                    this._handleBaseNRedirectURL(cr, pd.getBase(), pd.getRedirectURL());
                    this._handleHTMLDocumentOutLinks(cr, pd.getLinks(), pd.frames());
                    this._handleMetaTags(cr, pd);
                    this._handleSoftErrorPage(cr, client, pd.getTitle());
                    this.handleJavaSciptScan(cr, pd.getLinks());
                    pd = null;
                }
            } else if (tracer.isLoggable(Level.FINER)) {
                tracer.finer("HTML not parsed: " + this.m_url + " " + this._doHTMLParse());
            }
            cr.metadataProcessing();
            cr.getConfig().getRule().handleDomino(cr);
        }
        catch (WCException wce) {
            cr.handleDownloadException(wce);
        }
        catch (Exception e) {
            throw new OperationFailedException("DP " + this.m_url, (Throwable)e);
        }
        finally {
            this.m_url = null;
        }
    }

    /**
     * Applies the document's base URL and redirect target to the crawl record.
     *
     * <p>A valid base URL is stored on the record; an invalid one is traced at
     * FINE. The redirect target is then resolved: if it is not valid on its own
     * but may be relative, it is resolved against the record's base URL and,
     * failing that, against the record's own URL. An unresolvable target is
     * reported with message key {@code C5008I.INVALID_REDIRECT_URL}. Only a
     * valid redirect URL is stored.
     *
     * @param cr          record to update
     * @param baseUrl     base URL found in the document, may be {@code null}
     * @param redirectUrl redirect target found in the document, may be {@code null}
     */
    private void _handleBaseNRedirectURL(CrawlRec cr, String baseUrl, String redirectUrl) {
        if (baseUrl == null) {
            // NOTE(review): without a base URL the redirect target is ignored
            // as well. This mirrors the existing behaviour; confirm that it is
            // intended.
            return;
        }
        CURL base = new CURL(baseUrl.trim());
        if (base.isValid()) {
            cr.setBaseURL(base);
        } else if (tracer.isLoggable(Level.FINE)) {
            tracer.log(Level.FINE, "Invalid base URL " + base);
        }
        if (redirectUrl == null) {
            return;
        }
        String target = redirectUrl.trim();
        CURL redirect = new CURL(target);
        if (!redirect.isValid() && redirect.mayBeRelative()) {
            CURL resolved = null;
            if (cr.getBaseURL() != null) {
                resolved = new CURL(cr.getBaseURL(), target);
            }
            if (resolved == null || !resolved.isValid()) {
                resolved = new CURL(cr.getURL(), target);
            }
            if (resolved.isValid()) {
                redirect = resolved;
            } else if (logger.isLoggable(Level.INFO)) {
                logger.log(Level.INFO, "C5008I.INVALID_REDIRECT_URL", new Object[]{redirect, cr.getURL().toString()});
            }
        }
        if (redirect.isValid()) {
            cr.setRedirectURL(redirect);
        }
    }

    /**
     * Performs form-based authentication (FBA) for the URL of {@code cr} when
     * the crawler configuration defines an authenticator for it and the bucket
     * has not completed that authentication yet.
     *
     * <p>Steps, all executed while holding the bucket's FBA lock:
     * <ol>
     *   <li>fetch the logon page, following at most two redirects, and process
     *       its meta tags (which records any cookies it sets);</li>
     *   <li>submit the FBA form to its action URL, resolving DNS for the target
     *       bucket when it has no address yet;</li>
     *   <li>mark the authentication as done on the bucket.</li>
     * </ol>
     *
     * <p>Any failure is logged as warning {@code C4912W.FBA_FAIL} and otherwise
     * ignored, so the caller continues with the regular download.
     *
     * <p>Decompiler note (CFR): "Removed try catching itself - possible
     * behaviour change." The clean-up of the logon-page resources is therefore
     * written as an explicit {@code finally} so it also runs when parsing
     * fails.
     *
     * @param cr record about to be downloaded
     */
    private void doFormeBaseAuthentication(CrawlRec cr) {
        if (tracer.isLoggable(Level.FINER)) {
            tracer.entering(this.getClass().getName(), "_handleFBA2(CrawlRec)", new Object[]{cr});
        }
        CURL url = cr.getURL();
        FormBaseAuthenticator fba = cr.getConfig().getAgent().getFormBaseAuthenticator(url);
        if (fba == null) {
            return;
        }
        Bucket bucket = cr.getBucket();
        ReentrantLock fbaLock = bucket.getFbaLock();
        // Acquire before entering the try so that unlock() in the finally
        // clause only ever runs when the lock is actually held.
        fbaLock.lock();
        try {
            if (bucket.isFBADone(fba.getDefID())) {
                return;
            }
            CrawlRec fbaCrawlRec = null;
            try {
                CURL logonURL = url;
                CookieTable.getInstance(cr.getCrawlerId()).ignoreCookie(logonURL);
                int maxRedirect = 2;
                while (maxRedirect-- > 0) {
                    if (tracer.isLoggable(Level.FINER)) {
                        tracer.finer("trying root page " + logonURL);
                    }
                    InputStream is = null;
                    Client rootClient = null;
                    CrawlRec rootCrawlRec = null;
                    CURL redirectURL;
                    try {
                        rootCrawlRec = new CrawlRec();
                        rootCrawlRec.setURL(logonURL);
                        rootCrawlRec.setBucket(bucket);
                        rootClient = new Client(rootCrawlRec);
                        rootClient.download(this.lock);
                        redirectURL = rootClient.getRedirectURL();
                        if (redirectURL == null) {
                            // Final logon page reached: parse it so that its
                            // meta tags are processed.
                            String[] rv = rootClient.getContentTypeAndEncoding();
                            rootCrawlRec.setEncoding(rv[1]);
                            is = rootClient.getContent().getInputStream();
                            HTMLParsedDocument pd = HtmlStreamParser.parse(is, logonURL.toString(), rootCrawlRec.getEncoding());
                            this._handleMetaTags(rootCrawlRec, pd);
                        }
                    }
                    finally {
                        // Runs on success, redirect and failure alike.
                        StreamUtils.safeClose(is);
                        if (rootClient != null) {
                            rootClient.clear();
                        }
                        if (rootCrawlRec != null) {
                            rootCrawlRec.deregister();
                        }
                    }
                    if (redirectURL != null) {
                        logonURL = redirectURL;
                        continue;
                    }
                    if (tracer.isLoggable(Level.FINEST)) {
                        // NOTE(review): traced after clear(), as before; confirm
                        // the content is still available at this point.
                        tracer.finest("root content\n" + rootClient.getContentString());
                    }
                    break;
                }
                if (tracer.isLoggable(Level.FINER)) {
                    tracer.finer("submitting FBA form " + fba);
                }
                CURL fbaURL = fba.getFormActionURL(url);
                if (!fbaURL.isValid()) {
                    throw new GenericException("Invalid URL " + fbaURL);
                }
                Bucket fbaBucket = bucket.getManager().bucketForURL(fbaURL);
                if (fbaBucket == null) {
                    throw new GenericException("No bucket for " + fbaURL);
                }
                Iterator<InetSocketAddress> it = fbaBucket.getIP();
                if (!it.hasNext()) {
                    // No address known yet: resolve once and look again.
                    DNSResolver.instance().resolve(fbaBucket, this.lock);
                    it = fbaBucket.getIP();
                    if (!it.hasNext()) {
                        throw new GenericException("Has no valid IP for " + fbaURL.getHostname());
                    }
                }
                fbaCrawlRec = new CrawlRec();
                fbaCrawlRec.setURL(fbaURL);
                fbaCrawlRec.setBucket(fbaBucket);
                Client fbaClient = null;
                try {
                    fbaClient = new Client(fbaCrawlRec);
                    fbaClient.setFBAJob(fba);
                    fbaClient.setReferer(url);
                    fbaClient.download(this.lock);
                }
                finally {
                    if (fbaClient != null) {
                        fbaClient.clear();
                    }
                }
                if (tracer.isLoggable(Level.FINEST)) {
                    tracer.finest("FBA content\n" + fbaClient.getContentString());
                }
                bucket.setFBARecord(fba.getDefID());
            }
            catch (Throwable e) {
                // Best effort by design: report the failure and let the caller
                // go on with the normal download.
                Message message = new Message("C4912W.FBA_FAIL", url.toString());
                ExtendedException exception = new ExtendedException(message, e);
                logger.log(Level.WARNING, exception);
            }
            finally {
                if (fbaCrawlRec != null) {
                    fbaCrawlRec.deregister();
                }
            }
        }
        finally {
            fbaLock.unlock();
        }
    }

    /**
     * Processes the metadata fields of a parsed document and then stores the
     * complete field list on the crawl record.
     *
     * <ul>
     *   <li>{@code set-cookie} values are added to the crawler's cookie table;</li>
     *   <li>{@code set-cookie2} is reported as unsupported;</li>
     *   <li>fields the configuration identifies as ACL fields contribute their
     *       values as security ACLs;</li>
     *   <li>a {@code robots} field (any case) sets the no-index / no-follow
     *       flags and is passed, lower-cased as {@code name=v1,v2,...}, to the
     *       configured robots meta tag handler, if any.</li>
     * </ul>
     *
     * <p>Fields without values are skipped individually, so a single empty
     * field does not prevent later fields from being processed.
     *
     * @param cr record to update
     * @param pd parsed document providing the fields
     */
    private void _handleMetaTags(CrawlRec cr, HTMLParsedDocument pd) {
        for (Field field : pd.getFields()) {
            String fieldName = field.getName();
            String[] fieldValues = field.getValues();
            if (fieldValues == null || fieldValues.length == 0) {
                // Skip only this field; a robots tag may still follow.
                continue;
            }
            if (fieldName.equals("set-cookie")) {
                for (String cookie : fieldValues) {
                    CookieTable.getInstance(cr.getCrawlerId()).update(CookieEntry.newEntry(cr.getURL(), cookie));
                }
                continue;
            }
            if (fieldName.equals("set-cookie2")) {
                logger.log(Level.INFO, "C5022I.UNSUPPORTED_COOKIE_FOUND");
                continue;
            }
            if (cr.getConfig().getAcl().isACL(fieldName)) {
                for (String acl : fieldValues) {
                    cr.addSecurityACLs(acl);
                }
            }
            if (!fieldName.equalsIgnoreCase("robots")) {
                continue;
            }
            // Build "name=v1,v2,..." for keyword matching and for the handler.
            StringBuilder thisField = new StringBuilder();
            thisField.append(fieldName);
            thisField.append("=");
            for (int j = 0; j < fieldValues.length; ++j) {
                if (j > 0) {
                    thisField.append(",");
                }
                thisField.append(fieldValues[j]);
            }
            // Fixed locale: with the default locale, "NOINDEX" would not
            // lower-case to "noindex" under e.g. a Turkish locale.
            String thisFieldLC = thisField.toString().toLowerCase(java.util.Locale.ENGLISH);
            // The string always starts with the field name, so a keyword can
            // never be found at index 0.
            if (thisFieldLC.indexOf("none") > 0) {
                cr.setNoFollow(true);
                cr.setNoIndex(true);
            } else {
                if (thisFieldLC.indexOf("noindex") > 0) {
                    cr.setNoIndex(true);
                    cr.setWriteRDS(false);
                }
                if (thisFieldLC.indexOf("nofollow") > 0) {
                    cr.setNoFollow(true);
                }
            }
            String collectionId = cr.getConfig().getCollectionId();
            AnalysisScopeConfig config = ConfigurationLoader.getAnalysisScopeConfig(collectionId);
            if (config == null) {
                continue;
            }
            WebConfig webConfig = config.getWebConfig();
            if (webConfig == null) {
                continue;
            }
            RobotsMetaTagHandler handler = webConfig.getRobotsMetaTagHandler();
            if (handler != null) {
                handler.handle(thisFieldLC, cr);
            }
        }
        cr.setMetadataFields(pd.getFields());
    }

    /**
     * Collects the outgoing link specifications of a parsed document and
     * stores them on the crawl record. Followed links contribute their trimmed
     * relative address when it is non-empty; frame sources are added as they
     * are when non-empty. Either list may be {@code null}.
     *
     * @param cr     record that receives the out-links
     * @param links  links found in the document, may be {@code null}
     * @param frames frame sources found in the document, may be {@code null}
     */
    private void _handleHTMLDocumentOutLinks(CrawlRec cr, List<Link> links, List<String> frames) {
        ArrayList<String> collected = new ArrayList<String>();
        if (links != null) {
            for (Link candidate : links) {
                if (!candidate.isFollowed()) {
                    continue;
                }
                String address = candidate.getRelativeAddress();
                if (address == null) {
                    continue;
                }
                String trimmed = address.trim();
                if (trimmed.length() > 0) {
                    collected.add(trimmed);
                }
            }
        }
        if (frames != null) {
            for (String frameSpec : frames) {
                if (frameSpec != null && frameSpec.length() > 0) {
                    collected.add(frameSpec);
                }
            }
        }
        cr.setOutLinks(collected);
    }

    /**
     * Asks the rule's soft-error configuration whether the downloaded page is
     * an error page served with a success status. A positive replacement code
     * becomes the record's new HTTP status and the record's content is
     * dropped. If evaluating with the record's encoding raises a character
     * conversion error, the check is repeated with {@code "ISO8859-1"}.
     *
     * <p>Any other failure is logged as warning {@code C4926W.SOFT_ERROR_FAIL}
     * and ignored.
     *
     * @param cr     record being processed
     * @param client client holding the downloaded content
     * @param title  title of the parsed document
     */
    private void _handleSoftErrorPage(CrawlRec cr, Client client, String title) {
        try {
            short code;
            SoftErrorConfig softErrorConfig = cr.getConfig().getRule().getSoftErrorConfig();
            try {
                code = softErrorConfig.getReplacementCode(cr.getURL(), title, client.getContent(), cr.getEncoding());
            }
            catch (CharConversionException e) {
                code = softErrorConfig.getReplacementCode(cr.getURL(), title, client.getContent(), "ISO8859-1");
            }
            if (code > 0) {
                // Guard on the level the message is emitted at; the former
                // FINE guard built the message without it being logged.
                if (tracer.isLoggable(Level.FINER)) {
                    // Pass the code as text so MessageFormat does not apply
                    // locale-specific digit grouping (e.g. "2,001").
                    tracer.finer(MessageFormat.format("ReplacementCode for url {0} is {1}.", cr.getURL().toString(), String.valueOf(code)));
                }
                cr.setNewHTTPStatus(code);
                cr.setContent(null);
            }
        }
        catch (Exception e) {
            logger.log(Level.WARNING, "C4926W.SOFT_ERROR_FAIL", e);
        }
    }

    /**
     * Decides whether the record is a plain text document: its URL extension
     * must have been classified as text, and a reported content type, if any,
     * must start with {@code "text/"}.
     *
     * @param cr record whose content type is inspected
     * @return {@code true} when the document counts as text
     */
    private boolean isTextFile(CrawlRec cr) {
        boolean declaredNonText = cr.getContentTypeLC() != null && !cr.getContentTypeLC().startsWith("text/");
        return declaredNonText ? false : this.isText;
    }

    /**
     * Adds links discovered by the JavaScript scanner to the record's
     * out-links. Addresses already present among the document's regular links
     * are removed from the scanner result first; the record is only updated
     * when at least one additional link remains.
     *
     * @param cr         record whose content is scanned and whose out-links are extended
     * @param knownLinks links already extracted by the HTML parser; may be
     *                   {@code null}, in which case nothing is filtered out
     */
    private void handleJavaSciptScan(CrawlRec cr, List<Link> knownLinks) {
        List<CURL> outLinks = cr.getOutLinks();
        if (outLinks == null) {
            outLinks = new ArrayList<CURL>();
        }
        ArrayList<String> jsOutLinks = JavaScriptHandler.getLinks(this.m_url, cr.getBaseURL(), cr.getContent(), cr.getEncoding());
        if (jsOutLinks == null) {
            return;
        }
        // The parser's link list is treated as nullable by the regular
        // out-link handling, so it is guarded here as well.
        if (knownLinks != null) {
            for (Link link : knownLinks) {
                jsOutLinks.remove(link.getRelativeAddress());
            }
        }
        if (jsOutLinks.isEmpty()) {
            return;
        }
        ArrayList<Object> list = new ArrayList<Object>();
        list.addAll(outLinks);
        list.addAll(jsOutLinks);
        cr.setOutLinks(list);
    }

    /**
     * Returns the agent's current status.
     *
     * @return the status most recently set by the run loop
     */
    public AgentStatus getStatus() {
        return this.status;
    }

    /**
     * Stop hook; intentionally empty, this class performs no work here.
     *
     * @throws Exception declared by the overridden method, never thrown here
     */
    @Override
    protected void _clientStop() throws Exception {
    }

    /**
     * Returns the lock this agent passes to its download, DNS/robots
     * resolution and result-saving calls.
     * NOTE(review): presumably held by those calls around sections that must
     * not be interrupted — confirm against the callees.
     *
     * @return this agent's lock instance
     */
    public Lock getInterruptionLock() {
        return this.lock;
    }

    // Defines the extension tables and builds the lookup sets used by
    // classifyURLByExtension().
    static {
        PURE_TEXT_EXTENSIONS = new String[]{"htm", "html", "txt"};
        NON_HTML_EXTENSIONS = new String[]{"doc", "pdf", "prz", "ps", "ppt", "zip", "tar", "tgz", "gz"};
        pureTextExtentions = new HashSet<String>(Arrays.asList(PURE_TEXT_EXTENSIONS));
        nonHtmlExtentions = new HashSet<String>(Arrays.asList(NON_HTML_EXTENSIONS));
    }

    /**
     * Activity state of a {@link UserAgent}, used for load accounting.
     *
     * <p>Decompiler note (CFR): this class specifies class file version 49.0
     * but uses Java 6 signatures; Java 6 is assumed.
     */
    public static enum AgentStatus {
        // Set by the run loop once a valid crawl record is about to be processed.
        WORKING,
        // Set at the start of every loop iteration; also the initial state.
        IDLE;

    }
}

