/*
 * Decompiled with CFR 0.152.
 */
package com.ibm.es.nuvo.parser.html;

import com.ibm.es.nuvo.common.Metadata;
import com.ibm.es.nuvo.parser.AbstractTextParser;
import com.ibm.es.nuvo.parser.MalformedEncodingException;
import com.ibm.es.nuvo.parser.MalformedFormatException;
import com.ibm.es.nuvo.parser.ParserException;
import com.ibm.es.nuvo.parser.ParserHandler;
import com.ibm.es.nuvo.parser.ParserIOException;
import com.ibm.es.nuvo.parser.SAXHandler;
import com.ibm.es.nuvo.parser.SAXHandlerException;
import com.ibm.es.nuvo.parser.html.AnchorCleanser;
import com.ibm.es.nuvo.parser.html.HTMLCharsetDetector;
import com.ibm.es.nuvo.parser.html.HTMLCleanser;
import java.io.CharConversionException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

/**
 * {@link AbstractTextParser} implementation for HTML documents.
 *
 * <p>For every document the parser decides which character set to use,
 * configures the underlying SAX {@link XMLReader} accordingly, and routes the
 * SAX events through an {@link HTMLCleanser} and an {@link AnchorCleanser}
 * before they reach the caller's {@link ParserHandler}.
 *
 * <p>The feature and property identifiers used to configure the reader are
 * NekoHTML (cyberneko.org) settings.
 *
 * <p>NOTE(review): no code in this class assigns the private {@code parser}
 * field. Unless it is populated by some mechanism that is not visible here,
 * {@link #parseText} cannot succeed; it reports that condition explicitly.
 */
public class HTMLParser extends AbstractTextParser {
    /** Buffer size used when {@value #PARAM_BUFFER_SIZE} is absent or unparsable. */
    private static final int DEFAULT_BUFFER_SIZE = 16384;
    /** Name of the configuration property that overrides the buffer size. */
    private static final String PARAM_BUFFER_SIZE = "buffer.size";
    private static final String copyright = "IBM Confidential OCO Source Materials 5724-R21 \u00a9 Copyright IBM Corp.  2006, 2007.   All Rights Reserved. The source code for this program is not published or otherwise divested of its trade secrets, irrespective of what has been deposited with the U.S. Copyright Office.";
    /** Content type reported for documents unless overridden by {@link #setCurrentContentType}. */
    static final String CONTENT_TYPE = "text/html";
    /** Reader feature: when true, a charset declared inside the document is ignored. */
    private static final String IGNORE_SPECIFIED_CHARSET = "http://cyberneko.org/html/features/scanner/ignore-specified-charset";
    /** Reader property: encoding to assume when the document does not declare one. */
    private static final String DEFAULT_ENCODING = "http://cyberneko.org/html/properties/default-encoding";

    /** SAX-side handler created in {@link #open}; receives URL, metadata and the output handler per document. */
    private SAXHandler handler;
    /** SAX reader that performs the actual parse. NOTE(review): never assigned in this class. */
    private XMLReader parser;
    /** Detects a charset from the buffered head of the document. */
    private HTMLCharsetDetector detector;
    /** Playback buffer shared by all documents parsed through this instance. */
    private AbstractTextParser.PlaybackInputStream.Buffer buffer;
    /** Content type written to the {@code doctype} metadata entry and passed to the cleanser. */
    private String currentContentType = CONTENT_TYPE;

    /**
     * Overrides the content type recorded for subsequently parsed documents.
     *
     * @param contentType the content type to report
     */
    public void setCurrentContentType(String contentType) {
        this.currentContentType = contentType;
    }

    /**
     * Returns the content type currently recorded for parsed documents.
     *
     * @return the current content type; {@value #CONTENT_TYPE} unless overridden
     */
    public String getCurrentContentType() {
        return this.currentContentType;
    }

    /**
     * Initializes the parser: delegates to the superclass, then creates the SAX
     * handler, the charset detector and the playback buffer.
     *
     * @param p configuration; may carry {@value #PARAM_BUFFER_SIZE} to override
     *          the default buffer size of {@value #DEFAULT_BUFFER_SIZE}
     */
    public void open(Properties p) {
        super.open(p);
        this.handler = this.createSAXHandler();
        this.detector = new HTMLCharsetDetector();
        int bufferSize = HTMLParser.safeGetParameterAsInt(p, PARAM_BUFFER_SIZE, DEFAULT_BUFFER_SIZE);
        this.buffer = new AbstractTextParser.PlaybackInputStream.Buffer(bufferSize);
    }

    /**
     * Reads an integer property without ever throwing.
     *
     * @param params       the properties to read from; may be {@code null}
     * @param name         the property name
     * @param defaultValue the value to return when the property is missing or
     *                     is not a valid integer
     * @return the parsed value, or {@code defaultValue}
     */
    private static final int safeGetParameterAsInt(Properties params, String name, int defaultValue) {
        if (params == null) {
            return defaultValue;
        }
        String v = params.getProperty(name);
        if (v == null) {
            return defaultValue;
        }
        try {
            // Properties.load keeps trailing whitespace in values, so "8192 "
            // must be trimmed or it would be rejected by parseInt.
            return Integer.parseInt(v.trim());
        }
        catch (NumberFormatException ignored) {
            // Deliberate best effort: a malformed value falls back to the default.
            return defaultValue;
        }
    }

    /**
     * Parses one HTML document and forwards the result to {@code h}.
     *
     * <p>Charset selection:
     * <ul>
     * <li>If the metadata carries {@code KnownCharset}, that value is used and
     *     any charset declared inside the document is ignored.</li>
     * <li>Otherwise the buffered head of the stream is passed to the charset
     *     detector; its result wins over the metadata's {@code Charset} entry.
     *     If neither yields a value, the reader default is set to
     *     {@code Windows-1252} and the raw byte stream is handed to the
     *     reader.</li>
     * </ul>
     * The chosen charset, when there is one, is written back to the metadata
     * under the key {@code charset}.
     *
     * @param uri             identifier of the document; used in error reports
     *                        and as the URL when the metadata has none
     * @param metadata        document metadata; may be {@code null}
     * @param input           the document bytes
     * @param h               receiver of the parsed content
     * @param detectedCharset not consulted by this implementation
     * @throws ParserException if reading, decoding or parsing fails
     */
    public void parseText(String uri, Metadata metadata, InputStream input, ParserHandler h, String detectedCharset) throws ParserException {
        String url = null;
        String charset = null;
        String knownCharset = null;
        if (metadata != null) {
            metadata.set("doctype", this.currentContentType);
            url = metadata.get("URL");
            charset = metadata.get("Charset");
            knownCharset = metadata.get("KnownCharset");
        }
        if (url == null || url.length() == 0) {
            url = uri;
        }
        this.handler.setURL(uri, url);
        this.handler.setMetadata(metadata);
        // Wrap the caller's handler: HTMLCleanser is outermost, AnchorCleanser sits inside it.
        h = new AnchorCleanser(h);
        h = new HTMLCleanser(h, this.currentContentType);
        this.handler.setParserHandler(h);
        try {
            if (this.parser == null) {
                // Surfaces as a ParserException via the catch-all below, as the
                // bare NullPointerException did before, but with a usable message.
                throw new IllegalStateException("HTML parser has not been initialized");
            }
            if (knownCharset != null) {
                // knownCharset can only be non-null when metadata is non-null.
                this.parser.setFeature(IGNORE_SPECIFIED_CHARSET, true);
                this.parser.setProperty(DEFAULT_ENCODING, knownCharset);
                metadata.set("charset", knownCharset);
                charset = knownCharset;
            } else {
                // Buffer the head of the stream so the detector can inspect it
                // and the reader can still consume the document from the start.
                AbstractTextParser.PlaybackInputStream pin = new AbstractTextParser.PlaybackInputStream(input, this.buffer);
                String metaCharset = this.detector.detect(pin.getBufferedData());
                input = pin;
                if (metaCharset != null) {
                    charset = metaCharset;
                }
                this.parser.setFeature(IGNORE_SPECIFIED_CHARSET, false);
                if (charset != null) {
                    this.parser.setProperty(DEFAULT_ENCODING, charset);
                    if (metadata != null) {
                        metadata.set("charset", charset);
                    }
                } else {
                    this.parser.setProperty(DEFAULT_ENCODING, "Windows-1252");
                }
            }
            InputSource source;
            if (charset != null) {
                source = new InputSource(this.createReader(input, charset));
            } else {
                source = new InputSource(input);
            }
            this.parser.parse(source);
        }
        catch (CharConversionException e) {
            throw new MalformedEncodingException(uri, (Throwable)e);
        }
        catch (IOException e) {
            throw new ParserIOException(uri, (Throwable)e);
        }
        catch (SAXHandlerException e) {
            // Presumably the cause is the handler's own ParserException; rethrow
            // it directly. Without a cause, "throw null" would raise an
            // undeclared NullPointerException, so wrap the exception instead.
            if (e.getCause() == null) {
                throw new ParserException(uri, (Throwable)e);
            }
            throw e.getCause();
        }
        catch (SAXException e) {
            throw new MalformedFormatException(uri, (Throwable)e);
        }
        catch (Throwable e) {
            // Keep an already specific parser failure intact instead of
            // burying it inside a generic wrapper.
            if (e instanceof ParserException) {
                throw (ParserException)e;
            }
            throw new ParserException(uri, e);
        }
    }

    /**
     * Does nothing; this class performs no cleanup of its own on close.
     */
    public void close() {
    }
}

