/*
 * Decompiled with CFR 0.152.
 */
package com.ibm.es.nuvo.tokenizer.annotators.ngram;

import com.ibm.es.nuvo.tokenizer.annotators.ngram.CharArrayIterator;
import com.ibm.es.nuvo.tokenizer.annotators.ngram.CharClassifier;
import com.ibm.es.nuvo.tokenizer.annotators.ngram.Dictionary;
import com.ibm.es.nuvo.tokenizer.annotators.ngram.NgramTokenizerConfig;
import com.ibm.es.nuvo.tokenizer.annotators.ngram.RotatedCharBuffer;
import java.util.EnumSet;

public class NgramTokenizer {
    private static final String copyright = "IBM Confidential OCO Source Materials 5724-R21 \u00a9 Copyright IBM Corp.  2006, 2007.   All Rights Reserved. The source code for this program is not published or otherwise divested of its trade secrets, irrespective of what has been deposited with the U.S. Copyright Office.";

    /** Covered text (a single U+0020 space code point) reported for NEWLINE tokens. */
    private static final int[] BLANK = new int[]{32};

    /**
     * Character pushed into the rotated buffer where n-gram text meets a word or a
     * sentence end. The initializer ('@', code 64) is always overwritten by the
     * constructor with the configured value.
     */
    private char boundaryChar = (char)64;

    /** Walks the document and exposes index, code point and classification of each character. */
    private final CharArrayIterator iterator;

    /** Character classifier built from the configured n-gram / ignore options. */
    private final CharClassifier cc;

    /** Rotating buffer from which NGRAM tokens are produced. */
    private final RotatedCharBuffer rb;

    /** True when the configuration supplied a runtime configuration (wildcards enabled). */
    private final boolean isRuntime;

    /** Multi-character wildcard code point; only assigned when a runtime configuration is present. */
    private int multiWildcard;

    /** Reusable adapter that filters tokens and forwards them to the caller's handler. */
    private final TokenHandler handler;

    /**
     * Creates a tokenizer from the given configuration.
     *
     * <p>The n-gram categories, ignore-whitespace categories and the two "ignore"
     * switches of the configuration are translated into {@link CharClassifier.Option}
     * flags. When the configuration carries a runtime configuration, its single and
     * multi wildcard characters are registered with the classifier and the tokenizer
     * operates in runtime mode.
     *
     * @param config tokenizer configuration; must not be {@code null}
     */
    public NgramTokenizer(NgramTokenizerConfig config) {
        this.boundaryChar = config.getBoundaryChar();

        NgramTokenizerConfig.RuntimeConfig runtimeConfig = config.getRuntimeConfig();
        this.isRuntime = runtimeConfig != null;

        // Which character categories are split into n-grams.
        EnumSet<NgramTokenizerConfig.CharCategory> ngramCategories = config.getNgramCategories();
        EnumSet<CharClassifier.Option> classifierOptions = EnumSet.noneOf(CharClassifier.Option.class);
        if (ngramCategories.contains(NgramTokenizerConfig.CharCategory.ALPHABET)) {
            classifierOptions.add(CharClassifier.Option.NGRAM_ALPHA);
        }
        if (ngramCategories.contains(NgramTokenizerConfig.CharCategory.NUMBER)) {
            classifierOptions.add(CharClassifier.Option.NGRAM_NUMBER);
        }
        if (ngramCategories.contains(NgramTokenizerConfig.CharCategory.OTHER)) {
            classifierOptions.add(CharClassifier.Option.NGRAM_OTHER);
        }

        // Which character categories have their whitespace ignored.
        EnumSet<NgramTokenizerConfig.CharCategory> ignoreSpaceCategories = config.getIgnoreWhitespaceCategories();
        if (ignoreSpaceCategories.contains(NgramTokenizerConfig.CharCategory.ALPHABET)) {
            classifierOptions.add(CharClassifier.Option.IGNORE_SPACE_ALPHA);
        }
        if (ignoreSpaceCategories.contains(NgramTokenizerConfig.CharCategory.NUMBER)) {
            classifierOptions.add(CharClassifier.Option.IGNORE_SPACE_NUMBER);
        }
        if (ignoreSpaceCategories.contains(NgramTokenizerConfig.CharCategory.OTHER)) {
            classifierOptions.add(CharClassifier.Option.IGNORE_SPACE_OTHER);
        }

        // Global switches.
        if (config.isIgnoreSentenceBreakers()) {
            classifierOptions.add(CharClassifier.Option.IGNORE_SENTENCE_BREAKERS);
        }
        if (config.isIgnorePunctuation()) {
            classifierOptions.add(CharClassifier.Option.IGNORE_PUNCTUATION);
        }

        this.cc = new CharClassifier(classifierOptions);
        if (runtimeConfig != null) {
            // Runtime mode: teach the classifier about the wildcard characters.
            char single = runtimeConfig.getSingleWildcard();
            this.multiWildcard = runtimeConfig.getMultiWildcard();
            this.cc.setWildcard(single, this.multiWildcard);
        }

        this.iterator = new CharArrayIterator(this.cc);
        this.rb = new RotatedCharBuffer(config.getGram());
        this.handler = new TokenHandler(config.getGram());
    }

    /**
     * Tokenizes a document and reports every token to the given handler.
     *
     * <p>A {@code null} or empty document produces no tokens and leaves the
     * tokenizer untouched.
     *
     * @param doc  characters of the document to tokenize; may be {@code null}
     * @param h    receiver of the produced tokens
     * @param dict dictionary handed to the character iterator together with the document
     */
    public void process(char[] doc, Handler h, Dictionary dict) {
        boolean hasText = doc != null && doc.length > 0;
        if (hasText) {
            this.iterator.setArray(doc, 0, doc.length, dict);
            this.handler.reset(h, doc);
            this.process(this.handler);
        }
    }

    /**
     * Main tokenization loop. Walks the text previously loaded into the iterator,
     * classifies each character and reports NEWLINE/PARAGRAPH, WORD, PUNCTUATION,
     * MATH_SYMBOL, NGRAM and SENTENCE tokens to {@code tokenHandler}.
     *
     * @param tokenHandler adapter that receives every token produced by the loop
     */
    private void process(TokenHandler tokenHandler) {
        this.rb.reset();
        // One-slot pushback: the attribute of a character that a branch below read
        // ahead but did not consume. When set, the loop handles it instead of
        // advancing the iterator (the iterator is still positioned on that character).
        CharClassifier.Attr stack = null;
        while (stack != null || this.iterator.next()) {
            CharClassifier.Attr attr = stack != null ? stack : this.iterator.getAttr();
            stack = null;
            // Index of the character that starts the token built in this iteration.
            int i = this.iterator.getIndex();
            if (attr.isNewLine()) {
                // A newline yields a NEWLINE token (covered text is a single blank)
                // followed by a zero-length PARAGRAPH marker right after it.
                tokenHandler.addToken(Type.NEWLINE, this.iterator.getIndex(), this.iterator.getNextIndex(), BLANK, null);
                tokenHandler.addToken(Type.PARAGRAPH, this.iterator.getNextIndex(), this.iterator.getNextIndex(), null, null);
                continue;
            }
            if (attr.isAlpha() || attr.isWildcard()) {
                // ---- Word branch: collect a run of alpha / wildcard characters ----
                CharClassifier.Attr attr2;
                // Exclusive end offset of the token collected so far.
                int endIndex = this.iterator.getNextIndex();
                EnumSet<Property> props = attr.isWildcard() ? EnumSet.of(Property.HAS_WILDCARD) : null;
                // A run consisting only of wildcards never becomes valid, so no token is emitted for it.
                boolean valid = !attr.isWildcard();
                // NOTE(review): isAbbr is never set to true in this method, so the
                // "if (isAbbr)" branch below is dead code as decompiled.
                boolean isAbbr = false;
                if (attr.isPunctuation()) {
                    // Punctuation is a single-character token; just read the following attribute.
                    attr2 = !this.iterator.next() ? null : this.iterator.getAttr();
                } else {
                    while (true) {
                        if (!this.iterator.next()) {
                            // End of input.
                            attr2 = null;
                            break;
                        }
                        attr2 = this.iterator.getAttr();
                        if (attr2.isPunctuation() || attr2.isSentenceEndForce()) break;
                        if (attr2.isSentenceEnd()) {
                            if (isAbbr) {
                                attr2 = CharClassifier.Attr.Alpha;
                            } else {
                                // A sentence-end character directly followed by non-whitespace ends the word here.
                                CharClassifier.Attr attr3 = this.iterator.testNext(true);
                                if (attr3 != null && !attr3.isWhitespace()) break;
                            }
                        }
                        if (!attr2.isAlpha() && !attr2.isWildcard()) break;
                        if (attr2.isWildcard()) {
                            if (props == null) {
                                props = EnumSet.of(Property.HAS_WILDCARD);
                            }
                        } else {
                            // At least one real (non-wildcard) character was seen.
                            valid = true;
                        }
                        endIndex = this.iterator.getNextIndex();
                    }
                }
                if (valid) {
                    Type type = Type.WORD;
                    if (attr.isPunctuation()) {
                        type = attr.isMathSymbol() ? Type.MATH_SYMBOL : Type.PUNCTUATION;
                    }
                    tokenHandler.addToken(type, i, endIndex, null, props);
                    if (attr2 != null && attr2.isNgram()) {
                        // The word is directly followed by n-gram text: seed the buffer
                        // with the boundary character, located one position before it.
                        this.rb.add(this.iterator.getIndex() - 1, this.boundaryChar);
                    }
                }
                // Hand the look-ahead character (or null at end of input) to the next iteration.
                stack = attr2;
            } else if (attr.isNgram()) {
                // ---- N-gram branch: feed a run of n-gram characters into the rotated buffer ----
                // Snapshot used below to detect whether this run emitted any token at all.
                int tokenCount = tokenHandler.getCount();
                // add() returns a buffer to report or null when there is nothing to emit yet
                // (presumably once enough characters were collected - confirm in RotatedCharBuffer).
                RotatedCharBuffer.Buffer ret = this.rb.add(i, this.iterator.getCodePoint());
                if (ret != null) {
                    tokenHandler.addToken(Type.NGRAM, ret.getLocation(), this.iterator.getNextIndex(), ret.getBuffer(), null);
                }
                if (!attr.isSentenceEnd()) {
                    CharClassifier.Attr attr2;
                    // From here on attr is the attribute of the character that ended the run;
                    // it stays null when the run ended because the input was exhausted.
                    attr = null;
                    // The loop stops before its body when the input ends, or on a non-forced
                    // sentence-end character whose successor is alpha or n-gram text.
                    while (this.iterator.next() && (!(attr = this.iterator.getAttr()).isSentenceEnd() || attr.isSentenceEndForce() || (attr2 = this.iterator.testNext(true)) == null || !attr2.isAlpha() && !attr2.isNgram())) {
                        CharClassifier.Attr next;
                        if (attr.isNgram()) {
                            ret = this.rb.add(this.iterator.getIndex(), this.iterator.getCodePoint());
                            if (ret != null) {
                                tokenHandler.addToken(Type.NGRAM, ret.getLocation(), this.iterator.getNextIndex(), ret.getBuffer(), null);
                            }
                            attr = null;
                            continue;
                        }
                        // A wildcard at the end of input or followed by n-gram text is skipped
                        // without ending the run.
                        if (attr.isWildcard() && ((next = this.iterator.testNext(false)) == null || next.isNgram())) {
                            attr = null;
                            continue;
                        }
                        // Any other character ends the run; push it back for the outer loop.
                        stack = attr;
                        break;
                    }
                }
                if (attr != null && (attr.isAlpha() || attr.isWildcard() || attr.isSentenceEnd())) {
                    // Run ended at a word, wildcard or sentence end: append the boundary
                    // character, then drain the buffers that are longer than one element.
                    ret = this.rb.add(i + 1, this.boundaryChar);
                    if (ret != null) {
                        tokenHandler.addToken(Type.NGRAM, ret.getLocation(), this.iterator.getNextIndex(), ret.getBuffer(), null);
                    }
                    while ((ret = this.rb.get()) != null && ret.getBuffer().length > 1) {
                        tokenHandler.addToken(Type.NGRAM, ret.getLocation(), this.iterator.getNextIndex(), ret.getBuffer(), null);
                    }
                } else if (attr == null && this.isRuntime) {
                    // Runtime mode, input exhausted: if the run produced no token,
                    // pad it with the multi-wildcard and emit it flagged HAS_WILDCARD.
                    if (tokenHandler.getCount() == tokenCount) {
                        ret = this.rb.add(i, this.multiWildcard);
                        if (ret == null) {
                            // NOTE(review): assumes get() is non-null right after add() - confirm in RotatedCharBuffer.
                            ret = this.rb.get();
                        }
                        tokenHandler.addToken(Type.NGRAM, ret.getLocation(), this.iterator.getNextIndex(), ret.getBuffer(), EnumSet.of(Property.HAS_WILDCARD));
                    }
                } else {
                    // Any other terminator: drain everything that is left in the buffer.
                    while ((ret = this.rb.get()) != null) {
                        tokenHandler.addToken(Type.NGRAM, ret.getLocation(), this.iterator.getIndex(), ret.getBuffer(), null);
                    }
                }
            }
            // ---- Sentence-end handling for the character the iterator is positioned on ----
            if (attr == null || !attr.isSentenceEnd()) continue;
            CharClassifier.Attr attr2 = this.iterator.testNext(true);
            Type t = Type.SENTENCE;
            if (!attr.isSentenceEndForce() && attr2 != null && !attr2.isWhitespace()) {
                // A non-forced sentence-end character directly followed by non-whitespace
                // is reported as plain punctuation instead of a sentence boundary.
                t = Type.PUNCTUATION;
                if (attr2.isNgram()) {
                    this.rb.add(this.iterator.getIndex(), this.boundaryChar);
                }
            }
            tokenHandler.addToken(t, this.iterator.getIndex(), this.iterator.getNextIndex(), null, null);
        }
    }

    /*
     * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
     */
    /**
     * Adapter between the tokenizer loop and the caller supplied {@link Handler}.
     *
     * <p>It clamps token ends to the document length, collapses consecutive
     * SENTENCE/NEWLINE tokens and consecutive PARAGRAPH tokens, converts covered
     * text from code points to chars and counts the tokens actually forwarded.
     */
    private static class TokenHandler {
        /** Handler that receives the filtered tokens. */
        private Handler wrapped;
        /** Number of tokens forwarded since the last {@link #reset}. */
        private int count;
        /** Document currently being tokenized. */
        private char[] doc;
        /** True while the last forwarded token was a SENTENCE or NEWLINE. */
        private boolean prevSentence = false;
        /** True while the last forwarded token was a PARAGRAPH. */
        private boolean prevParagraph = false;
        /** Scratch space for converting covered code points to UTF-16 chars. */
        private char[] buffer;
        /** Last covered-text array handed out; reused when the length matches. */
        private char[] prevCovered;

        /**
         * @param gram n-gram size; the scratch buffer is sized for {@code gram}
         *             supplementary code points (two chars each)
         */
        TokenHandler(int gram) {
            this.buffer = new char[gram * 2];
        }

        /**
         * Prepares the handler for a new document.
         *
         * @param wrappedHandler receiver of the tokens of the new document
         * @param document       characters of the new document
         */
        public void reset(Handler wrappedHandler, char[] document) {
            this.wrapped = wrappedHandler;
            this.count = 0;
            this.doc = document;
            // Duplicate suppression must not carry over from the previous document,
            // otherwise a leading SENTENCE/NEWLINE/PARAGRAPH token could be dropped.
            this.prevSentence = false;
            this.prevParagraph = false;
        }

        /**
         * Filters a token and forwards it to the wrapped handler.
         *
         * <p>A SENTENCE or NEWLINE token is dropped when the previously forwarded
         * token was also SENTENCE or NEWLINE; a PARAGRAPH token is dropped when the
         * previous one was PARAGRAPH. Dropped tokens are not counted.
         *
         * @param type       token type
         * @param start      inclusive start offset in the document
         * @param end        exclusive end offset; clamped to the document length
         * @param covered    token text as code points, or {@code null} when the token
         *                   text is the document slice itself
         * @param properties token properties, may be {@code null}
         */
        public void addToken(Type type, int start, int end, int[] covered, EnumSet<Property> properties) {
            if (end > this.doc.length) {
                end = this.doc.length;
            }
            if (type == Type.SENTENCE || type == Type.NEWLINE) {
                if (this.prevSentence) {
                    return;
                }
                this.prevSentence = true;
            } else if (type == Type.PARAGRAPH) {
                if (this.prevParagraph) {
                    return;
                }
                this.prevParagraph = true;
            } else {
                this.prevSentence = false;
                this.prevParagraph = false;
            }
            char[] coveredText = null;
            if (covered != null) {
                // Every code point needs at most two chars; grow the scratch buffer
                // instead of failing if the covered text is larger than expected.
                int capacity = covered.length * 2;
                if (this.buffer.length < capacity) {
                    this.buffer = new char[capacity];
                }
                int len = 0;
                for (int i = 0; i < covered.length; ++i) {
                    len += Character.toChars(covered[i], this.buffer, len);
                }
                // The covered text is only passed on when it differs from the
                // document slice [start, end); end is already clamped above.
                boolean identical = false;
                if (start >= 0 && end - start == len) {
                    identical = true;
                    for (int i = 0; i < len; ++i) {
                        if (this.doc[start + i] != this.buffer[i]) {
                            identical = false;
                            break;
                        }
                    }
                }
                if (!identical) {
                    // Reuse the previous array when the size fits to avoid reallocating;
                    // receivers must copy it if they keep it beyond the call.
                    if (this.prevCovered == null || this.prevCovered.length != len) {
                        this.prevCovered = new char[len];
                    }
                    coveredText = this.prevCovered;
                    System.arraycopy(this.buffer, 0, coveredText, 0, len);
                }
            }
            this.wrapped.addToken(type, start, end, coveredText, properties);
            ++this.count;
        }

        /**
         * @return number of tokens forwarded to the wrapped handler since the last reset
         */
        public int getCount() {
            return this.count;
        }
    }

    /*
     * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
     */
    /**
     * Callback that receives the tokens produced by
     * {@link NgramTokenizer#process(char[], Handler, Dictionary)}.
     */
    public interface Handler {
        /**
         * Called once for every token, in document order.
         *
         * @param type       kind of the token
         * @param start      inclusive start offset of the token in the document
         * @param end        exclusive end offset of the token in the document
         * @param covered    token text when it differs from the document slice
         *                   {@code [start, end)}, otherwise {@code null}; the tokenizer
         *                   may reuse this array for later tokens, so copy it if it
         *                   must be kept beyond the call
         * @param properties additional token flags, or {@code null} when there are none
         */
        void addToken(Type type, int start, int end, char[] covered, EnumSet<Property> properties);
    }

    /*
     * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
     */
    /** Optional flags that can be attached to a token. */
    public enum Property {
        /** The token contains, or was completed with, a wildcard character. */
        HAS_WILDCARD
    }

    /*
     * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
     */
    /** Kinds of tokens reported to a {@link Handler}. */
    public enum Type {
        /** Run of alphabetic and/or wildcard characters. */
        WORD,
        /** Token taken from the rotated n-gram buffer. */
        NGRAM,
        /** Punctuation character, or a sentence-end character directly followed by non-whitespace. */
        PUNCTUATION,
        /** Sentence-end character that terminates a sentence. */
        SENTENCE,
        /** Newline character. */
        NEWLINE,
        /** Zero-length marker emitted right after a newline. */
        PARAGRAPH,
        /** Punctuation character classified as a mathematical symbol. */
        MATH_SYMBOL
    }
}

