package org.carrot2.text.linguistic.lucene;

import java.io.IOException;
import java.io.Reader;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.ITokenizerFactory;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.ExceptionUtils;

/* loaded from: input_file:org/carrot2/text/linguistic/lucene/ChineseSimplifiedTokenizerFactory.class */
/**
 * Factory of {@link ITokenizer} instances that delegate tokenization to Lucene's
 * {@link SentenceTokenizer} / {@link WordTokenFilter} pair.
 */
public class ChineseSimplifiedTokenizerFactory implements ITokenizerFactory {

    /**
     * Adapts the Lucene sentence tokenizer and word filter chain to the {@link ITokenizer}
     * contract. {@link #reset(Reader)} must be called before {@link #nextToken()} or
     * {@link #setTermBuffer(MutableCharArray)}; until then the word filter and the term
     * attribute are {@code null}.
     */
    private static final class TokenizerAdapter implements ITokenizer {
        /** Matches the token images that are reported with the numeric type code. */
        private static final Pattern NUMERIC_PATTERN =
                Pattern.compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");

        // NOTE(review): the four codes below presumably mirror constants declared on
        // ITokenizer (end of stream, term, numeric, punctuation) -- confirm against it.

        /** Returned when the underlying stream has no more tokens. */
        private static final short TYPE_END_OF_STREAM = -1;

        /** Returned for every token that is neither a lone comma nor numeric. */
        private static final short TYPE_TERM = 1;

        /** Returned for tokens fully matched by {@link #NUMERIC_PATTERN}. */
        private static final short TYPE_NUMERIC = 2;

        /** Returned for a token consisting of a single {@code ','} character. */
        private static final short TYPE_COMMA = 3;

        /** Reusable view over the current term's characters, handed to the regex matcher. */
        private final MutableCharArray tempCharSequence = new MutableCharArray(new char[0]);

        /** Created once without input; re-targeted at a new reader on every reset. */
        private final Tokenizer sentenceTokenizer;

        /** Word-level stream over {@link #sentenceTokenizer}; rebuilt on every reset. */
        private TokenStream wordTokenFilter;

        /** Term attribute of the current {@link #wordTokenFilter}; {@code null} before reset. */
        private TermAttribute term;

        private TokenizerAdapter() {
            this.sentenceTokenizer = new SentenceTokenizer((Reader) null);
        }

        /**
         * Advances to the next token and classifies it.
         *
         * @return {@code -1} when the stream is exhausted, {@code 3} for a single comma,
         *         {@code 2} for a token matching the numeric pattern, {@code 1} otherwise
         * @throws IOException if the underlying token stream fails
         */
        @Override // org.carrot2.text.analysis.ITokenizer
        public short nextToken() throws IOException {
            final boolean advanced = this.wordTokenFilter.incrementToken();
            if (!advanced) {
                return TYPE_END_OF_STREAM;
            }

            final char[] image = this.term.termBuffer();
            final int length = this.term.termLength();
            // The view is refreshed for every token, whether or not the matcher ends up using it.
            this.tempCharSequence.reset(image, 0, length);

            if (length == 1 && image[0] == ',') {
                return TYPE_COMMA;
            }
            if (NUMERIC_PATTERN.matcher(this.tempCharSequence).matches()) {
                return TYPE_NUMERIC;
            }
            return TYPE_TERM;
        }

        /**
         * Points the given array at the characters of the current term.
         *
         * @param mutableCharArray the array to reset onto the current term buffer
         */
        @Override // org.carrot2.text.analysis.ITokenizer
        public void setTermBuffer(MutableCharArray mutableCharArray) {
            final char[] image = this.term.termBuffer();
            final int length = this.term.termLength();
            mutableCharArray.reset(image, 0, length);
        }

        /**
         * Re-targets the tokenizer at a new input and rebuilds the word filter on top of it.
         * Any exception raised along the way is rethrown through
         * {@link ExceptionUtils#wrapAsRuntimeException}.
         *
         * @param reader the new input to tokenize
         * @throws IOException declared by the interface signature
         */
        @Override // org.carrot2.text.analysis.ITokenizer
        public void reset(Reader reader) throws IOException {
            try {
                this.sentenceTokenizer.reset(reader);
                this.wordTokenFilter = new WordTokenFilter(this.sentenceTokenizer);
                this.term = this.wordTokenFilter.addAttribute(TermAttribute.class);
            } catch (Exception cause) {
                throw ExceptionUtils.wrapAsRuntimeException(cause);
            }
        }
    }

    /**
     * Creates a new, independent tokenizer adapter.
     *
     * @return a fresh tokenizer; it must be reset with a reader before use
     */
    @Override // org.carrot2.text.linguistic.ITokenizerFactory
    public ITokenizer createInstance() {
        return new TokenizerAdapter();
    }
}
