package com.code972.hebmorph;

import com.code972.hebmorph.datastructures.DictRadix;
import com.code972.hebmorph.hspell.HSpellLoader;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;

/* loaded from: input_file:com/code972/hebmorph/Tokenizer.class */
/**
 * Streaming tokenizer that splits the characters of a {@link Reader} into tokens and
 * classifies each token with a bit mask of {@link TokenType} flags.
 *
 * <p>Besides plain letter/digit runs the tokenizer recognizes Geresh and Gershayim marks
 * inside Hebrew words, Makaf-terminated words, an optional "exact match" suffix character,
 * and user-registered special cases (strings that would otherwise be split, optionally
 * preceded by a legal Hebrew prefix).
 *
 * <p>Instances hold mutable buffers and offsets and perform no synchronization.
 */
public class Tokenizer {
    /** Number of characters requested from the reader per read. */
    private static final int IO_BUFFER_SIZE = 4096;

    /** Maximum length of a special tokenization rule (and of the scratch buffer used to test one). */
    static final int TOKENIZATION_EXCEPTION_MAX_LENGTH = 25;

    /** Apostrophe-like characters treated as Geresh (ASCII {@code '} first). */
    public static final char[] Geresh = {'\'', 1523, 8216, 8217, 8219, 65287};

    /** Quote-like characters treated as Gershayim (ASCII {@code "} first). */
    public static final char[] Gershayim = {'"', 1524, 8220, 8221, 8223, 10078, 65282};

    /** Dash-like characters treated as Makaf (ASCII {@code -} first). */
    public static final char[] Makaf = {'-', 8210, 8211, 8212, 8213, 1470};

    /** Union of {@link #Geresh}, {@link #Gershayim} and {@link #Makaf}. */
    public static final char[] CharsFollowingPrefixes =
            HebrewUtils.concatenateCharArrays(new char[][]{Geresh, Gershayim, Makaf});

    /**
     * Letters after which a trailing Geresh is kept on a Hebrew token
     * (U+05D6, U+05D2, U+05E5, U+05E6, U+05D7).
     */
    public static final char[] LettersAcceptingGeresh = {1494, 1490, 1509, 1510, 1495};

    /** Placeholder value stored for every special case; only key presence is used by this class. */
    private static final Byte dummyData = (byte) 0;

    /**
     * Bit flags returned by {@link Tokenizer#nextToken(Reference)}.
     *
     * <p>NOTE(review): the fields are mutable public statics; they are left that way to keep the
     * existing interface untouched, but nothing in this class writes to them.
     */
    public static class TokenType {
        /** Token started with a (non-final-form) Hebrew letter. */
        public static int Hebrew = 1;
        /** Token started with a non-Hebrew letter or a digit. */
        public static int NonHebrew = 2;
        /** Token started with a digit. */
        public static int Numeric = 4;
        /** A token typed exactly {@link #Hebrew} received a non-Hebrew letter or digit. */
        public static int Mixed = 8;
        /** Token was terminated by a {@link Tokenizer#Makaf} character. */
        public static int Construct = 16;
        /** Token contains Gershayim (or a doubled Geresh normalized to it). */
        public static int Acronym = 32;
        /** Token was terminated by the configured exact-match suffix. */
        public static int Exact = 64;
        /** Token matched (so far) a registered special case. */
        public static int Custom = HSpellLoader.DMask.D_DOUBLE;
    }

    private Reader input;
    /** Number of valid characters currently in {@link #ioBuffer}. */
    private int dataLen;
    /** Stream offset of {@code ioBuffer[0]}. */
    private int inputOffset;
    private int tokenOffset;
    private int tokenLengthInSource;
    private Character suffixForExactMatch;
    private final HashMap<String, Integer> hebrewPrefixes;
    private final DictRadix<Byte> specialCases;
    private final char[] ioBuffer;
    /** Index of the next unread character in {@link #ioBuffer}. */
    private int ioBufferIndex;
    private final char[] wordBuffer;
    private byte currentTokenLength;
    private int tokenType;
    /** Scratch space for assembling "token so far + candidate char" before a special-case lookup. */
    private final char[] tokenizationExceptionBuffer;

    /**
     * Creates a tokenizer with an empty set of special cases.
     *
     * @param reader  source of characters
     * @param hashMap legal Hebrew prefixes (only the key set is consulted here)
     */
    public Tokenizer(Reader reader, HashMap<String, Integer> hashMap) {
        this(reader, hashMap, null);
    }

    /**
     * Creates a tokenizer.
     *
     * @param reader    source of characters
     * @param hashMap   legal Hebrew prefixes (only the key set is consulted here)
     * @param dictRadix special tokenization cases; when {@code null} a fresh, empty radix is used
     */
    public Tokenizer(Reader reader, HashMap<String, Integer> hashMap, DictRadix<Byte> dictRadix) {
        this.dataLen = 0;
        this.inputOffset = 0;
        this.tokenOffset = 0;
        this.tokenLengthInSource = 0;
        this.suffixForExactMatch = null;
        this.ioBuffer = new char[IO_BUFFER_SIZE];
        this.ioBufferIndex = 0;
        this.wordBuffer = new char[DictionaryLoader.MaxWordLength];
        this.currentTokenLength = (byte) 0;
        this.tokenType = 0;
        this.tokenizationExceptionBuffer = new char[TOKENIZATION_EXCEPTION_MAX_LENGTH];
        this.input = reader;
        this.specialCases = dictRadix != null ? dictRadix : new DictRadix<>(false);
        this.hebrewPrefixes = hashMap;
    }

    /**
     * @return offset, in characters since the reader was attached, of the first character of the
     *         most recent token; at end of input, the total number of characters consumed
     */
    public final int getOffset() {
        return this.tokenOffset;
    }

    /** @return number of source characters covered by the most recent token (never negative) */
    public int getLengthInSource() {
        return this.tokenLengthInSource;
    }

    /** @return the character that marks a token for exact matching, or {@code null} if none is set */
    public Character getSuffixForExactMatch() {
        return this.suffixForExactMatch;
    }

    /**
     * Sets the character that, when it terminates a token, flags the token as
     * {@link TokenType#Exact}.
     *
     * @param ch the suffix character, or {@code null} to disable the feature
     */
    public void setSuffixForExactMatch(Character ch) {
        this.suffixForExactMatch = ch;
    }

    /**
     * Registers a string that must be kept together as a single token.
     *
     * @param str the special case; at most {@value #TOKENIZATION_EXCEPTION_MAX_LENGTH} characters
     *            and free of space characters
     * @throws IllegalArgumentException if {@code str} is too long or contains a space
     */
    public void addSpecialCase(String str) {
        if (str.length() > TOKENIZATION_EXCEPTION_MAX_LENGTH) {
            throw new IllegalArgumentException("Special tokenization rule must be at most 25 in length");
        }
        if (str.contains(" ")) {
            throw new IllegalArgumentException("Special tokenization rule cannot contain spaces");
        }
        // The radix stores Byte values; only the key matters, so a shared placeholder is stored.
        this.specialCases.addNode(str, dummyData);
    }

    /** Removes every registered special case. */
    public void clearSpecialCases() {
        this.specialCases.clear();
    }

    /**
     * @param str     candidate prefix
     * @param hashMap legal prefixes
     * @return {@code true} if {@code str} is a key of {@code hashMap}
     */
    public static boolean isLegalPrefix(String str, HashMap<String, Integer> hashMap) {
        return hashMap.containsKey(str);
    }

    /**
     * @param cArr    buffer holding the candidate prefix at its start
     * @param i       number of leading characters of {@code cArr} that form the candidate
     * @param hashMap legal prefixes
     * @return {@code true} if the first {@code i} characters of {@code cArr} are a key of {@code hashMap}
     */
    public static boolean isLegalPrefix(char[] cArr, int i, HashMap<String, Integer> hashMap) {
        return hashMap.containsKey(new String(cArr, 0, i));
    }

    /**
     * Tests whether {@code prefix[0..length)} followed by {@code next} is (the beginning of) a
     * registered special case. Candidates that would not fit the scratch buffer are rejected.
     */
    private boolean isRecognizedException(char[] prefix, byte length, char next) {
        if (length >= TOKENIZATION_EXCEPTION_MAX_LENGTH) {
            return false;
        }
        System.arraycopy(prefix, 0, this.tokenizationExceptionBuffer, 0, length);
        this.tokenizationExceptionBuffer[length] = next;
        return isRecognizedException(this.tokenizationExceptionBuffer, length + 1, (byte) (length + 1));
    }

    /** Tests whether the single character {@code c} is (the beginning of) a registered special case. */
    private boolean isRecognizedException(char c) {
        this.tokenizationExceptionBuffer[0] = c;
        return isRecognizedException(this.tokenizationExceptionBuffer, 1, (byte) 1);
    }

    /** Non-exact variant of {@link #isRecognizedException(char[], int, byte, boolean)}. */
    private boolean isRecognizedException(char[] token, int scanLimit, byte length) {
        return isRecognizedException(token, scanLimit, length, false);
    }

    /**
     * Looks {@code token} up in the special cases after skipping a leading Hebrew prefix.
     *
     * <p>The prefix is the run of leading Hebrew letters (scanned up to {@code scanLimit}) for
     * which every successive leading substring is a legal prefix; if any of them is not legal,
     * nothing is skipped.
     *
     * @param token     characters to test
     * @param scanLimit upper bound for the prefix scan
     * @param length    number of meaningful characters in {@code token}
     * @param exact     negated and forwarded as the last lookup argument
     *                  (presumably "allow partial match" — TODO confirm against DictRadix)
     * @return {@code true} if the lookup succeeded
     */
    private boolean isRecognizedException(char[] token, int scanLimit, byte length, boolean exact) {
        int prefixLength = 0;
        while (prefixLength < scanLimit && HebrewUtils.isHebrewLetter(token[prefixLength])) {
            if (!isLegalPrefix(token, prefixLength + 1, this.hebrewPrefixes)) {
                prefixLength = 0;
                break;
            }
            prefixLength++;
        }
        try {
            this.specialCases.lookup(token, prefixLength, length - prefixLength, prefixLength, !exact);
            return true;
        } catch (IllegalArgumentException ignored) {
            // The radix reports "no such key" by throwing; that is the expected miss path here.
            return false;
        }
    }

    /**
     * Reads the next token from the input.
     *
     * @param reference receives the token text in its {@code ref} field; set to {@code ""} when
     *                  the input is exhausted
     * @return bit mask of {@link TokenType} flags describing the token, or {@code 0} when the
     *         input is exhausted
     * @throws IOException if the underlying reader fails
     */
    public int nextToken(Reference<String> reference) throws IOException {
        this.currentTokenLength = (byte) 0;
        this.tokenOffset = 0;
        this.tokenType = 0;
        boolean avoidTryingCustom = false;
        boolean endOfInput = false;
        char c = 0;
        while (true) {
            if (this.ioBufferIndex >= this.dataLen) {
                this.inputOffset += this.dataLen;
                this.dataLen = this.input.read(this.ioBuffer, 0, this.ioBuffer.length);
                if (this.dataLen <= 0) {
                    // Keep dataLen at 0 so a later "inputOffset += dataLen" cannot move backwards.
                    this.dataLen = 0;
                    if ((this.tokenType & TokenType.Custom) > 0 && this.currentTokenLength > 0
                            && !isRecognizedException(this.wordBuffer, this.wordBuffer.length, this.currentTokenLength, true)) {
                        abortCustomToken();
                    }
                    if (this.currentTokenLength == 0) {
                        reference.ref = "";
                        this.tokenLengthInSource = 0;
                        this.tokenOffset = this.inputOffset;
                        return 0;
                    }
                    // Emit what was collected. Reading on would consume stale buffer content
                    // (or index past the buffer when the previous read filled it completely).
                    endOfInput = true;
                    break;
                }
                this.ioBufferIndex = 0;
            }

            c = HebrewCharacters.collapseAlternate(this.ioBuffer[this.ioBufferIndex++]);
            boolean appendCurrentChar = false;

            if (this.currentTokenLength == 0) {
                // First character of a token: decide what kind of token starts here, if any.
                if (HebrewUtils.isHebrewLetter(c)) {
                    // A final-form letter cannot start a token; it is skipped.
                    if (!HebrewUtils.isFinalHebrewLetter(c)) {
                        this.tokenType |= TokenType.Hebrew;
                        appendCurrentChar = true;
                    }
                } else if (Character.isLetterOrDigit(c)) {
                    this.tokenType |= TokenType.NonHebrew;
                    if (Character.isDigit(c)) {
                        this.tokenType |= TokenType.Numeric;
                    }
                    appendCurrentChar = true;
                } else if (!avoidTryingCustom && !Character.isWhitespace(c) && isRecognizedException(c)) {
                    this.tokenType |= TokenType.Custom;
                    appendCurrentChar = true;
                }
            } else if (!avoidTryingCustom && (this.tokenType & TokenType.Custom) > 0 && !Character.isSpaceChar(c)) {
                // Inside a special-case candidate: only extend it while it still matches.
                this.wordBuffer[this.currentTokenLength] = c;
                if (isRecognizedException(this.wordBuffer, this.wordBuffer.length, (byte) (this.currentTokenLength + 1))) {
                    appendCurrentChar = true;
                } else {
                    if (!Character.isLetterOrDigit(c)) {
                        break;
                    }
                    // The candidate failed on a letter/digit: fall back to regular tokenization.
                    this.tokenType &= ~TokenType.Custom;
                    avoidTryingCustom = true;
                    this.ioBufferIndex--;
                    if (this.ioBufferIndex >= this.currentTokenLength) {
                        // The whole candidate is still in the buffer: rewind and re-scan it.
                        this.ioBufferIndex -= this.currentTokenLength;
                        this.currentTokenLength = (byte) 0;
                    } else {
                        // Cannot rewind across a buffer refill: salvage the candidate in place.
                        abortCustomToken();
                    }
                }
            } else if (HebrewUtils.isHebrewLetter(c) || HebrewUtils.isNiqqudChar(c)) {
                appendCurrentChar = true;
            } else if (Character.isLetterOrDigit(c)) {
                if (this.tokenType == TokenType.Hebrew) {
                    this.tokenType |= TokenType.Mixed;
                }
                appendCurrentChar = true;
            } else if (HebrewUtils.isOfChars(c, Gershayim)) {
                c = '"';
                char previous = this.wordBuffer[this.currentTokenLength - 1];
                // Gershayim is only part of a token when it follows a Hebrew letter or niqqud.
                if (!HebrewUtils.isHebrewLetter(previous) && !HebrewUtils.isNiqqudChar(previous)) {
                    break;
                }
                this.tokenType |= TokenType.Acronym;
                appendCurrentChar = true;
            } else if (HebrewUtils.isOfChars(c, Geresh)) {
                c = '\'';
                char previous = this.wordBuffer[this.currentTokenLength - 1];
                if ((this.tokenType & TokenType.Hebrew) > 0
                        && !HebrewUtils.isHebrewLetter(previous)
                        && !HebrewUtils.isNiqqudChar(previous)
                        && !HebrewUtils.isOfChars(previous, Geresh)) {
                    break;
                }
                appendCurrentChar = true;
            } else {
                // Any other character ends the token unless it continues a special case.
                if (avoidTryingCustom || isSuffixForExactMatch(c) || Character.isSpaceChar(c)
                        || !isRecognizedException(this.wordBuffer, this.currentTokenLength, c)) {
                    break;
                }
                this.tokenType |= TokenType.Custom;
                appendCurrentChar = true;
            }

            if (appendCurrentChar) {
                if (this.currentTokenLength == 0) {
                    // Mark where the new token starts in the stream.
                    this.tokenOffset = (this.inputOffset + this.ioBufferIndex) - 1;
                } else if (this.currentTokenLength == this.wordBuffer.length - 1) {
                    // Clip over-long tokens: keep consuming input but stop storing characters,
                    // otherwise the next append would run past the end of wordBuffer.
                    continue;
                }
                // A doubled Geresh is a common stand-in for Gershayim; normalize it. The length
                // guard covers a special case whose very first character is a Geresh.
                boolean doubledGeresh = this.currentTokenLength > 0
                        && HebrewUtils.isOfChars(c, Geresh)
                        && this.wordBuffer[this.currentTokenLength - 1] == c;
                if (doubledGeresh) {
                    this.wordBuffer[this.currentTokenLength - 1] = '"';
                    this.tokenType |= TokenType.Acronym;
                } else {
                    this.wordBuffer[this.currentTokenLength++] = c;
                }
            }
        }

        if (!endOfInput) {
            // Classify the token by the character that terminated it.
            if (HebrewUtils.isOfChars(c, Makaf)) {
                this.tokenType |= TokenType.Construct;
            } else if (isSuffixForExactMatch(c)) {
                this.tokenType |= TokenType.Exact;
            }
        }

        if (this.dataLen <= 0) {
            this.tokenLengthInSource = Math.max(this.inputOffset - this.tokenOffset, 0);
        } else {
            this.tokenLengthInSource = Math.max(((this.inputOffset + this.ioBufferIndex) - 1) - this.tokenOffset, 0);
        }

        // Drop a trailing Gershayim: it was closing a quotation, not marking an acronym.
        if (HebrewUtils.isOfChars(this.wordBuffer[this.currentTokenLength - 1], Gershayim)) {
            this.wordBuffer[--this.currentTokenLength] = 0;
            this.tokenLengthInSource = Math.max(this.tokenLengthInSource - 1, 0);
        }

        // Drop a trailing Geresh unless this is a Hebrew token ending in a letter that accepts one.
        if (this.currentTokenLength > 2
                && this.wordBuffer[this.currentTokenLength - 1] == '\''
                && ((this.tokenType & TokenType.Hebrew) == 0
                    || !HebrewUtils.isOfChars(this.wordBuffer[this.currentTokenLength - 2], LettersAcceptingGeresh))) {
            this.wordBuffer[--this.currentTokenLength] = 0;
            this.tokenLengthInSource = Math.max(this.tokenLengthInSource - 1, 0);
        }

        reference.ref = new String(this.wordBuffer, 0, (int) this.currentTokenLength);
        return this.tokenType;
    }

    /**
     * Re-tokenizes, in place, the characters collected for a special-case candidate that turned
     * out not to be one. Leading characters that cannot start a token are skipped; the following
     * run of letters, digits, niqqud, Geresh and Gershayim is compacted to the start of
     * {@link #wordBuffer} and the token type flags are updated as each character is kept.
     * Scanning stops at the first character that cannot be part of a regular token.
     */
    private void abortCustomToken() {
        int skipped = 0;
        int kept = 0;
        boolean started = false;
        while (kept + skipped < this.currentTokenLength) {
            // Until the token has started, kept == 0, so wordBuffer[skipped] is the current char.
            char candidate = this.wordBuffer[skipped];
            if (!started
                    && !HebrewUtils.isHebrewLetter(candidate)
                    && !HebrewUtils.isNiqqudChar(candidate)
                    && !Character.isLetterOrDigit(candidate)) {
                skipped++;
                continue;
            }
            started = true;

            char c = this.wordBuffer[kept + skipped];
            if (HebrewUtils.isHebrewLetter(c) || HebrewUtils.isNiqqudChar(c)) {
                this.tokenType |= TokenType.Hebrew;
            } else if (Character.isLetterOrDigit(c)) {
                if (this.tokenType == TokenType.Hebrew) {
                    this.tokenType |= TokenType.Mixed;
                } else {
                    this.tokenType |= TokenType.NonHebrew;
                }
            } else if (HebrewUtils.isOfChars(c, Gershayim)) {
                c = '"';
                this.tokenType |= TokenType.Acronym;
            } else if (HebrewUtils.isOfChars(c, Geresh)) {
                c = '\'';
            } else {
                break;
            }
            this.wordBuffer[kept] = c;
            kept++;
        }
        this.currentTokenLength = (byte) kept;
    }

    /** @return {@code true} if an exact-match suffix is configured and equals {@code c} */
    private boolean isSuffixForExactMatch(char c) {
        return this.suffixForExactMatch != null && c == this.suffixForExactMatch.charValue();
    }

    /**
     * Attaches a new reader and clears all position and token state. The exact-match suffix,
     * the prefixes and the special cases are kept.
     *
     * @param reader the new source of characters
     */
    public final void reset(Reader reader) {
        this.input = reader;
        this.inputOffset = 0;
        this.dataLen = 0;
        this.ioBufferIndex = 0;
        this.tokenOffset = 0;
        this.tokenLengthInSource = 0;
        this.currentTokenLength = (byte) 0;
        this.tokenType = 0;
    }
}
