package net.clementlevallois.umigon.tokenizer.controller;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Pattern;
import net.clementlevallois.umigon.model.Emoji;
import net.clementlevallois.umigon.model.NonWord;
import net.clementlevallois.umigon.model.PatternOfInterest;
import net.clementlevallois.umigon.model.Punctuation;
import net.clementlevallois.umigon.model.Term;
import net.clementlevallois.umigon.model.TextFragment;
import net.clementlevallois.umigon.model.WhiteSpace;
import net.clementlevallois.utils.RepeatedCharactersRemover;
import net.clementlevallois.utils.TextCleaningOps;
import net.fellbaum.jemoji.EmojiManager;

/* loaded from: input_file:net/clementlevallois/umigon/tokenizer/controller/UmigonTokenizer.class */
/**
 * Splits a text into a flat list of {@link TextFragment}s: white space runs, terms,
 * punctuation signs, emojis and "non words" (fragments recognised by the
 * {@link PatternOfInterestChecker}).
 *
 * <p>The text is processed code point by code point with a small state machine whose
 * state is the kind of fragment currently being accumulated.
 *
 * <p>NOTE(review): this class was recovered from decompiled bytecode; the look-ahead loop
 * in particular was reconstructed because the decompiler output never terminated.
 * Confirm against the upstream source if available.
 */
public class UmigonTokenizer {
    /** True once the patterns of interest have been loaded successfully. */
    static boolean initialized = false;
    private static PatternOfInterestChecker poiChecker;

    /** Matches a string made of exactly one punctuation character (ASCII or Unicode). */
    private static final Pattern PUNCTUATION_PATTERN = Pattern.compile("[\\p{Punct}\\p{IsPunctuation}]");

    /** Punctuation signs that always end the current term, even in the middle of a word. */
    private static final Set<String> TERM_BREAKING_SIGNS =
            Set.of("-", "‐", "‑", "‒", "–", "—", "︱", "﹘", "﹣", "－", "_", "\\", "/", "|");

    /** Kind of fragment the tokenizer is currently accumulating. */
    private enum CurrentFragment {
        CURR_FRAGMENT_IS_WHITE_SPACE,
        CURR_FRAGMENT_IS_PUNCTUATION,
        CURR_FRAGMENT_IS_NON_WORD,
        CURR_FRAGMENT_IS_TERM,
        CURR_FRAGMENT_IS_NOT_STARTED
    }

    /**
     * Eagerly (re)loads the patterns of interest. Failures are reported on standard output
     * and leave {@code initialized} unchanged, so {@link #tokenize} will retry the load.
     */
    public static void initialize() {
        try {
            loadPoiChecker();
        } catch (IOException e) {
            System.out.println("error in loading poi checker: " + e.getMessage());
        }
    }

    /** Builds a fresh checker and publishes it only once its patterns are loaded. */
    private static void loadPoiChecker() throws IOException {
        PatternOfInterestChecker checker = new PatternOfInterestChecker();
        checker.loadPatternsOfInterest();
        poiChecker = checker;
        initialized = true;
    }

    /**
     * Tokenizes {@code str} into text fragments.
     *
     * @param str the text to tokenize; {@code null} yields an empty list
     * @param set forwarded to {@code RepeatedCharactersRemover.repeatedCharacters} when
     *            cleaning terms (presumably a lexicon of known words -- TODO confirm);
     *            {@code null} is treated as an empty set
     * @return the fragments in order of appearance; never {@code null}
     * @throws IOException if the patterns of interest have not been loaded yet and loading fails
     */
    public static List<TextFragment> tokenize(String str, Set<String> set) throws IOException {
        if (!initialized) {
            // Sets the flag on success so that the patterns are not reloaded on every call.
            loadPoiChecker();
        }
        List<TextFragment> fragments = new ArrayList<>();
        if (str == null) {
            return fragments;
        }
        Set<String> effectiveSet = (set == null) ? new HashSet<>() : set;

        // True while a white space / term / punctuation / non-word fragment is being accumulated.
        boolean fragmentOpen = false;
        CurrentFragment state = CurrentFragment.CURR_FRAGMENT_IS_NOT_STARTED;
        WhiteSpace whiteSpace = null;
        Term term = null;
        Punctuation punctuation = null;
        NonWord nonWord = null;

        int[] codePoints = str.codePoints().toArray();
        for (int i = 0; i < codePoints.length; i++) {
            String ch = Character.toString(codePoints[i]);
            // Only relevant while inside a term: a punctuation sign is kept inside the term
            // when the run of characters up to the next white space ends with a letter.
            boolean chunkEndsWithLetter = state == CurrentFragment.CURR_FRAGMENT_IS_TERM
                    && chunkEndsWithLetter(codePoints, i + 1);
            boolean isPunctuation = PUNCTUATION_PATTERN.matcher(ch).matches();
            boolean isEmoji = EmojiManager.isEmoji(ch);
            boolean isBlank = ch.isBlank();

            // Step 1: extend or close the fragment in progress.
            switch (state) {
                case CURR_FRAGMENT_IS_WHITE_SPACE:
                    if (isBlank) {
                        whiteSpace.addStringToOriginalForm(ch);
                        if (ch.equals("\n")) {
                            whiteSpace.setSentenceOrLineBreak(true);
                        }
                    } else {
                        fragments.add(whiteSpace);
                        fragmentOpen = false;
                    }
                    break;

                case CURR_FRAGMENT_IS_TERM:
                    if (isBlank) {
                        setCleanedForms(term, effectiveSet);
                        PatternOfInterest termPoi = poiChecker.returnsMatchOrNot(term.getCleanedAndStrippedForm());
                        if (termPoi.getMatched()) {
                            // The whole term is a pattern of interest: emit it as a non-word instead.
                            nonWord = new NonWord();
                            nonWord.setIndexCardinal(term.getIndexCardinal());
                            nonWord.setIndexOrdinal(term.getIndexOrdinal());
                            nonWord.setOriginalForm(term.getOriginalForm());
                            nonWord.setTypeOfTextFragmentEnum(termPoi.getTypeOfTextFragmentEnum());
                            nonWord.setPoi(termPoi);
                            fragments.add(nonWord);
                        } else {
                            fragments.add(term);
                        }
                        fragmentOpen = false;
                    } else if (isEmoji
                            || (isPunctuation && (!chunkEndsWithLetter || TERM_BREAKING_SIGNS.contains(ch)))) {
                        // An emoji, or a punctuation sign that is not embedded in a word, ends the term.
                        setCleanedForms(term, effectiveSet);
                        fragments.add(term);
                        fragmentOpen = false;
                    } else {
                        term.addStringToOriginalForm(ch);
                    }
                    break;

                case CURR_FRAGMENT_IS_NON_WORD:
                    if (!isBlank && !isEmoji) {
                        String extended = nonWord.getOriginalForm() + ch;
                        PatternOfInterest extendedPoi = poiChecker.returnsMatchOrNot(extended);
                        if (extendedPoi.getMatched()) {
                            nonWord.setOriginalForm(extended);
                            nonWord.setPoi(extendedPoi);
                        } else {
                            fragments.add(nonWord);
                            fragmentOpen = false;
                        }
                    } else {
                        fragments.add(nonWord);
                        fragmentOpen = false;
                    }
                    break;

                case CURR_FRAGMENT_IS_PUNCTUATION:
                    PatternOfInterest punctuationPoi = poiChecker.returnsMatchOrNot(punctuation.getOriginalForm());
                    if (punctuationPoi.getMatched()) {
                        nonWord = punctuation.toNonWord(punctuationPoi, punctuation.getOriginalForm());
                        fragments.add(nonWord);
                        fragmentOpen = false;
                    } else if (isPunctuation) {
                        String extended = punctuation.getOriginalForm() + ch;
                        if (extended.codePointCount(0, extended.length()) > 1) {
                            PatternOfInterest extendedPoi = poiChecker.returnsMatchOrNot(extended);
                            if (extendedPoi.getMatched()) {
                                // The punctuation run has become a pattern of interest: keep
                                // accumulating it as a non-word.
                                nonWord = punctuation.toNonWord(extendedPoi, extended);
                                state = CurrentFragment.CURR_FRAGMENT_IS_NON_WORD;
                            } else {
                                punctuation.addStringToOriginalForm(ch);
                            }
                        }
                    } else {
                        addSplitPunctuation(punctuation, fragments);
                        fragmentOpen = false;
                    }
                    break;

                default:
                    break;
            }

            // Step 2: when no fragment is in progress, the current code point starts a new one.
            if (!fragmentOpen) {
                if (isBlank) {
                    fragmentOpen = true;
                    whiteSpace = new WhiteSpace();
                    whiteSpace.setIndexCardinal(i);
                    whiteSpace.setIndexOrdinal(fragments.size());
                    state = CurrentFragment.CURR_FRAGMENT_IS_WHITE_SPACE;
                    whiteSpace.addStringToOriginalForm(ch);
                    if (ch.equals("\n")) {
                        whiteSpace.setSentenceOrLineBreak(true);
                    }
                } else if (!isEmoji && !isPunctuation) {
                    fragmentOpen = true;
                    term = new Term();
                    state = CurrentFragment.CURR_FRAGMENT_IS_TERM;
                    term.setIndexCardinal(i);
                    term.setIndexOrdinal(fragments.size());
                    term.addStringToOriginalForm(ch);
                } else if (isPunctuation) {
                    fragmentOpen = true;
                    punctuation = new Punctuation();
                    state = CurrentFragment.CURR_FRAGMENT_IS_PUNCTUATION;
                    punctuation.setIndexCardinal(i);
                    punctuation.setIndexOrdinal(fragments.size());
                    punctuation.addStringToOriginalForm(ch);
                } else {
                    // Only emojis reach this point; they are single-code-point fragments
                    // emitted immediately.
                    fragments.add(buildEmoji(ch, i, fragments.size()));
                    state = CurrentFragment.CURR_FRAGMENT_IS_NOT_STARTED;
                }
            }
        }

        // Step 3: flush the fragment still in progress at the end of the text.
        switch (state) {
            case CURR_FRAGMENT_IS_WHITE_SPACE:
                fragments.add(whiteSpace);
                break;
            case CURR_FRAGMENT_IS_NON_WORD:
                fragments.add(nonWord);
                break;
            case CURR_FRAGMENT_IS_TERM:
                setCleanedForms(term, effectiveSet);
                fragments.add(term);
                break;
            case CURR_FRAGMENT_IS_PUNCTUATION:
                PatternOfInterest lastPoi = poiChecker.returnsMatchOrNot(punctuation.getOriginalForm());
                if (lastPoi.getMatched()) {
                    nonWord = new NonWord();
                    nonWord.setIndexCardinal(punctuation.getIndexCardinal());
                    nonWord.setIndexOrdinal(punctuation.getIndexOrdinal());
                    nonWord.setOriginalForm(punctuation.getOriginalForm());
                    nonWord.setTypeOfTextFragmentEnum(lastPoi.getTypeOfTextFragmentEnum());
                    nonWord.setPoi(lastPoi);
                    fragments.add(nonWord);
                } else {
                    addSplitPunctuation(punctuation, fragments);
                }
                break;
            default:
                break;
        }
        return fragments;
    }

    /**
     * Scans forward from {@code from} to the first white space and reports whether the code
     * point right before it is a letter. Returns {@code false} when no white space follows.
     * {@code from} must be at least 1.
     */
    private static boolean chunkEndsWithLetter(int[] codePoints, int from) {
        for (int j = from; j < codePoints.length; j++) {
            if (Character.isWhitespace(codePoints[j])) {
                return Character.isLetter(codePoints[j - 1]);
            }
        }
        return false;
    }

    /** Computes and stores the cleaned and the cleaned-and-ASCII-flattened forms of a term. */
    private static void setCleanedForms(Term term, Set<String> set) {
        String cleaned = RepeatedCharactersRemover.repeatedCharacters(term.getOriginalForm(), set);
        term.setCleanedForm(cleaned);
        term.setCleanedAndStrippedForm(TextCleaningOps.flattenToAscii(cleaned));
    }

    /**
     * Emits one {@link Punctuation} fragment per code point of an accumulated punctuation run.
     * Each sign gets its own position in the text (start of the run plus its offset),
     * consistent with the other fragments which record the index of their first code point.
     */
    private static void addSplitPunctuation(Punctuation run, List<TextFragment> fragments) {
        int start = run.getIndexCardinal();
        int[] signs = run.getOriginalForm().codePoints().toArray();
        for (int offset = 0; offset < signs.length; offset++) {
            Punctuation single = new Punctuation();
            single.setIndexCardinal(start + offset);
            single.setIndexOrdinal(fragments.size());
            single.addStringToOriginalForm(Character.toString(signs[offset]));
            fragments.add(single);
        }
    }

    /** Builds an emoji fragment, using its first GitHub alias (or else its first alias) as semi-colon form. */
    private static Emoji buildEmoji(String ch, int indexCardinal, int indexOrdinal) {
        Emoji emoji = new Emoji();
        emoji.setIndexCardinal(indexCardinal);
        emoji.setIndexOrdinal(indexOrdinal);
        emoji.addStringToOriginalForm(ch);
        Optional<net.fellbaum.jemoji.Emoji> match = EmojiManager.getEmoji(ch);
        if (match.isPresent()) {
            List<String> aliases = match.get().getGithubAliases();
            if (aliases.isEmpty()) {
                aliases = match.get().getAllAliases();
            }
            if (!aliases.isEmpty()) {
                emoji.setSemiColonForm(aliases.get(0));
            }
        }
        return emoji;
    }
}
