package org.wikibrain.lucene.tokenizers;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
import org.wikibrain.core.lang.Language;
import org.wikibrain.lucene.LuceneOptions;
import org.wikibrain.lucene.TokenizerOptions;

/* loaded from: input_file:org/wikibrain/lucene/tokenizers/LanguageTokenizer.class */
/**
 * Base class for the per-language tokenizers in this package, plus a static
 * factory that maps a {@link Language} to its concrete tokenizer class.
 *
 * <p>Each instance captures the Lucene match version, the three boolean
 * tokenizer options (case-insensitivity, stop words, stemming) and the language
 * it was created for. Subclasses supply the actual filter chain through
 * {@link #getTokenStream(Tokenizer, CharArraySet)}.
 *
 * <p>The factory instantiates registered subclasses reflectively, so every
 * registered subclass must declare a
 * {@code (Version, TokenizerOptions, Language)} constructor.
 */
public abstract class LanguageTokenizer {
    /**
     * Directory that holds the {@code <langCode>.txt} stop-word files. The path
     * is relative, so it is resolved against the JVM's working directory.
     */
    private static final String STOP_WORDS = "src/main/resources/stopwords/";

    /**
     * Registry of language -> tokenizer implementation. It is populated once by
     * the static initializer below and only read afterwards.
     */
    private static final Map<Language, Class<? extends LanguageTokenizer>> tokenizerClasses =
            new HashMap<Language, Class<? extends LanguageTokenizer>>();

    static {
        register("en", EnglishTokenizer.class);
        register("de", GermanTokenizer.class);
        register("fr", FrenchTokenizer.class);
        register("nl", DutchTokenizer.class);
        register("it", ItalianTokenizer.class);
        register("pl", PolishTokenizer.class);
        register("es", SpanishTokenizer.class);
        register("ru", RussianTokenizer.class);
        register("ja", JapaneseTokenizer.class);
        register("pt", PortugueseTokenizer.class);
        register("zh", ChineseTokenizer.class);
        register("sv", SwedishTokenizer.class);
        register("uk", UkrainianTokenizer.class);
        register("ca", CatalanTokenizer.class);
        register("no", NorwegianTokenizer.class);
        register("fi", FinnishTokenizer.class);
        register("cs", CzechTokenizer.class);
        register("hu", HungarianTokenizer.class);
        register("ko", KoreanTokenizer.class);
        register("id", IndonesianTokenizer.class);
        register("tr", TurkishTokenizer.class);
        register("ro", RomanianTokenizer.class);
        register("sk", SlovakTokenizer.class);
        register("da", DanishTokenizer.class);
        register("he", HebrewTokenizer.class);
        register("lad", LadinoTokenizer.class);
        register("ar", ArabicTokenizer.class);
        register("bg", BulgarianTokenizer.class);
        register("el", GreekTokenizer.class);
        register("eu", BasqueTokenizer.class);
        register("ga", IrishTokenizer.class);
        register("gl", GalicianTokenizer.class);
        register("hi", HindiTokenizer.class);
        register("hy", ArmenianTokenizer.class);
        register("lv", LatvianTokenizer.class);
    }

    /** Lucene compatibility version handed to the tokenizers created by this instance. */
    protected final Version matchVersion;
    /** Copied from {@link TokenizerOptions#isCaseInsensitive()} at construction time. */
    protected final boolean caseInsensitive;
    /** Copied from {@link TokenizerOptions#doesUseStopWords()} at construction time. */
    protected final boolean useStopWords;
    /** Copied from {@link TokenizerOptions#doesUseStem()} at construction time. */
    protected final boolean useStem;
    /** Language this tokenizer was created for. */
    protected final Language language;

    /**
     * Captures the match version, the option flags and the language.
     *
     * @param version          Lucene match version
     * @param tokenizerOptions source of the three boolean option flags; must not be null
     * @param language         language this tokenizer handles
     */
    public LanguageTokenizer(Version version, TokenizerOptions tokenizerOptions, Language language) {
        this.matchVersion = version;
        this.caseInsensitive = tokenizerOptions.isCaseInsensitive();
        this.useStopWords = tokenizerOptions.doesUseStopWords();
        this.useStem = tokenizerOptions.doesUseStem();
        this.language = language;
    }

    /** Adds one entry to the registry; used only by the static initializer. */
    private static void register(String langCode, Class<? extends LanguageTokenizer> tokenizerClass) {
        tokenizerClasses.put(Language.getByLangCode(langCode), tokenizerClass);
    }

    /**
     * Wraps the given tokenizer in this language's filter chain.
     *
     * @param tokenizer    tokenizer producing the raw tokens
     * @param charArraySet word set handed to the implementation; how it is used
     *                     is defined by each subclass
     * @return the resulting token stream
     */
    public abstract TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet charArraySet);

    /**
     * Creates the base tokenizer for a reader. This implementation returns a
     * {@link StandardTokenizer} configured with {@link #matchVersion}.
     *
     * @param reader text source
     * @return a new tokenizer reading from {@code reader}
     */
    public Tokenizer makeTokenizer(Reader reader) {
        return new StandardTokenizer(this.matchVersion, reader);
    }

    /**
     * Convenience overload: tokenizes {@code reader} with {@link #makeTokenizer(Reader)}
     * and passes {@link CharArraySet#EMPTY_SET} as the word set.
     *
     * @param reader text source
     * @return the resulting token stream
     */
    public TokenStream getTokenStream(Reader reader) {
        return getTokenStream(makeTokenizer(reader), CharArraySet.EMPTY_SET);
    }

    /**
     * Rebuilds a {@link TokenizerOptions} from the flags stored in this instance.
     *
     * @return a fresh options object on which each enabled flag has been switched on
     */
    public TokenizerOptions getTokenizerOptions() {
        TokenizerOptions tokenizerOptions = new TokenizerOptions();
        if (this.caseInsensitive) {
            tokenizerOptions.caseInsensitive();
        }
        if (this.useStopWords) {
            tokenizerOptions.useStopWords();
        }
        if (this.useStem) {
            tokenizerOptions.useStem();
        }
        return tokenizerOptions;
    }

    /**
     * @return the language this tokenizer was created for
     */
    public Language getLanguage() {
        return this.language;
    }

    /**
     * Creates the tokenizer for {@code language} using the match version and
     * tokenizer options carried by {@code luceneOptions}.
     *
     * @param language      language to tokenize
     * @param luceneOptions supplies {@code options} and {@code matchVersion}; must not be null
     * @return a tokenizer for the language
     * @throws RuntimeException if the tokenizer cannot be instantiated
     * @see #getLanguageTokenizer(Language, TokenizerOptions, Version)
     */
    public static LanguageTokenizer getLanguageTokenizer(Language language, LuceneOptions luceneOptions) {
        return getLanguageTokenizer(language, luceneOptions.options, luceneOptions.matchVersion);
    }

    /**
     * Creates the tokenizer for {@code language}.
     *
     * <p>The language with code {@code "simple"} is treated as {@code "en"}.
     * Languages present in the registry get their registered implementation,
     * instantiated reflectively; every other language gets a
     * {@link DefaultTokenizer}.
     *
     * @param language         language to tokenize
     * @param tokenizerOptions options passed to the tokenizer's constructor
     * @param version          Lucene match version passed to the tokenizer's constructor
     * @return a tokenizer for the language
     * @throws RuntimeException wrapping any exception raised while resolving or
     *                          instantiating the tokenizer (including a null {@code language})
     */
    public static LanguageTokenizer getLanguageTokenizer(Language language, TokenizerOptions tokenizerOptions, Version version) {
        try {
            Language effectiveLanguage = language;
            if (language.equals(Language.getByLangCode("simple"))) {
                effectiveLanguage = Language.getByLangCode("en");
            }
            // The registry never stores null values, so a null result means "not registered".
            Class<? extends LanguageTokenizer> tokenizerClass = tokenizerClasses.get(effectiveLanguage);
            if (tokenizerClass == null) {
                return new DefaultTokenizer(version, tokenizerOptions, effectiveLanguage);
            }
            return tokenizerClass
                    .getDeclaredConstructor(Version.class, TokenizerOptions.class, Language.class)
                    .newInstance(version, tokenizerOptions, effectiveLanguage);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Loads the stop words for a language from
     * {@code src/main/resources/stopwords/<langCode>.txt}, one entry per line.
     *
     * <p>The file is decoded as UTF-8 and is closed before this method returns.
     * When the file does not exist an empty set is returned.
     *
     * @param version  Lucene match version used to create the set
     * @param language language whose code selects the file
     * @return a case-sensitive set holding every line of the file, or an empty set
     * @throws RuntimeException wrapping the {@link IOException} if the file cannot be read
     */
    public static CharArraySet getStopWordsForNonLuceneLangFromFile(Version version, Language language) {
        File stopWordFile = new File(STOP_WORDS + language.getLangCode() + ".txt");
        CharArraySet stopWords = new CharArraySet(version, 0, false);
        if (!stopWordFile.exists()) {
            return stopWords;
        }
        try {
            // FileUtils.readLines opens and closes the file itself, so no stream is leaked,
            // and the explicit charset avoids depending on the platform default encoding.
            for (String word : FileUtils.readLines(stopWordFile, "UTF-8")) {
                stopWords.add(word);
            }
            return stopWords;
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}
