package org.carrot2.text.linguistic;

import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import java.io.StringReader;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.Iterator;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.attribute.Init;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.DefaultLanguageModelFactoryDescriptor;
import org.carrot2.text.linguistic.lucene.ArabicStemmerFactory;
import org.carrot2.text.linguistic.lucene.ChineseSimplifiedTokenizerFactory;
import org.carrot2.text.linguistic.lucene.SnowballStemmerFactory;
import org.carrot2.text.linguistic.morfologik.PolishStemmerFactory;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.resource.ResourceUtils;
import org.carrot2.util.resource.ResourceUtilsFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Default {@link ILanguageModelFactory}: builds {@link DefaultLanguageModel} instances from
 * lexical resources, a stemmer and a tokenizer selected by {@link LanguageCode}.
 *
 * <p>Lexical resources are cached in static fields shared by every instance of this class and
 * are guarded by the class monitor. Stemmers and tokenizers are cached per factory instance.
 */
@Bindable(prefix = "DefaultLanguageModelFactory")
public class DefaultLanguageModelFactory implements ILanguageModelFactory {
    /**
     * Result of merging every entry of {@link #LEXICAL_RESOURCES_CACHE}; {@code null} until the
     * first merge. Guarded by the {@code DefaultLanguageModelFactory.class} monitor.
     */
    private static LexicalResources LEXICAL_RESOURCES_MERGED;

    // NOTE(review): the logger category is DefaultLanguageModel, not this factory class. Left
    // unchanged because existing logging configuration may filter on that name.
    // This field must stay above the factory maps below: their static initializers log through it.
    private static final Logger logger = LoggerFactory.getLogger(DefaultLanguageModel.class);

    /** Stemmer factories per language, verified once at class initialization. */
    private static final EnumMap<LanguageCode, IStemmerFactory> stemmerFactories = createDefaultStemmers();

    /** Tokenizer factories per language, verified once at class initialization. */
    private static final EnumMap<LanguageCode, ITokenizerFactory> tokenizerFactories = createDefaultTokenizers();

    /**
     * Lexical resources loaded so far, keyed by language. Guarded by the
     * {@code DefaultLanguageModelFactory.class} monitor.
     */
    private static final HashMap<LanguageCode, LexicalResources> LEXICAL_RESOURCES_CACHE = Maps.newHashMap();

    /** Path handed to {@code LexicalResources.load} when resources are (re)loaded. */
    @Input
    @Init
    @Attribute(key = DefaultLanguageModelFactoryDescriptor.Keys.RESOURCE_PATH)
    public String resourcePath = "/";

    /** When {@code true}, lexical resources are loaded again even if they are already cached. */
    @Input
    @Attribute
    @Processing
    public boolean reloadResources = false;

    /**
     * When {@code true}, resources of all languages are loaded and merged, and the merged
     * resources are used for every requested language.
     */
    @Init
    @Processing
    @Input
    @Attribute
    public boolean mergeResources = true;

    /** Stemmers created by this instance; guarded by its own monitor. */
    private final HashMap<LanguageCode, IStemmer> stemmerCache = Maps.newHashMap();

    /** Tokenizers created by this instance; guarded by its own monitor. */
    private final HashMap<LanguageCode, ITokenizer> tokenizerCache = Maps.newHashMap();

    /**
     * Returns a language model for the given language, loading lexical resources and creating
     * the stemmer and tokenizer on first use.
     *
     * @param languageCode language to build the model for
     * @return a new {@link DefaultLanguageModel} backed by the cached components
     */
    @Override // org.carrot2.text.linguistic.ILanguageModelFactory
    public final ILanguageModel getLanguageModel(LanguageCode languageCode) {
        final LexicalResources lexicalResources = resolveLexicalResources(languageCode);

        final IStemmer stemmer;
        synchronized (this.stemmerCache) {
            // containsKey (rather than a null check) so that a cached null is not re-created.
            if (this.stemmerCache.containsKey(languageCode)) {
                stemmer = this.stemmerCache.get(languageCode);
            } else {
                stemmer = createStemmer(languageCode);
                this.stemmerCache.put(languageCode, stemmer);
            }
        }

        final ITokenizer tokenizer;
        synchronized (this.tokenizerCache) {
            if (this.tokenizerCache.containsKey(languageCode)) {
                tokenizer = this.tokenizerCache.get(languageCode);
            } else {
                tokenizer = createTokenizer(languageCode);
                this.tokenizerCache.put(languageCode, tokenizer);
            }
        }

        return new DefaultLanguageModel(languageCode, lexicalResources, stemmer, tokenizer);
    }

    /**
     * Loads (if required) and returns the lexical resources to use for the given language.
     * All access to the shared static caches happens under the class monitor, including the
     * final read, so callers never observe a map that another thread is modifying.
     */
    private LexicalResources resolveLexicalResources(LanguageCode languageCode) {
        // Snapshot the public, mutable attributes once so the load decision and the final
        // lookup below are taken against the same values.
        final boolean reload = this.reloadResources;
        final boolean merge = this.mergeResources;

        synchronized (DefaultLanguageModelFactory.class) {
            final boolean mergedMissing = merge && LEXICAL_RESOURCES_MERGED == null;
            if (reload || !LEXICAL_RESOURCES_CACHE.containsKey(languageCode) || mergedMissing) {
                final ResourceUtils resourceUtils = ResourceUtilsFactory.getDefaultResourceUtils();
                if (merge) {
                    // Merging needs every language present in the cache.
                    for (LanguageCode code : LanguageCode.values()) {
                        if (reload || !LEXICAL_RESOURCES_CACHE.containsKey(code)) {
                            LEXICAL_RESOURCES_CACHE.put(code,
                                LexicalResources.load(resourceUtils, code, this.resourcePath));
                        }
                    }
                    LEXICAL_RESOURCES_MERGED = LexicalResources.merge(LEXICAL_RESOURCES_CACHE.values());
                } else {
                    LEXICAL_RESOURCES_CACHE.put(languageCode,
                        LexicalResources.load(resourceUtils, languageCode, this.resourcePath));
                }
            }
            return merge ? LEXICAL_RESOURCES_MERGED : LEXICAL_RESOURCES_CACHE.get(languageCode);
        }
    }

    /**
     * Creates a stemmer for the given language.
     *
     * @return an instance from the registered factory, or {@link IdentityStemmer#INSTANCE} when
     *         no factory is registered for the language
     */
    protected IStemmer createStemmer(LanguageCode languageCode) {
        final IStemmerFactory factory = stemmerFactories.get(languageCode);
        if (factory == null) {
            return IdentityStemmer.INSTANCE;
        }
        return factory.createInstance();
    }

    /**
     * Creates a tokenizer for the given language using the registered factory (one is
     * registered for every {@link LanguageCode} value).
     */
    protected ITokenizer createTokenizer(LanguageCode languageCode) {
        return tokenizerFactories.get(languageCode).createInstance();
    }

    /**
     * Registers the default stemmer factory for each supported language and smoke-tests each
     * one; factories that fail are replaced by {@link IdentityStemmerFactory} with a warning.
     */
    private static EnumMap<LanguageCode, IStemmerFactory> createDefaultStemmers() {
        final EnumMap<LanguageCode, IStemmerFactory> factories = Maps.newEnumMap(LanguageCode.class);

        factories.put(LanguageCode.POLISH, new PolishStemmerFactory());
        factories.put(LanguageCode.ARABIC, new ArabicStemmerFactory());
        factories.put(LanguageCode.DANISH, new SnowballStemmerFactory("org.tartarus.snowball.ext.DanishStemmer"));
        factories.put(LanguageCode.DUTCH, new SnowballStemmerFactory("org.tartarus.snowball.ext.DutchStemmer"));
        factories.put(LanguageCode.ENGLISH, new SnowballStemmerFactory("org.tartarus.snowball.ext.EnglishStemmer"));
        factories.put(LanguageCode.FINNISH, new SnowballStemmerFactory("org.tartarus.snowball.ext.FinnishStemmer"));
        factories.put(LanguageCode.FRENCH, new SnowballStemmerFactory("org.tartarus.snowball.ext.FrenchStemmer"));
        factories.put(LanguageCode.GERMAN, new SnowballStemmerFactory("org.tartarus.snowball.ext.GermanStemmer"));
        factories.put(LanguageCode.HUNGARIAN, new SnowballStemmerFactory("org.tartarus.snowball.ext.HungarianStemmer"));
        factories.put(LanguageCode.ITALIAN, new SnowballStemmerFactory("org.tartarus.snowball.ext.ItalianStemmer"));
        factories.put(LanguageCode.NORWEGIAN, new SnowballStemmerFactory("org.tartarus.snowball.ext.NorwegianStemmer"));
        factories.put(LanguageCode.PORTUGUESE, new SnowballStemmerFactory("org.tartarus.snowball.ext.PortugueseStemmer"));
        factories.put(LanguageCode.ROMANIAN, new SnowballStemmerFactory("org.tartarus.snowball.ext.RomanianStemmer"));
        factories.put(LanguageCode.RUSSIAN, new SnowballStemmerFactory("org.tartarus.snowball.ext.RussianStemmer"));
        factories.put(LanguageCode.SPANISH, new SnowballStemmerFactory("org.tartarus.snowball.ext.SpanishStemmer"));
        factories.put(LanguageCode.SWEDISH, new SnowballStemmerFactory("org.tartarus.snowball.ext.SwedishStemmer"));
        factories.put(LanguageCode.TURKISH, new SnowballStemmerFactory("org.tartarus.snowball.ext.TurkishStemmer"));
        factories.put(LanguageCode.CHINESE_SIMPLIFIED, new IdentityStemmerFactory());

        // Iterate over a copy of the keys because entries may be replaced inside the loop.
        for (LanguageCode languageCode : Sets.newTreeSet(factories.keySet())) {
            try {
                factories.get(languageCode).createInstance().stem("test");
            } catch (Throwable th) {
                // Deliberately broad best-effort fallback; presumably also meant to cover
                // linkage errors from absent optional stemmer libraries -- TODO confirm.
                factories.put(languageCode, new IdentityStemmerFactory());
                logger.warn("Stemmer for " + languageCode.toString() + " (" + languageCode.getIsoCode() + ") is not available. This may degrade clustering quality of " + languageCode.toString() + " content.");
            }
        }
        return factories;
    }

    /**
     * Registers the default tokenizer factory for every language (a shared
     * {@link ExtendedWhitespaceTokenizerFactory}, with a dedicated factory for simplified
     * Chinese) and smoke-tests each one; factories that fail are replaced by a fresh
     * {@link ExtendedWhitespaceTokenizerFactory} with a warning.
     */
    private static EnumMap<LanguageCode, ITokenizerFactory> createDefaultTokenizers() {
        final EnumMap<LanguageCode, ITokenizerFactory> factories = Maps.newEnumMap(LanguageCode.class);

        final ExtendedWhitespaceTokenizerFactory whitespaceFactory = new ExtendedWhitespaceTokenizerFactory();
        for (LanguageCode languageCode : LanguageCode.values()) {
            factories.put(languageCode, whitespaceFactory);
        }
        factories.put(LanguageCode.CHINESE_SIMPLIFIED, new ChineseSimplifiedTokenizerFactory());

        // Iterate over a copy of the keys because entries may be replaced inside the loop.
        for (LanguageCode languageCode : Sets.newTreeSet(factories.keySet())) {
            try {
                factories.get(languageCode).createInstance().reset(new StringReader("test"));
            } catch (Throwable th) {
                // Deliberately broad best-effort fallback, mirroring the stemmer verification.
                factories.put(languageCode, new ExtendedWhitespaceTokenizerFactory());
                logger.warn("Tokenizer for " + languageCode.toString() + " (" + languageCode.getIsoCode() + ") is not available. This may degrade clustering quality of " + languageCode.toString() + " content.");
            }
        }
        return factories;
    }
}
