package org.opensextant.extractors.geo.rules;

import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.opensextant.data.Place;
import org.opensextant.extractors.geo.PlaceCandidate;
import org.opensextant.extractors.geo.ScoredPlace;
import org.opensextant.util.TextUtils;

/* loaded from: input_file:org/opensextant/extractors/geo/rules/NonsenseFilter.class */
/**
 * Geocoding rule that filters out place-name candidates that look like nonsense:
 * odd punctuation, irregular letter case, a leading trivial article, trivial
 * "word + number" phrases, repeated lower-case tokens, or short names that match
 * none of their gazetteer places closely enough.
 *
 * <p>Candidates are marked via {@code PlaceCandidate.setFilteredOut(true)} and, for most
 * checks, annotated with a rule label via {@code PlaceCandidate.addRule(String)}.
 */
public class NonsenseFilter extends GeocodeRule {
    /** Candidates longer than this many characters are not examined by this filter. */
    private static final int MAX_NONSENSE_PHRASE_LEN = 20;
    /** Candidate must be longer than this for the contains/phonetic comparison to apply. */
    private static final int MIN_PHONETIC_MATCH_LEN = 4;
    /** Default threshold used by {@link #isIrregularPunct(int, int)}. */
    private static final int DEFAULT_MIN_CHARS_PER_PUNCT = 5;
    public static final int AV = 10;

    /** Lower-case articles that, leading a two-word candidate, mark it as noise. */
    private static final Set<String> TRIVIAL_ARTICLES = new HashSet<>();

    /** A compass/abbreviation-like token: one of E,F,M,N,S,W plus up to two letters, a period and whitespace. */
    static final Pattern anyValidAbbrev = Pattern.compile("[EFMNSW][A-Z]{0,2}\\.\\s+", Pattern.CASE_INSENSITIVE);
    /** Any character other than word characters, whitespace, period or hyphen. */
    static final Pattern nonAbbrevPunct = Pattern.compile("[^\\w\\s.-]+");
    /** Word characters, then punctuation/whitespace, then digits (whole-string match). */
    static final Pattern trivialNumerics = Pattern.compile("\\w+[\\p{Punct}\\s]+\\d+");
    /** Any single ASCII punctuation character or one of several Unicode dashes/quotes. */
    static final Pattern anyPunct = Pattern.compile("[\\p{Punct}—―“”″]");
    /** Brackets, parentheses and {@code ! & $}. */
    static final Pattern commonPunct = Pattern.compile("[()\\[\\]!&$]");

    static {
        TRIVIAL_ARTICLES.add("the");
        TRIVIAL_ARTICLES.add("a");
        TRIVIAL_ARTICLES.add("an");
        TRIVIAL_ARTICLES.add("le");
    }

    /**
     * @param str text to inspect
     * @return true if {@code str} is non-empty and its first character is lower case
     */
    private static boolean lowerInitial(String str) {
        return !str.isEmpty() && Character.isLowerCase(str.charAt(0));
    }

    /**
     * @param str text to inspect
     * @return true if {@code str} contains only word characters, whitespace, periods and hyphens
     */
    public static boolean isValidAbbreviation(String str) {
        return !nonAbbrevPunct.matcher(str).find();
    }

    /**
     * Runs the nonsense checks over every candidate that is not already valid, has tokens,
     * and is at most {@link #MAX_NONSENSE_PHRASE_LEN} characters long.
     *
     * @param list candidates to evaluate; each may be marked filtered-out and/or tagged with rules
     */
    @Override // org.opensextant.extractors.geo.rules.GeocodeRule
    public void evaluate(List<PlaceCandidate> list) {
        for (PlaceCandidate candidate : list) {
            if (candidate.isValid() || candidate.getTokens() == null) {
                continue;
            }
            if (candidate.getLength() > MAX_NONSENSE_PHRASE_LEN) {
                continue;
            }
            // Punctuation assessment is decisive (either way) whenever it returns true.
            if (assessPunctuation(candidate)) {
                continue;
            }
            if (irregularCase(candidate.getText())) {
                candidate.setFilteredOut(true);
                continue;
            }
            // Note: no rule label is recorded for this case and the later checks still run.
            if (startsWithTrivialArticle(candidate)) {
                candidate.setFilteredOut(true);
            }
            if (isShort(candidate.getLength()) && trivialNumerics.matcher(candidate.getText()).matches()) {
                candidate.setFilteredOut(true);
                candidate.addRule("Nonsense Numbers");
                continue;
            }
            boolean lowerCased = candidate.isLower() || lowerInitial(candidate.getText());
            if (lowerCased && hasRepeatedToken(candidate.getTokens())) {
                candidate.setFilteredOut(true);
                candidate.addRule("Nonsense Repeated-Lower");
            }
            if (!candidate.isFilteredOut() && isShort(candidate.getLength())) {
                assessPhoneticMatch(candidate);
            }
        }
    }

    /**
     * True for a two-word, non-country candidate whose first token is a trivial article and
     * which is either entirely lower case or has a lower-case first token followed by a
     * token that does not start lower case.
     */
    private static boolean startsWithTrivialArticle(PlaceCandidate candidate) {
        if (candidate.getWordCount() != 2 || candidate.isCountry) {
            return false;
        }
        String[] tokens = candidate.getTokens();
        if (tokens.length < 2) {
            // Word count and token array disagree; nothing safe to compare.
            return false;
        }
        String first = tokens[0];
        String second = tokens[1];
        boolean lowerLead = candidate.isLower() || (lowerInitial(first) && !lowerInitial(second));
        return lowerLead && TRIVIAL_ARTICLES.contains(first);
    }

    /** True if any token occurs more than once, compared case-insensitively. */
    private static boolean hasRepeatedToken(String[] tokens) {
        Set<String> seen = new HashSet<>();
        for (String token : tokens) {
            if (!seen.add(token.toLowerCase())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Assesses the punctuation in a candidate's text and either accepts or filters it.
     *
     * @param placeCandidate candidate to assess; may be marked filtered-out and/or tagged with a rule
     * @return true if the punctuation check reached a verdict (valid or filtered out);
     *         false if the text has no punctuation or the check was inconclusive
     */
    public static boolean assessPunctuation(PlaceCandidate placeCandidate) {
        String text = placeCandidate.getText();

        // Count punctuation with a single Matcher so each find() advances past the last hit.
        Matcher punctMatcher = anyPunct.matcher(text);
        int punctCount = 0;
        while (punctMatcher.find()) {
            punctCount++;
        }
        if (punctCount == 0) {
            return false;
        }
        if (isValidAbbreviation(text)) {
            placeCandidate.addRule("Valid Punct");
            return true;
        }
        if (shortNumericText(text) || irregularCommonPunct(text)) {
            placeCandidate.setFilteredOut(true);
            return true;
        }
        // Too much punctuation for the length is nonsense whether or not an abbreviation is present.
        if (isIrregularPunct(punctCount, placeCandidate.getLength())) {
            placeCandidate.setFilteredOut(true);
            placeCandidate.addRule("Nonsense Punct");
            return true;
        }
        if (regularAbbreviationPatterns(text)) {
            placeCandidate.addRule("Normal Abbrev");
            return true;
        }
        return false;
    }

    /**
     * Compares a candidate against its scored places; if none matches, the candidate is
     * filtered out with the rule "Nonsense,Mismatched,Diacritic".
     *
     * <p>A place matches when (in order): both sides have diacritics and the names are equal
     * ignoring case; neither side has diacritics; or, for candidates longer than
     * {@link #MIN_PHONETIC_MATCH_LEN}, the place's normalized name contains the candidate's
     * normalized text or the two non-diacritic normalized forms are equal.
     *
     * @param placeCandidate candidate to assess
     */
    public void assessPhoneticMatch(PlaceCandidate placeCandidate) {
        boolean matched = false;
        String matchRule = null;
        String candidateNdNorm = placeCandidate.getNDTextnorm();
        this.log.debug("Testing phrase {} phonetic:{}", placeCandidate.getTextnorm(), candidateNdNorm);

        for (ScoredPlace scored : placeCandidate.getPlaces()) {
            Place place = scored.getPlace();
            this.log.debug("\tPLACE={}, {}", place, place.getNamenorm());
            boolean placeHasDiacritics = TextUtils.hasDiacritics(place.getPlaceName());

            if (placeHasDiacritics && placeCandidate.hasDiacritics
                    && place.getName().equalsIgnoreCase(placeCandidate.getText())) {
                matched = true;
                matchRule = "Matched-Diacritics";
                break;
            }
            if (!placeHasDiacritics && !placeCandidate.hasDiacritics) {
                // Plain-text on both sides: accepted without recording a rule.
                matched = true;
                break;
            }
            if (placeCandidate.getLength() > MIN_PHONETIC_MATCH_LEN) {
                if (place.getNamenorm().contains(placeCandidate.getTextnorm())) {
                    matched = true;
                    matchRule = "Location-Contains-Name";
                    break;
                }
                if (candidateNdNorm.equals(place.getNDNamenorm())) {
                    matched = true;
                    matchRule = "Matched-Phonetic";
                    break;
                }
            }
            this.log.debug("\t{} !~ {}", placeCandidate.getText(), place.getNamenorm());
        }

        if (!matched) {
            placeCandidate.setFilteredOut(true);
            placeCandidate.addRule("Nonsense,Mismatched,Diacritic");
        } else if (matchRule != null) {
            placeCandidate.addRule(matchRule);
        }
    }

    /**
     * @param str text to inspect
     * @return true if {@code str} has at least three characters, starts with two upper-case
     *         characters and ends with a lower-case character (e.g. "ABc")
     */
    public boolean irregularCase(String str) {
        if (str.length() < 3) {
            return false;
        }
        return Character.isUpperCase(str.charAt(0))
                && Character.isUpperCase(str.charAt(1))
                && Character.isLowerCase(str.charAt(str.length() - 1));
    }

    /**
     * @param str text to inspect
     * @return true if {@code str} is non-empty, short (per {@code isShort}) and starts with a digit
     */
    public static boolean shortNumericText(String str) {
        return !str.isEmpty() && isShort(str.length()) && Character.isDigit(str.charAt(0));
    }

    /**
     * @param str text to inspect
     * @return true if {@code str} contains any of {@code ( ) [ ] ! & $}
     */
    public static boolean irregularCommonPunct(String str) {
        return commonPunct.matcher(str).find();
    }

    /**
     * Same as {@link #isIrregularPunct(int, int, int)} with a threshold of 5.
     *
     * @param punctCount number of punctuation characters
     * @param spanLength length of the text span
     * @return true if punctuation is too dense for the span
     */
    public static boolean isIrregularPunct(int punctCount, int spanLength) {
        return isIrregularPunct(punctCount, spanLength, DEFAULT_MIN_CHARS_PER_PUNCT);
    }

    /**
     * @param punctCount number of punctuation characters
     * @param spanLength length of the text span
     * @param minCharsPerPunct threshold for characters per punctuation mark
     * @return true if {@code punctCount} is non-zero and the integer quotient
     *         {@code spanLength / punctCount} is below {@code minCharsPerPunct}
     */
    public static boolean isIrregularPunct(int punctCount, int spanLength, int minCharsPerPunct) {
        return punctCount != 0 && spanLength / punctCount < minCharsPerPunct;
    }

    /**
     * @param str text to inspect
     * @return true if {@code str} contains a match for {@link #anyValidAbbrev}
     */
    public static boolean regularAbbreviationPatterns(String str) {
        return anyValidAbbrev.matcher(str).find();
    }

    /** Intentionally a no-op: this rule only works on whole candidates, not candidate/place pairs. */
    @Override // org.opensextant.extractors.geo.rules.GeocodeRule
    public void evaluate(PlaceCandidate placeCandidate, Place place) {
    }
}
