package org.opensextant.extractors.geo.rules;

import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.opensextant.data.Place;
import org.opensextant.extractors.geo.PlaceCandidate;
import org.opensextant.extractors.geo.ScoredPlace;
import org.opensextant.util.TextUtils;

/* loaded from: input_file:org/opensextant/extractors/geo/rules/NonsenseFilter.class */
public class NonsenseFilter extends GeocodeRule {
    public static final int GENERIC_ONE_WORD = 10;
    private static final Pattern anyValidAbbrev;
    private static final Pattern dotAbbrev;
    static Pattern validAbbrev;
    static Pattern trivialNumerics;
    static Pattern anyInvalidPunct;
    static Pattern anyPunctuation;
    private static int MAX_NONSENSE_PHRASE_LEN = 20;
    private static int MIN_PHONETIC_MATCH_LEN = 4;
    private static Pattern wsRedux = Pattern.compile("[-\\s+`]");
    private static Set<String> TRIVIAL_ARTICLES = new HashSet();

    protected static final String phoneticRedux(String str) {
        return wsRedux.matcher(str).replaceAll("");
    }

    private static boolean lowerInitial(String str) {
        return Character.isLowerCase(str.charAt(0));
    }

    private static boolean isValidAbbreviation(String str) {
        return TextUtils.isASCII(dotAbbrev.matcher(str).replaceAll(""));
    }

    protected static final boolean isPhoneticMatch(String str, String str2) {
        return phoneticRedux(str2).equalsIgnoreCase(str);
    }

    @Override // org.opensextant.extractors.geo.rules.GeocodeRule
    public void evaluate(List<PlaceCandidate> list) {
        for (PlaceCandidate placeCandidate : list) {
            if (!placeCandidate.isValid() && placeCandidate.getTokens() != null && placeCandidate.getLength() <= MAX_NONSENSE_PHRASE_LEN && !assessPunctuation(placeCandidate)) {
                if (placeCandidate.getWordCount() == 2) {
                    String str = placeCandidate.getTokens()[0];
                    String str2 = placeCandidate.getTokens()[1];
                    if ((placeCandidate.isLower() || (lowerInitial(str) && !lowerInitial(str2))) && TRIVIAL_ARTICLES.contains(str)) {
                        placeCandidate.setFilteredOut(true);
                    }
                }
                if (placeCandidate.getLength() >= 10 || !trivialNumerics.matcher(placeCandidate.getText()).matches()) {
                    if (placeCandidate.isLower() || lowerInitial(placeCandidate.getText())) {
                        HashSet hashSet = new HashSet();
                        String[] tokens = placeCandidate.getTokens();
                        int length = tokens.length;
                        int i = 0;
                        while (true) {
                            if (i >= length) {
                                break;
                            }
                            String lowerCase = tokens[i].toLowerCase();
                            if (hashSet.contains(lowerCase)) {
                                placeCandidate.setFilteredOut(true);
                                placeCandidate.addRule("Nonsense Repeated-Lower");
                                break;
                            } else {
                                hashSet.add(lowerCase);
                                i++;
                            }
                        }
                    }
                    if (!placeCandidate.isFilteredOut() && placeCandidate.getLength() <= 10) {
                        assessPhoneticMatch(placeCandidate);
                    }
                } else {
                    placeCandidate.setFilteredOut(true);
                    placeCandidate.addRule("Nonsense Numbers");
                }
            }
        }
    }

    public static boolean assessPunctuation(PlaceCandidate placeCandidate) {
        int i = 0;
        while (anyPunctuation.matcher(placeCandidate.getText()).find()) {
            i++;
        }
        if (i == 0) {
            return false;
        }
        boolean isIrregularPunct = isIrregularPunct(i, placeCandidate.getLength());
        if (isValidAbbreviation(placeCandidate.getText()) && !isIrregularPunct) {
            placeCandidate.addRule("Valid Punct");
            return true;
        }
        if (!regularAbbreviationPatterns(placeCandidate.getText()) && isIrregularPunct) {
            placeCandidate.setFilteredOut(true);
            placeCandidate.addRule("Nonsense Punct");
            return true;
        }
        if (!isIrregularPunct) {
            return false;
        }
        placeCandidate.setFilteredOut(true);
        placeCandidate.addRule("Nonsense Punct");
        return true;
    }

    public void assessPhoneticMatch(PlaceCandidate placeCandidate) {
        boolean z = false;
        String phoneticRedux = phoneticRedux(placeCandidate.getTextnorm());
        String str = null;
        this.log.debug("Testing phrase {} phonetic:{}", placeCandidate.getTextnorm(), phoneticRedux);
        Iterator<ScoredPlace> it = placeCandidate.getPlaces().iterator();
        while (true) {
            if (!it.hasNext()) {
                break;
            }
            ScoredPlace next = it.next();
            this.log.debug("\tPLACE={}, {}", next, next.getNamenorm());
            boolean hasDiacritics = TextUtils.hasDiacritics(next.getPlaceName());
            if (!hasDiacritics || !placeCandidate.hasDiacritics || !next.getName().equalsIgnoreCase(placeCandidate.getText())) {
                if (!hasDiacritics && !placeCandidate.hasDiacritics) {
                    z = true;
                    break;
                }
                if (placeCandidate.getLength() > MIN_PHONETIC_MATCH_LEN) {
                    if (next.getNamenorm().contains(placeCandidate.getTextnorm())) {
                        z = true;
                        str = "Location-Contains-Name";
                        break;
                    } else if (isPhoneticMatch(phoneticRedux, next.getNamenorm())) {
                        z = true;
                        str = "Matched-Phonetic";
                        break;
                    }
                }
                this.log.debug("\t{} !~ {}", placeCandidate.getText(), next.getNamenorm());
            } else {
                z = true;
                str = "Matched-Diacritics";
                break;
            }
        }
        if (!z) {
            placeCandidate.setFilteredOut(true);
            placeCandidate.addRule("Nonsense,Mismatched,Diacritic");
        } else if (str != null) {
            placeCandidate.addRule(str);
        }
    }

    public static boolean irregularPunctCount(String str) {
        return irregularPunctCount(str, 5);
    }

    public static boolean isIrregularPunct(int i, int i2) {
        return isIrregularPunct(i, i2, 5);
    }

    public static boolean isIrregularPunct(int i, int i2, int i3) {
        return i != 0 && i2 / i < i3;
    }

    public static boolean irregularPunctCount(String str, int i) {
        int i2 = 0;
        while (anyPunctuation.matcher(str).find()) {
            i2++;
        }
        return i2 != 0 && TextUtils.delete_whitespace(str).length() / i2 < i;
    }

    public static boolean irregularPunctPatterns(String str) {
        return anyInvalidPunct.matcher(str).find();
    }

    public static boolean regularAbbreviationPatterns(String str) {
        return anyValidAbbrev.matcher(str).find();
    }

    public static int[] irregularPunct(String str) {
        int i = 0;
        int i2 = 0;
        char c = 0;
        for (char c2 : str.toCharArray()) {
            if (Character.isWhitespace(c2)) {
                i2++;
            }
            if ((Character.isWhitespace(c2) || !Character.isLetterOrDigit(c2)) && !Character.isLetterOrDigit(c) && c != 0) {
                i++;
            }
            c = c2;
        }
        return new int[]{i2, i};
    }

    @Override // org.opensextant.extractors.geo.rules.GeocodeRule
    public void evaluate(PlaceCandidate placeCandidate, Place place) {
    }

    static {
        TRIVIAL_ARTICLES.add("the");
        TRIVIAL_ARTICLES.add("a");
        TRIVIAL_ARTICLES.add("an");
        TRIVIAL_ARTICLES.add("le");
        anyValidAbbrev = Pattern.compile("[EFMNSW][A-Z]{0,2}\\.\\s+", 2);
        dotAbbrev = Pattern.compile("\\.\\s*");
        validAbbrev = Pattern.compile("\\w+[.] \\S+");
        trivialNumerics = Pattern.compile("\\w+[\\p{Punct}\\s]+\\d+");
        anyInvalidPunct = Pattern.compile("[[\\p{Punct}—―“”″]&&[^-_.'`]]+");
        anyPunctuation = Pattern.compile("([\\p{Punct}—―“”″]{1})");
    }
}
