package org.culturegraph.mf.mediawiki;

import de.fau.cs.osr.ptk.common.ast.AstNode;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Collection;
import java.util.Collections;
import java.util.Deque;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.culturegraph.mf.framework.MetafactureException;
import org.culturegraph.mf.framework.ObjectReceiver;
import org.culturegraph.mf.framework.XmlReceiver;
import org.culturegraph.mf.framework.annotations.Description;
import org.culturegraph.mf.framework.annotations.In;
import org.culturegraph.mf.framework.annotations.Out;
import org.culturegraph.mf.framework.helpers.DefaultXmlPipe;
import org.culturegraph.mf.mediawiki.objects.WikiPage;
import org.sweble.wikitext.lazy.utils.StringConverter;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

/**
 * Receives the XML events of a MediaWiki dump and emits one {@link WikiPage}
 * per {@code <page>} element to the downstream receiver.
 *
 * <p>The handler keeps a stack of the currently open elements and a single
 * character buffer. The buffer is cleared whenever one of the elements
 * {@code base}, {@code title}, {@code ns}, {@code id} or {@code text} is
 * opened and is read when that element is closed.
 *
 * <p>Pages can be filtered by namespace id and by a regular expression that
 * must match the complete page title. A page that fails a filter is dropped as
 * soon as the deciding element ({@code ns} or {@code title}) has been closed.
 *
 * <p>Instances hold mutable parsing state and a reused {@link Matcher}; nothing
 * in this class synchronizes access to them.
 */
@Description("Extracts wiki pages from a media wiki xml dump.")
@In(XmlReceiver.class)
@Out(WikiPage.class)
public final class WikiXmlHandler extends DefaultXmlPipe<ObjectReceiver<WikiPage>> {
    /** Character encoding used when percent-encoding page titles. */
    private static final String ENCODING = "UTF-8";

    /** One or more commas and/or whitespace characters between namespace ids. */
    private static final Pattern NAMESPACE_ID_SEPARATOR = Pattern.compile("[,\\s]+");

    /** The last slash of a URL together with everything that follows it. */
    private static final Pattern LAST_PATH_SEGMENT = Pattern.compile("/[^/]*$");

    /** Currently open elements, innermost first. */
    private final Deque<Tag> stack = new LinkedList<Tag>();

    /** Text collected since the last tracked element was opened. */
    private final StringBuilder charData = new StringBuilder();

    /** Namespace ids to keep; an empty set disables namespace filtering. */
    private final Set<Integer> includeNamespaceIds = new HashSet<Integer>();

    /** Reusable matcher for the title filter; {@code null} disables it. */
    private Matcher includePagesMatcher;

    /** URL prefix derived from the {@code base} element, or {@code null}. */
    private String baseURL;

    /** Page currently being assembled. */
    private WikiPage currentPage;

    /**
     * {@code true} while inside a {@code page} element that has not been
     * rejected by a filter.
     */
    private boolean includePage;

    /**
     * Elements the handler reacts to. Element names are matched
     * case-insensitively against these constants; every other element is
     * represented by {@link #OTHER}.
     */
    public enum Tag {
        BASE,
        PAGE,
        TITLE,
        NS,
        ID,
        REVISION,
        TEXT,
        OTHER
    }

    /**
     * Returns the namespace filter.
     *
     * @return an unmodifiable view of the namespace ids to include; empty if
     *         namespace filtering is disabled
     */
    public Set<Integer> getIncludeNamespaceIds() {
        return Collections.unmodifiableSet(this.includeNamespaceIds);
    }

    /**
     * Replaces the namespace filter with the given ids.
     *
     * @param collection namespace ids to include; an empty collection
     *                   disables namespace filtering
     */
    public void setIncludeNamespacesIds(Collection<Integer> collection) {
        // Snapshot first: the argument may be the view returned by
        // getIncludeNamespaceIds(), which would be emptied by clear().
        final Set<Integer> snapshot = new HashSet<Integer>(collection);
        this.includeNamespaceIds.clear();
        this.includeNamespaceIds.addAll(snapshot);
    }

    /**
     * Replaces the namespace filter with the ids listed in a string.
     *
     * <p>Ids may be separated by commas, whitespace or any mix of both
     * (for example {@code "0, 1 2"}). Empty tokens are ignored, so a
     * {@code null} or blank argument disables namespace filtering. The
     * current filter is left untouched if parsing fails.
     *
     * @param namespaceIds separated list of integer namespace ids
     * @throws NumberFormatException if a token is not a valid integer
     */
    public void setIncludeNamespaceIds(String namespaceIds) {
        final Set<Integer> parsedIds = new HashSet<Integer>();
        if (namespaceIds != null) {
            for (String token : NAMESPACE_ID_SEPARATOR.split(namespaceIds)) {
                final String trimmed = token.trim();
                if (!trimmed.isEmpty()) {
                    parsedIds.add(Integer.valueOf(trimmed));
                }
            }
        }
        // Only swap after every token has been parsed successfully.
        this.includeNamespaceIds.clear();
        this.includeNamespaceIds.addAll(parsedIds);
    }

    /**
     * Returns the regular expression used to filter page titles.
     *
     * @return the pattern string, or {@code null} if no title filter is set
     */
    public String getIncludePagesPattern() {
        if (this.includePagesMatcher == null) {
            return null;
        }
        return this.includePagesMatcher.pattern().pattern();
    }

    /**
     * Sets the regular expression that a page title must match completely for
     * the page to be emitted.
     *
     * @param pattern regular expression, or {@code null} to disable title
     *                filtering
     * @throws java.util.regex.PatternSyntaxException if the expression is
     *         invalid
     */
    public void setIncludePagesPattern(String pattern) {
        if (pattern == null) {
            this.includePagesMatcher = null;
        } else {
            // The matcher is created once and re-targeted per title via reset().
            this.includePagesMatcher = Pattern.compile(pattern).matcher("");
        }
    }

    /** Resets all per-document parsing state. */
    public void startDocument() {
        this.includePage = false;
        this.baseURL = null;
        this.currentPage = null;
        this.charData.setLength(0);
        this.stack.clear();
    }

    /**
     * Handles an opening tag: starts a new page on {@code page}, clears the
     * character buffer for elements whose text is read later, and records the
     * element on the stack.
     *
     * @param uri        namespace URI (unused)
     * @param localName  local element name used to identify the element
     * @param qName      qualified element name (unused)
     * @param attributes element attributes (unused)
     */
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
        final Tag tag = localNameToTag(localName);
        switch (tag) {
            case PAGE:
                this.currentPage = new WikiPage();
                this.includePage = true;
                break;
            case BASE:
            case TITLE:
            case NS:
            case ID:
            case TEXT:
                this.charData.setLength(0);
                break;
            default:
                // Elements without special handling are only tracked on the stack.
                break;
        }
        this.stack.push(tag);
    }

    /**
     * Handles a closing tag: stores the collected text in the current page or
     * emits the finished page to the receiver.
     *
     * @param uri       namespace URI (unused)
     * @param localName local element name used to identify the element
     * @param qName     qualified element name (unused)
     */
    public void endElement(String uri, String localName, String qName) throws SAXException {
        final Tag tag = localNameToTag(localName);
        // Remove the closing element first so that peek() yields its parent.
        this.stack.pop();
        switch (tag) {
            case PAGE:
                if (this.includePage) {
                    getReceiver().process(this.currentPage);
                }
                this.includePage = false;
                break;
            case BASE:
                this.baseURL = extractBaseURL(this.charData.toString());
                break;
            case TITLE:
                handleTitleEndElement();
                break;
            case NS:
                handleNSEndElement();
                break;
            case ID:
                handleIdEndElement();
                break;
            case TEXT:
                if (this.includePage) {
                    this.currentPage.setWikiText(this.charData.toString());
                }
                break;
            default:
                break;
        }
    }

    /**
     * Appends character data to the buffer while an included page is being
     * read or while the innermost open element is {@code base}.
     *
     * @param chars  array holding the characters
     * @param start  index of the first character to append
     * @param length number of characters to append
     */
    public void characters(char[] chars, int start, int length) throws SAXException {
        if (this.includePage || this.stack.peek() == Tag.BASE) {
            this.charData.append(chars, start, length);
        }
    }

    /**
     * Stores title and URL on the current page and applies the title filter,
     * if one is configured.
     */
    private void handleTitleEndElement() {
        if (!this.includePage) {
            return;
        }
        this.currentPage.setTitle(this.charData.toString());
        this.currentPage.setUrl(urlFromTitle(this.currentPage.getTitle()));
        if (this.includePagesMatcher != null) {
            this.includePagesMatcher.reset(this.currentPage.getTitle());
            this.includePage = this.includePagesMatcher.matches();
        }
    }

    /**
     * Stores the namespace id on the current page and applies the namespace
     * filter, if one is configured.
     */
    private void handleNSEndElement() {
        if (!this.includePage) {
            return;
        }
        final int namespaceId = Integer.parseInt(this.charData.toString().trim());
        this.currentPage.setNamespaceId(namespaceId);
        if (!this.includeNamespaceIds.isEmpty()) {
            this.includePage = this.includeNamespaceIds.contains(Integer.valueOf(namespaceId));
        }
    }

    /**
     * Stores an {@code id} value as page id or revision id depending on the
     * enclosing element; ids nested in any other element are ignored.
     */
    private void handleIdEndElement() {
        if (!this.includePage) {
            return;
        }
        // The id element itself has already been popped, so this is its parent.
        final Tag parent = this.stack.peek();
        if (parent == Tag.PAGE) {
            this.currentPage.setPageId(Long.parseLong(this.charData.toString().trim()));
        } else if (parent == Tag.REVISION) {
            this.currentPage.setRevisionId(Long.parseLong(this.charData.toString().trim()));
        }
    }

    /**
     * Maps an element name to its {@link Tag}, ignoring case.
     *
     * @param localName element name; may be {@code null}
     * @return the matching tag, or {@link Tag#OTHER} if the name is
     *         {@code null} or does not correspond to a constant
     */
    private Tag localNameToTag(String localName) {
        if (localName == null) {
            return Tag.OTHER;
        }
        try {
            return Tag.valueOf(localName.toUpperCase(Locale.ROOT));
        } catch (IllegalArgumentException e) {
            // Not one of the elements this handler cares about.
            return Tag.OTHER;
        }
    }

    /**
     * Derives the URL prefix for pages from the content of the {@code base}
     * element by cutting off everything after the last slash.
     *
     * @param base text content of the {@code base} element
     * @return the prefix ending in a slash, the trimmed input unchanged if it
     *         contains no slash, or {@code null} if the input is blank
     */
    private String extractBaseURL(String base) {
        final String trimmed = base.trim();
        if (trimmed.isEmpty()) {
            return null;
        }
        return LAST_PATH_SEGMENT.matcher(trimmed).replaceAll("/");
    }

    /**
     * Builds the URL of a page from its title: the title is trimmed, spaces
     * are replaced by underscores and the result is URL-encoded as UTF-8
     * before it is appended to the base URL.
     *
     * @param title page title
     * @return the page URL, or {@code null} if no base URL is known
     * @throws MetafactureException if UTF-8 is not supported by the runtime
     */
    private String urlFromTitle(String title) {
        if (this.baseURL == null) {
            return null;
        }
        try {
            return this.baseURL + URLEncoder.encode(title.trim().replace(" ", "_"), ENCODING);
        } catch (UnsupportedEncodingException e) {
            throw new MetafactureException(e);
        }
    }
}
