package nlp4j.wiki.converter;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import nlp4j.wiki.BreakException;
import nlp4j.wiki.WikiDumpReader;
import nlp4j.wiki.WikiPage;
import nlp4j.wiki.WikiPageHandler;
import nlp4j.wiki.category.WikiCategoryIndexReader;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;

/* loaded from: input_file:nlp4j/wiki/converter/MediaWikiCsvConverter_V2.class */
/**
 * Command-line tool that reads a MediaWiki dump ({@code .bz2}) with {@link WikiDumpReader} and
 * writes one CSV record per accepted page using an RFC 4180 {@link CSVPrinter}.
 *
 * <p>Pages whose title contains {@code ':'} are skipped. Category columns are enriched through a
 * {@link WikiCategoryIndexReader} built from the category index file and the category file.
 */
public class MediaWikiCsvConverter_V2 {

    /** Column names of the CSV header, in the same order as the values of each record. */
    private static final String[] CSV_HEADER = {
        "title",
        "namespace",
        "id",
        "revision_id",
        "revision_parentid",
        "revision_timestamp",
        "is_redirect",
        "redirect_title",
        "categories",
        "categories_all",
        "categories_firstall",
        "templates",
        "text_length",
        "text_abstract",
        "text_all"
    };

    /** A progress counter is printed to stderr every this many counted pages. */
    private static final int PROGRESS_INTERVAL = 1000;

    /**
     * Entry point.
     *
     * @param strArr exactly six arguments: dump file path ({@code .bz2}), category index file,
     *     category file, output CSV path (must not exist yet), maximum number of pages to process
     *     (non-positive or non-numeric means no limit), and a root-category filter
     *     ({@code "null"} disables the filter)
     * @throws IOException if the output file already exists or a CSV record could not be written
     * @throws Exception any other failure raised while opening or reading the inputs
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr == null || strArr.length != 6) {
            System.err.println("Usage:");
            System.err.println("args[0]: Input file path of Wikipedia/Wiktionary dump file in bz2 format");
            System.err.println("args[1]: Category Index File");
            System.err.println("args[2]: Category file");
            System.err.println("args[3]: Output file path of CSV");
            System.err.println("args[4]: Max count of process");
            System.err.println("args[5]: Category filter: set \"null\" when not needed");
            return;
        }

        final File categoryIndexFile = new File(strArr[1]);
        final File categoryFile = new File(strArr[2]);
        // Constant-first equals: a null array element means "no filter" instead of an NPE.
        final String categoryFilter = "null".equals(strArr[5]) ? null : strArr[5];
        final WikiCategoryIndexReader categoryIndex = new WikiCategoryIndexReader(categoryIndexFile, categoryFile);
        final Integer maxCount = parseMaxCount(strArr[4]);

        final File dumpFile = new File(strArr[0]);
        if (!dumpFile.exists()) {
            System.err.println("Not Found: " + dumpFile.getAbsolutePath());
            return;
        }
        if (!dumpFile.getName().endsWith(".bz2")) {
            System.err.println("Not .bz2: " + dumpFile.getAbsolutePath());
            return;
        }

        final File outputFile = new File(strArr[3]);
        if (outputFile.exists()) {
            // Refuse to overwrite an existing result.
            throw new IOException("Output file already exists: " + outputFile.getAbsolutePath());
        }

        // The handler can only signal problems through BreakException, so a write failure is
        // parked here and rethrown once the resources have been closed.
        final IOException[] writeFailure = new IOException[1];

        // The writer is its own resource so that it is closed even when opening the dump reader or
        // creating the printer fails. Resources are closed in reverse order: printer, reader, writer.
        try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(outputFile, false), "UTF-8");
                WikiDumpReader dumpReader = new WikiDumpReader(dumpFile);
                CSVPrinter csvPrinter = new CSVPrinter(writer, CSVFormat.RFC4180.builder().setHeader(CSV_HEADER).build())) {
            dumpReader.read(new WikiPageHandler() {
                /** Number of pages that passed the title check so far. */
                int count = 0;

                @Override
                public void read(WikiPage wikiPage) throws BreakException {
                    if (wikiPage.getTitle().contains(":")) {
                        return;
                    }
                    this.count++;
                    if (this.count % PROGRESS_INTERVAL == 0) {
                        System.err.println(this.count);
                    }
                    if (maxCount != null && this.count > maxCount.intValue()) {
                        // Limit reached: ask the reader to stop.
                        throw new BreakException();
                    }

                    String categories = "";
                    String rootCategories = "";
                    String firstRootCategories = "";
                    if (wikiPage.getCategoryTags() != null) {
                        categories = String.join(",", wikiPage.getCategoryTags());

                        // LinkedHashSet: de-duplicates while keeping first-seen order.
                        LinkedHashSet<String> roots = new LinkedHashSet<>();
                        for (String tag : wikiPage.getCategoryTags()) {
                            if (categoryIndex.getCategory(tag) != null
                                    && categoryIndex.getCategory(tag).getRootCategoriesAsList() != null) {
                                for (String root : categoryIndex.getCategory(tag).getRootCategoriesAsList()) {
                                    roots.add(root);
                                }
                            }
                        }
                        if (categoryFilter != null && !roots.contains(categoryFilter)) {
                            // Page does not belong to the requested root category.
                            return;
                        }
                        rootCategories = String.join(",", roots);

                        LinkedHashSet<String> firstRoots = new LinkedHashSet<>();
                        for (String tag : wikiPage.getCategoryTags()) {
                            if (categoryIndex.getCategory(tag) != null
                                    && categoryIndex.getCategory(tag).getFirstRootCategoriesAsList() != null) {
                                for (String firstRoot : categoryIndex.getCategory(tag).getFirstRootCategoriesAsList()) {
                                    firstRoots.add(firstRoot);
                                }
                            }
                        }
                        firstRootCategories = String.join(",", firstRoots);
                    }

                    String templates = wikiPage.getTemplateTags() != null
                            ? String.join(",", wikiPage.getTemplateTags())
                            : "";
                    String redirectTitle = wikiPage.isRediect() ? wikiPage.getRediect_title() : "";
                    String abstractText = wikiPage.getRootNodePlainText();

                    try {
                        // Value order must match CSV_HEADER.
                        csvPrinter.printRecord(
                                wikiPage.getTitle(),
                                wikiPage.getNamespace(),
                                wikiPage.getId(),
                                wikiPage.getRevisionId(),
                                wikiPage.getParentId(),
                                wikiPage.getTimestamp(),
                                "" + wikiPage.isRediect(),
                                redirectTitle,
                                categories,
                                rootCategories,
                                firstRootCategories,
                                templates,
                                "" + abstractText.length(),
                                abstractText,
                                wikiPage.getPlainText());
                    } catch (IOException e) {
                        System.err.println("error on: " + this.count);
                        writeFailure[0] = e;
                        throw new BreakException();
                    }
                }
            });
        } catch (BreakException stop) {
            // Early stop requested by the handler; only a stop without a write failure is "OK".
            if (writeFailure[0] == null) {
                System.err.println("OK");
            }
        }

        if (writeFailure[0] != null) {
            // Surface the write failure instead of exiting as if the conversion succeeded.
            throw writeFailure[0];
        }
    }

    /**
     * Parses the "max count" argument.
     *
     * @param raw the raw command-line value
     * @return the positive limit, or {@code null} (no limit) when the value is not a number or is
     *     not greater than zero
     */
    private static Integer parseMaxCount(String raw) {
        try {
            int parsed = Integer.parseInt(raw);
            return parsed > 0 ? Integer.valueOf(parsed) : null;
        } catch (NumberFormatException ignored) {
            // A non-numeric value is tolerated and treated as "no limit", but say so explicitly.
            System.err.println("Max count is not a number, processing all pages: " + raw);
            return null;
        }
    }
}
