package nlp4j.crawler;

import com.google.gson.JsonArray;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.lang.invoke.MethodHandles;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import nlp4j.Document;
import nlp4j.impl.DefaultDocument;
import nlp4j.util.UnicodeUtils;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

/* loaded from: input_file:nlp4j/crawler/CsvFileCrawler.class */
/**
 * Crawler that turns every data row of one or more CSV files into a {@link Document}.
 *
 * <p>The first record of each CSV input is treated as the header row. Each cell of a data row
 * is stored as a document attribute keyed by its (trimmed) column header; columns whose header
 * is blank, or that lie beyond the header row, are keyed {@code "header" + columnIndex}.
 *
 * <p>Recognised properties (in addition to those handled by the superclass):
 * <ul>
 *   <li>{@code add_header} - when {@code "true"}, every document also gets a {@code "header"}
 *       attribute holding the header names as a JSON array.</li>
 *   <li>{@code add_data} - when {@code "true"}, every document also gets a {@code "data"}
 *       attribute holding the raw cell values of its row as a JSON array.</li>
 * </ul>
 */
public class CsvFileCrawler extends AbstractFileCrawler implements Crawler {
    private static final Logger logger = LogManager.getLogger(MethodHandles.lookup().lookupClass());

    /** Whether to attach the header names to every document (property {@code add_header}). */
    private boolean addHeader = false;

    /** Whether to attach the raw row values to every document (property {@code add_data}). */
    private boolean addData = false;

    /** Creates a crawler whose {@code target} property defaults to {@code "text"}. */
    public CsvFileCrawler() {
        this.prop.setProperty("target", "text");
    }

    /**
     * Parses every configured file and returns the documents of all of them, in file order.
     *
     * <p>A file that cannot be opened or read is logged at error level and skipped; the
     * remaining files are still processed.
     *
     * @return the parsed documents; empty when the {@code target} property is unset
     */
    @Override // nlp4j.crawler.AbstractCrawler, nlp4j.crawler.Crawler
    public List<Document> crawlDocuments() {
        List<Document> documents = new ArrayList<>();
        if (this.prop.getProperty("target") == null) {
            logger.warn("target is not set.");
            return documents;
        }
        for (File file : this.files) {
            // try-with-resources closes the stream on both the normal and the failure path.
            try (InputStream in = new FileInputStream(file)) {
                documents.addAll(parseDocuments(in));
            } catch (IOException e) {
                // Also covers FileNotFoundException, which is a subclass of IOException.
                logger.error(e.getMessage(), e);
            }
        }
        return documents;
    }

    /**
     * Parses CSV content from the given stream.
     *
     * <p>The stream is not closed by this method; that remains the caller's responsibility.
     *
     * @param inputStream CSV content, decoded with this crawler's configured encoding
     * @return one document per data row, in row order
     * @throws IOException if the CSV content cannot be read
     */
    public List<Document> crawlDocuments(InputStream inputStream) throws IOException {
        return parseDocuments(inputStream);
    }

    /**
     * Reads all records from the stream and converts each one into a document.
     *
     * @param inputStream CSV content whose first record is the header row
     * @return one document per data row
     * @throws IOException if the CSV content cannot be read
     */
    private List<Document> parseDocuments(InputStream inputStream) throws IOException {
        // The parser is intentionally not closed here: closing it would close the underlying
        // stream, which is owned by the caller.
        CSVParser parser = CSVParser.parse(
                inputStream, Charset.forName(this.encoding), CSVFormat.EXCEL.withFirstRecordAsHeader());

        String[] headers = normalizeHeaders(parser.getHeaderMap().keySet().toArray(new String[0]));
        // Attribute names depend only on the header row, so compute them once per file
        // instead of once per record.
        String[] attributeNames = toAttributeNames(headers);

        List<Document> documents = new ArrayList<>();
        for (CSVRecord record : parser.getRecords()) {
            DefaultDocument document = new DefaultDocument();
            if (this.addData) {
                document.putAttribute("data", toDataArray(record));
            }
            if (this.addHeader) {
                document.putAttribute("header", toHeaderArray(headers));
            }
            for (int column = 0; column < record.size(); column++) {
                // Rows may be wider than the header row; such cells get a synthetic name.
                String name = column < attributeNames.length ? attributeNames[column] : "header" + column;
                document.putAttribute(name, record.get(column));
            }
            documents.add(document);
        }
        return documents;
    }

    /**
     * Cleans up the header names in place: passes the first name through
     * {@code UnicodeUtils.removeBOM} when it starts with {@code UnicodeUtils.BOM},
     * then trims every name.
     */
    private static String[] normalizeHeaders(String[] headers) {
        if (headers.length > 0) {
            if (headers[0].startsWith(UnicodeUtils.BOM)) {
                headers[0] = UnicodeUtils.removeBOM(headers[0]);
                logger.info("removed BOM");
            }
            for (int i = 0; i < headers.length; i++) {
                headers[i] = headers[i].trim();
            }
        }
        return headers;
    }

    /** Derives the attribute name for each header column, substituting {@code "header" + i} for blank names. */
    private static String[] toAttributeNames(String[] headers) {
        String[] names = new String[headers.length];
        for (int i = 0; i < headers.length; i++) {
            String name = headers[i].trim();
            if (i == 0) {
                name = UnicodeUtils.removeBOM(name);
            }
            if (name.trim().isEmpty()) {
                name = "header" + i;
            }
            names[i] = name;
        }
        return names;
    }

    /** Copies the raw cell values of a record into a JSON array; the first cell goes through {@code removeBOM}. */
    private static JsonArray toDataArray(CSVRecord record) {
        JsonArray data = new JsonArray();
        for (int column = 0; column < record.size(); column++) {
            String value = record.get(column);
            if (column == 0) {
                value = UnicodeUtils.removeBOM(value);
            }
            data.add(value);
        }
        return data;
    }

    /** Copies the header names into a fresh JSON array (one array instance per document). */
    private static JsonArray toHeaderArray(String[] headers) {
        JsonArray array = new JsonArray();
        for (String header : headers) {
            array.add(header);
        }
        return array;
    }

    /**
     * Stores the property via the superclass and additionally interprets the
     * {@code add_header} and {@code add_data} keys as boolean switches.
     *
     * @param str property key
     * @param str2 property value; for the two switches anything other than {@code "true"}
     *     (case-insensitive) counts as {@code false}
     */
    @Override // nlp4j.crawler.AbstractFileCrawler, nlp4j.crawler.AbstractCrawler, nlp4j.crawler.Crawler
    public void setProperty(String str, String str2) {
        super.setProperty(str, str2);
        if ("add_header".equals(str)) {
            this.addHeader = Boolean.parseBoolean(str2);
        } else if ("add_data".equals(str)) {
            this.addData = Boolean.parseBoolean(str2);
        }
    }
}
