package org.lumongo.example.commoncrawl;

import com.mongodb.BasicDBObject;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Scanner;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.log4j.Logger;
import org.jets3t.service.S3ServiceException;
import org.jets3t.service.ServiceException;
import org.jets3t.service.impl.rest.httpclient.RestS3Service;
import org.jets3t.service.model.S3Object;
import org.jets3t.service.security.AWSCredentials;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.jwat.arc.ArcReader;
import org.jwat.arc.ArcReaderFactory;
import org.jwat.arc.ArcRecord;
import org.lumongo.client.command.CreateOrUpdateIndex;
import org.lumongo.client.command.Store;
import org.lumongo.client.config.IndexConfig;
import org.lumongo.client.config.LumongoPoolConfig;
import org.lumongo.client.pool.LumongoWorkPool;
import org.lumongo.cluster.message.Lumongo;
import org.lumongo.doc.ResultDocBuilder;
import org.lumongo.fields.FieldConfigBuilder;
import org.lumongo.util.LogUtil;
import org.lumongo.util.properties.PropertiesReader;

/* loaded from: input_file:org/lumongo/example/commoncrawl/IndexCommonCrawl.class */
/**
 * Command-line example that reads Common Crawl ARC files from S3 and stores every
 * {@code text/html} record into a Lumongo index.
 *
 * <p>The S3 object listing is fetched once up front; each listed object is then processed by a
 * task on a fixed-size thread pool. All tasks share a single {@link LumongoWorkPool}, which is
 * shut down only after every task has finished.
 */
public class IndexCommonCrawl {
    private static final String UID = "uid";
    private static final String URL = "url";
    private static final String CONTENTS = "contents";
    private static final String TEXT_CONTENTS = "textContents";
    private static final String TITLE = "title";

    /** S3 bucket the ARC files are read from. */
    private static final String BUCKET = "aws-publicdatasets";
    /** Key prefix prepended to the user-supplied prefix when listing objects. */
    private static final String KEY_PREFIX = "common-crawl/crawl-002/";
    /** Content type of the records that get indexed; everything else is skipped. */
    private static final String HTML_CONTENT_TYPE = "text/html";
    /** Number of concurrent file-processing tasks. */
    private static final int WORKER_THREADS = 16;
    /** Second argument passed to CreateOrUpdateIndex (presumably the segment count - confirm against the Lumongo API). */
    private static final int INDEX_SEGMENTS = 16;
    /** Buffer size used both for the ARC reader and for copying record payloads. */
    private static final int BUFFER_SIZE = 16384;
    /** A progress line is logged every time this many records have been indexed. */
    private static final long PROGRESS_INTERVAL = 5000L;

    private static final Logger log = Logger.getLogger(IndexCommonCrawl.class);
    private static final AtomicLong count = new AtomicLong();
    private static LumongoWorkPool lumongoWorkPool;

    /**
     * Entry point.
     *
     * @param args exactly four arguments: the AWS properties file (must define
     *             {@code awsAccessKey} and {@code awsSecretKey}), the key prefix to list,
     *             a comma-separated list of Lumongo servers, and the index name
     * @throws Exception if listing the bucket, creating the index, or waiting for the
     *                   workers fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 4) {
            System.err.println("usage: awsPropertiesFile prefix lumongoServers indexName");
            System.err.println("usage: aws.properties 2010/09/25/9 10.0.0.1,10.0.0.2 ccrawl");
            System.exit(1);
        }
        LogUtil.loadLogConfig();

        String awsPropertiesFile = args[0];
        String prefix = args[1];
        String[] servers = args[2].split(",");
        final String indexName = args[3];

        LumongoPoolConfig poolConfig = new LumongoPoolConfig();
        for (String server : servers) {
            poolConfig.addMember(server);
        }

        PropertiesReader properties = new PropertiesReader(new File(awsPropertiesFile));
        final AWSCredentials credentials = new AWSCredentials(properties.getString("awsAccessKey"), properties.getString("awsSecretKey"));
        RestS3Service s3Service = new RestS3Service(credentials);
        s3Service.setRequesterPaysEnabled(true);

        System.out.println("Fetching files list for prefix <" + prefix + ">");
        System.out.println("This can take awhile ...");
        S3Object[] objects = s3Service.listObjects(BUCKET, KEY_PREFIX + prefix, (String) null);
        System.out.println("Fetched info for <" + objects.length + "> files");

        lumongoWorkPool = new LumongoWorkPool(poolConfig);
        try {
            IndexConfig indexConfig = new IndexConfig(CONTENTS);
            indexConfig.addFieldConfig(FieldConfigBuilder.create(URL).indexAs(Lumongo.LMAnalyzer.LC_KEYWORD));
            indexConfig.addFieldConfig(FieldConfigBuilder.create(TEXT_CONTENTS).indexAs(Lumongo.LMAnalyzer.STANDARD));
            indexConfig.addFieldConfig(FieldConfigBuilder.create(TITLE).indexAs(Lumongo.LMAnalyzer.STANDARD));
            lumongoWorkPool.createOrUpdateIndex(new CreateOrUpdateIndex(indexName, INDEX_SEGMENTS, UID, indexConfig));

            ExecutorService executor = Executors.newFixedThreadPool(WORKER_THREADS);
            try {
                for (S3Object object : objects) {
                    final String key = object.getKey();
                    executor.execute(new Runnable() {
                        @Override
                        public void run() {
                            try {
                                handleFile(indexName, credentials, key);
                            } catch (Exception e) {
                                // One bad file must not stop the remaining files from being indexed.
                                log.error(e.getClass().getSimpleName() + ": ", e);
                            }
                        }
                    });
                }
                executor.shutdown();
                while (!executor.awaitTermination(1L, TimeUnit.MINUTES)) {
                    // Keep waiting until every queued file has been processed.
                }
            } finally {
                // No-op after a normal termination; stops the workers if we are leaving early.
                executor.shutdownNow();
            }
        } finally {
            // The workers store through this pool, so it may only be closed once they are done.
            lumongoWorkPool.shutdown();
        }
    }

    /**
     * Downloads one ARC file from S3 and indexes every HTML record it contains.
     *
     * <p>A failure on an individual record is logged as a warning and the record is skipped;
     * a failure opening or iterating the file itself is propagated to the caller.
     *
     * @param str            name of the Lumongo index to store into
     * @param aWSCredentials credentials used to open a requester-pays S3 connection
     * @param str2           S3 object key of the ARC file
     * @throws S3ServiceException if the S3 service reports an error
     * @throws IOException        if reading the ARC data fails
     * @throws ServiceException   if the S3 object or its data stream cannot be obtained
     */
    public static void handleFile(String str, AWSCredentials aWSCredentials, String str2) throws S3ServiceException, IOException, ServiceException {
        RestS3Service s3Service = new RestS3Service(aWSCredentials);
        s3Service.setRequesterPaysEnabled(true);

        InputStream dataStream = s3Service.getObject(BUCKET, str2).getDataInputStream();
        ArcReader arcReader = null;
        try {
            arcReader = ArcReaderFactory.getReader(dataStream, BUFFER_SIZE);
            log.info("Opened <" + str2 + ">");
            arcReader.getVersionBlock();

            ArcRecord record;
            while ((record = arcReader.getNextRecord()) != null) {
                try {
                    indexRecord(str, record);
                } catch (Exception e) {
                    // Best effort: a malformed record is reported and skipped.
                    log.warn(e.getClass().getSimpleName() + ": " + e);
                }
            }
        } finally {
            if (arcReader != null) {
                log.info("Closed <" + str2 + ">");
                arcReader.close();
            } else {
                // The reader was never created, so nothing else owns the S3 stream.
                dataStream.close();
            }
        }
    }

    /**
     * Stores a single ARC record in the index if it is an HTML record with a URL and a
     * non-empty payload; any other record is ignored.
     */
    private static void indexRecord(String indexName, ArcRecord record) throws Exception {
        if (record.getUrl() == null) {
            log.warn("Skipping record without a URL");
            return;
        }
        if (!HTML_CONTENT_TYPE.equals(record.getContentType())) {
            return;
        }
        if (record.getPayload() == null) {
            log.warn("Skipping record without a payload: " + record.getUrl());
            return;
        }

        String url = record.getUrl().toString();
        byte[] rawContents = getBytes(record.getPayload().getInputStream());

        String html;
        // NOTE(review): Scanner decodes with the platform default charset, as the original did;
        // pages in other encodings may be mis-decoded.
        try (Scanner scanner = new Scanner(new ByteArrayInputStream(rawContents))) {
            scanner.useDelimiter("\\A");
            if (!scanner.hasNext()) {
                log.warn("Skipping record with empty payload: " + url);
                return;
            }
            html = scanner.next();
        }

        Document parsed = Jsoup.parse(html);

        BasicDBObject document = new BasicDBObject();
        document.put(CONTENTS, rawContents);
        document.put(TEXT_CONTENTS, parsed.text());
        document.put(TITLE, extractTitle(parsed));
        document.put(URL, url);

        // The record URL doubles as the unique id of the stored document.
        Store store = new Store(url, indexName);
        store.setResultDocument(new ResultDocBuilder().setDocument(document));
        lumongoWorkPool.store(store);

        long indexed = count.getAndIncrement();
        if (indexed % PROGRESS_INTERVAL == 0) {
            log.info("Indexed <" + indexed + ">");
        }
    }

    /**
     * Returns the text of the first {@code <title>} element in the document head, or
     * {@code null} when there is none or it cannot be read.
     */
    private static String extractTitle(Document parsed) {
        try {
            Elements titles = parsed.head().getElementsByTag(TITLE);
            if (!titles.isEmpty()) {
                return titles.get(0).text();
            }
        } catch (Exception e) {
            // A missing or broken head must not prevent the page from being indexed.
            log.debug("Unable to extract title", e);
        }
        return null;
    }

    /**
     * Reads the stream to its end and returns everything read. The stream is always closed,
     * whether or not reading succeeds.
     *
     * @param inputStream stream to drain; must not be {@code null}
     * @return all bytes read from the stream (empty if the stream was already at its end)
     * @throws IOException          if reading or closing the stream fails
     * @throws NullPointerException if {@code inputStream} is {@code null}
     */
    protected static byte[] getBytes(InputStream inputStream) throws IOException {
        if (inputStream == null) {
            throw new NullPointerException("inputStream");
        }
        try {
            ByteArrayOutputStream collected = new ByteArrayOutputStream();
            byte[] buffer = new byte[BUFFER_SIZE];
            int read;
            while ((read = inputStream.read(buffer, 0, buffer.length)) != -1) {
                collected.write(buffer, 0, read);
            }
            return collected.toByteArray();
        } finally {
            inputStream.close();
        }
    }
}
