package com.litongjava.maxkb.service;

import com.jfinal.kit.Kv;
import com.jfinal.template.Engine;
import com.litongjava.db.TableInput;
import com.litongjava.db.activerecord.Db;
import com.litongjava.db.activerecord.Row;
import com.litongjava.maxkb.utils.ExecutorServiceUtils;
import com.litongjava.maxkb.vo.UploadResultVo;
import com.litongjava.model.result.ResultVo;
import com.litongjava.openai.chat.ChatResponseUsage;
import com.litongjava.openai.chat.Choice;
import com.litongjava.openai.chat.OpenAiChatResponseVo;
import com.litongjava.openai.client.OpenAiClient;
import com.litongjava.openai.token.OpenAiTokenizer;
import com.litongjava.table.services.ApiTable;
import com.litongjava.tio.utils.crypto.Md5Utils;
import com.litongjava.tio.utils.environment.EnvUtils;
import com.litongjava.tio.utils.hutool.FileUtil;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.splitter.DocumentSplitters;
import dev.langchain4j.data.segment.TextSegment;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.Future;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:com/litongjava/maxkb/service/DatasetDocumentSplitService.class */
/**
 * Turns an uploaded PDF into Markdown and splits that Markdown into text segments.
 *
 * <p>Pipeline, as implemented below:
 * <ol>
 *   <li>Look up the whole document (keyed by the MD5 of its bytes) in
 *       {@code max_kb_document_markdown_cache}.</li>
 *   <li>On a miss, render every PDF page to an image, cutting pages taller than
 *       {@link #MAX_HEIGHT} pixels into horizontal strips.</li>
 *   <li>Send each image to {@code OpenAiClient.chatWithImage} (results are cached per image in
 *       {@code max_kb_document_markdown_page_cache}) and concatenate the answers in page order.</li>
 *   <li>Persist the Markdown on disk and in the document cache table.</li>
 * </ol>
 */
public class DatasetDocumentSplitService {
    private static final Logger log = LoggerFactory.getLogger(DatasetDocumentSplitService.class);

    /** Maximum pixel height of a single image sent for conversion; taller pages are sliced. */
    private static final int MAX_HEIGHT = 2200;
    /** Resolution used when rasterizing PDF pages. */
    private static final float RENDER_DPI = 144.0f;
    /** Image format name handed to ImageIO and to the chat client. */
    private static final String IMAGE_FORMAT = "png";
    /** Arguments for {@code DocumentSplitters.recursive}: segment size and overlap, in tokens. */
    private static final int MAX_SEGMENT_TOKENS = 1000;
    private static final int MAX_OVERLAP_TOKENS = 400;
    /** Total number of chat calls attempted per image before the failure is propagated. */
    private static final int MAX_CHAT_ATTEMPTS = 3;

    private static final String DOCUMENT_CACHE_TABLE = "max_kb_document_markdown_cache";
    private static final String PAGE_CACHE_TABLE = "max_kb_document_markdown_page_cache";

    private static final String FENCE_OPEN = "```markdown";
    private static final String FENCE_CLOSE = "```";

    /**
     * Guards the final "check then insert" on the page cache.
     * NOTE(review): this is an instance field, so it only serializes writers that share the same
     * service instance - confirm the service is used as a singleton.
     */
    private final Object lock = new Object();

    /**
     * Converts the uploaded PDF bytes to Markdown and splits the result into segments.
     *
     * @param bArr           raw bytes of the uploaded PDF
     * @param uploadResultVo upload metadata; its filename and id are echoed in the result
     * @return an OK result wrapping a single-element list; the element carries {@code name},
     *         {@code id} and {@code content}, where {@code content} is a list of
     *         {@code {title: "", content: <segment text>}} entries
     * @throws IOException          if rendering, image encoding or file access fails
     * @throws InterruptedException if the thread is interrupted while waiting for page conversions
     * @throws ExecutionException   if a page conversion task fails
     */
    public ResultVo split(byte[] bArr, UploadResultVo uploadResultVo) throws IOException, InterruptedException, ExecutionException {
        String filename = uploadResultVo.getFilename();
        String markdown = toMarkdown(EnvUtils.getStr("OPENAI_API_KEY"), bArr, IMAGE_FORMAT);
        List<TextSegment> segments = DocumentSplitters
                .recursive(MAX_SEGMENT_TOKENS, MAX_OVERLAP_TOKENS, new OpenAiTokenizer())
                .split(new Document(markdown));

        List<Kv> paragraphs = new ArrayList<>(segments.size());
        for (TextSegment segment : segments) {
            paragraphs.add(Kv.by("title", "").set("content", segment.text()));
        }

        Kv documentKv = Kv.by("name", filename).set("id", uploadResultVo.getId());
        documentKv.set("content", paragraphs);

        List<Kv> documents = new ArrayList<>();
        documents.add(documentKv);
        return ResultVo.ok(documents);
    }

    /**
     * Returns the Markdown for a PDF, using the document-level cache when possible.
     *
     * @param apiKey      key forwarded to the chat client
     * @param pdfBytes    raw PDF bytes; their MD5 is the cache id
     * @param imageFormat ImageIO format name used for the rendered pages
     * @return the Markdown text of the whole document
     */
    private String toMarkdown(String apiKey, byte[] pdfBytes, String imageFormat) throws IOException, InterruptedException, ExecutionException {
        String documentId = Md5Utils.digestHex(pdfBytes);
        log.info("Processing document with MD5: {}", documentId);

        Row cachedRow = (Row) ApiTable.get(DOCUMENT_CACHE_TABLE, TableInput.create().columns("target,content").set("id", documentId)).getData();
        // Any existing row must be updated rather than inserted, even when both of its
        // columns are unusable; inserting would reuse an id that is already present.
        boolean rowExists = cachedRow != null;
        if (rowExists) {
            String cachedPath = cachedRow.getStr("target");
            if (cachedPath != null) {
                File cachedFile = new File(cachedPath);
                if (cachedFile.exists()) {
                    log.info("Markdown found in cache at {}", cachedPath);
                    return FileUtil.readString(cachedFile);
                }
            }
            String cachedContent = cachedRow.getStr("content");
            if (cachedContent != null) {
                log.info("Markdown content found in cache");
                return cachedContent;
            }
        }

        List<byte[]> pageImages = renderPages(pdfBytes, imageFormat);
        String markdown = convertPages(apiKey, pageImages, imageFormat);

        String targetPath = "markdowns/" + documentId + ".md";
        new File(targetPath).getParentFile().mkdirs();
        FileUtil.writeString(markdown, targetPath, "UTF-8");
        log.info("Markdown saved to {}", targetPath);

        // Store the text as a String; handing the builder itself to the DB layer relies on
        // the driver knowing how to bind a StringBuilder.
        Row cacheRow = Row.by("id", documentId).set("target", targetPath).set("content", markdown);
        if (rowExists) {
            Db.update(DOCUMENT_CACHE_TABLE, cacheRow);
            log.info("Cache updated for document MD5: {}", documentId);
        } else {
            Db.save(DOCUMENT_CACHE_TABLE, cacheRow);
            log.info("Cache saved for new document MD5: {}", documentId);
        }
        return markdown;
    }

    /**
     * Rasterizes every page of the PDF and returns the encoded images in page order.
     * The document is closed as soon as rendering finishes (or fails).
     */
    private List<byte[]> renderPages(byte[] pdfBytes, String imageFormat) throws IOException {
        List<byte[]> pageImages = new ArrayList<>();
        try (PDDocument document = PDDocument.load(new ByteArrayInputStream(pdfBytes))) {
            PDFRenderer renderer = new PDFRenderer(document);
            int pageCount = document.getNumberOfPages();
            for (int page = 0; page < pageCount; page++) {
                pageImages.addAll(toBytes(renderer.renderImageWithDPI(page, RENDER_DPI), imageFormat));
            }
        }
        return pageImages;
    }

    /**
     * Converts all images concurrently and joins the results in the order of {@code pageImages}.
     */
    private String convertPages(String apiKey, List<byte[]> pageImages, String imageFormat) throws InterruptedException, ExecutionException {
        ExecutorCompletionService<String> completionService = new ExecutorCompletionService<>(ExecutorServiceUtils.getExecutorService());
        List<Future<String>> futures = new ArrayList<>(pageImages.size());
        for (byte[] pageImage : pageImages) {
            futures.add(completionService.submit(() -> convertPdfPageToMarkdown(apiKey, pageImage, imageFormat)));
        }

        // All tasks are already running; reading the futures in submission order keeps the
        // page order without having to look up each completed future's index.
        StringBuilder markdown = new StringBuilder();
        for (Future<String> future : futures) {
            markdown.append(future.get());
        }
        return markdown.toString();
    }

    /**
     * Encodes an image, slicing it into strips of at most {@link #MAX_HEIGHT} pixels when needed.
     *
     * @param bufferedImage rendered page
     * @param str           ImageIO format name
     * @return one encoded image if the page fits, otherwise the strips from top to bottom
     * @throws IOException if encoding fails or no ImageIO writer supports the format
     */
    private List<byte[]> toBytes(BufferedImage bufferedImage, String str) throws IOException {
        List<byte[]> parts = new ArrayList<>();
        int height = bufferedImage.getHeight();
        if (height <= MAX_HEIGHT) {
            parts.add(encode(bufferedImage, str));
            return parts;
        }

        int partCount = (int) Math.ceil(height / (double) MAX_HEIGHT);
        for (int part = 0; part < partCount; part++) {
            int top = part * MAX_HEIGHT;
            int partHeight = Math.min(MAX_HEIGHT, height - top);
            BufferedImage strip = bufferedImage.getSubimage(0, top, bufferedImage.getWidth(), partHeight);
            parts.add(encode(strip, str));
            log.debug("Image split into part {}/{}", part + 1, partCount);
        }
        return parts;
    }

    /**
     * Encodes one image to bytes. {@code ImageIO.write} reports an unsupported format by
     * returning {@code false}; that is turned into an exception so an empty image is never
     * passed on.
     */
    private static byte[] encode(BufferedImage image, String format) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        if (!ImageIO.write(image, format, out)) {
            throw new IOException("No ImageIO writer available for format: " + format);
        }
        return out.toByteArray();
    }

    /**
     * Converts one page image to Markdown, consulting and filling the per-image cache.
     *
     * <p>The image is also written to {@code images/<md5>.<format>}. The cache is re-checked
     * after the chat call and once more under {@link #lock} so that a result stored meanwhile
     * by another task is returned instead of inserting the same id again.
     *
     * @param apiKey      key forwarded to the chat client
     * @param imageBytes  encoded image; its MD5 is the cache id
     * @param imageFormat image format name, used as file extension and passed to the chat client
     * @return the Markdown for this image
     */
    private String convertPdfPageToMarkdown(String apiKey, byte[] imageBytes, String imageFormat) throws IOException {
        String imageId = Md5Utils.digestHex(imageBytes);
        String selectSql = String.format("SELECT content FROM %s WHERE id=?", PAGE_CACHE_TABLE);

        String cached = Db.queryStr(selectSql, imageId);
        if (cached != null) {
            log.debug("Content found in page cache for ID: {}", imageId);
            return cached;
        }

        String imagePath = "images/" + imageId + "." + imageFormat;
        File imageFile = new File(imagePath);
        imageFile.getParentFile().mkdirs();
        FileUtil.writeBytes(imageBytes, imageFile);
        log.debug("Image saved to {}", imagePath);

        long startMillis = System.currentTimeMillis();
        String prompt = Engine.use().getTemplate("image_to_text_prompt.txt").renderToString();
        OpenAiChatResponseVo response = chatWithRetry(apiKey, prompt, imageBytes, imageFormat);

        String content = stripMarkdownFence(((Choice) response.getChoices().get(0)).getMessage().getContent());
        ChatResponseUsage usage = response.getUsage();
        TableInput pageRow = TableInput.by("id", imageId)
                .set("target", imagePath)
                .set("content", content)
                .set("elapsed", System.currentTimeMillis() - startMillis)
                .set("model", "gpt-4o")
                .set("system_fingerprint", response.getSystem_fingerprint())
                .set("completion_tokens", usage.getCompletion_tokens())
                .set("prompt_tokens", usage.getPrompt_tokens())
                .set("total_tokens", usage.getTotal_tokens());

        String storedMeanwhile = Db.queryStr(selectSql, imageId);
        if (storedMeanwhile != null) {
            log.debug("Content found in page cache during save for ID: {}", imageId);
            return storedMeanwhile;
        }
        synchronized (this.lock) {
            String storedUnderLock = Db.queryStr(selectSql, imageId);
            if (storedUnderLock != null) {
                return storedUnderLock;
            }
            ApiTable.save(PAGE_CACHE_TABLE, pageRow);
            log.debug("Content cached for page ID: {}", imageId);
            return content;
        }
    }

    /**
     * Calls the chat client up to {@link #MAX_CHAT_ATTEMPTS} times. Failures of all but the
     * last attempt are logged and retried; the last attempt's exception propagates unchanged.
     */
    private OpenAiChatResponseVo chatWithRetry(String apiKey, String prompt, byte[] imageBytes, String imageFormat) throws IOException {
        for (int attempt = 1; attempt < MAX_CHAT_ATTEMPTS; attempt++) {
            try {
                return OpenAiClient.chatWithImage(apiKey, prompt, imageBytes, imageFormat);
            } catch (Exception e) {
                log.warn("Image-to-markdown attempt {}/{} failed, retrying", attempt, MAX_CHAT_ATTEMPTS, e);
            }
        }
        return OpenAiClient.chatWithImage(apiKey, prompt, imageBytes, imageFormat);
    }

    /**
     * Removes a leading {@code ```markdown} marker and, if present, the matching closing
     * {@code ```} (ignoring whitespace after it). Content without the leading marker is
     * returned untouched; content without a closing fence only loses the leading marker.
     */
    private static String stripMarkdownFence(String content) {
        if (!content.startsWith(FENCE_OPEN)) {
            return content;
        }
        String body = content.substring(FENCE_OPEN.length());
        int end = body.length();
        while (end > 0 && Character.isWhitespace(body.charAt(end - 1))) {
            end--;
        }
        int fenceStart = end - FENCE_CLOSE.length();
        if (fenceStart >= 0 && body.startsWith(FENCE_CLOSE, fenceStart)) {
            return body.substring(0, fenceStart);
        }
        return body;
    }
}
