package org.imixs.archive.documents;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import javax.ejb.Stateless;
import javax.inject.Inject;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.eclipse.microprofile.config.inject.ConfigProperty;
import org.imixs.workflow.FileData;
import org.imixs.workflow.ItemCollection;
import org.imixs.workflow.exceptions.PluginException;

@Stateless
/* loaded from: input_file:org/imixs/archive/documents/TikaService.class */
public class TikaService {
    /** Name of the file attribute that receives the extracted text. */
    public static final String FILE_ATTRIBUTE_TEXT = "text";

    /** Character set used when talking to the Tika server. */
    public static final String DEFAULT_ENCODING = "UTF-8";

    /** Error code used for the {@code PluginException}s raised by this service. */
    public static final String PLUGIN_ERROR = "PLUGIN_ERROR";

    /** Configuration key holding the URL of the Tika server endpoint. */
    public static final String ENV_OCR_SERVICE_ENDPOINT = "ocr.service.endpoint";

    // NOTE(review): not referenced anywhere in this class - presumably consumed by callers; confirm.
    public static final String ENV_OCR_SERVICE_MODE = "ocr.service.mode";

    /** Configuration key holding the maximum file size (in bytes) accepted for processing. */
    public static final String ENV_OCR_SERVICE_MAXFILESIZE = "ocr.service.maxfilesize";

    /** Configuration key holding the default OCR strategy. */
    public static final String ENV_OCR_STRATEGY = "ocr.strategy";

    // Supported values of the OCR strategy; the value is forwarded to the
    // Tika server in the 'X-Tika-PDFOcrStrategy' option.
    public static final String OCR_STRATEGY_NO_OCR = "NO_OCR";
    public static final String OCR_STRATEGY_OCR_AND_TEXT_EXTRACTION = "OCR_AND_TEXT_EXTRACTION";
    public static final String OCR_STRATEGY_OCR_ONLY = "OCR_ONLY";
    public static final String OCR_STRATEGY_AUTO = "AUTO";

    private static final Logger logger = Logger.getLogger(TikaService.class.getName());

    /** Tika server endpoint; OCR processing is skipped when it is absent or empty. */
    @Inject
    @ConfigProperty(name = ENV_OCR_SERVICE_ENDPOINT)
    Optional<String> serviceEndpoint;

    /** Configured OCR strategy; defaults to {@link #OCR_STRATEGY_AUTO}. */
    @Inject
    @ConfigProperty(name = ENV_OCR_STRATEGY, defaultValue = OCR_STRATEGY_AUTO)
    String ocrStategy;

    /** Maximum content length in bytes of a file to be processed; defaults to 5242880 (5 MiB). */
    @Inject
    @ConfigProperty(name = ENV_OCR_SERVICE_MAXFILESIZE, defaultValue = "5242880")
    int ocrMaxFileSize;

    /**
     * Extracts the text of all files attached to the given workitem using the
     * configured OCR strategy, without additional Tika options, without a
     * file name filter and without a PDF page limit.
     *
     * @param itemCollection  the workitem whose attached files are processed
     * @param itemCollection2 an optional second item collection that is consulted
     *                        for the file content when the workitem's own file
     *                        data carries none (may be {@code null})
     * @throws PluginException if a document cannot be scanned
     */
    public void extractText(ItemCollection itemCollection, ItemCollection itemCollection2) throws PluginException {
        final String configuredStrategy = this.ocrStategy;
        final List<String> noOptions = null;
        final String noFilePattern = null;
        final int noPageLimit = 0;
        extractText(itemCollection, itemCollection2, configuredStrategy, noOptions, noFilePattern, noPageLimit);
    }

    /**
     * Extracts the text of the files attached to the given workitem and stores
     * the result in the file attribute {@link #FILE_ATTRIBUTE_TEXT} of each
     * processed file.
     * <p>
     * A file is skipped when its name does not match the optional file pattern,
     * when it already carries a text attribute, when no content can be found
     * for it, or when its content exceeds the configured maximum file size.
     * If the Tika server yields no text, an empty string is stored.
     *
     * @param itemCollection  the workitem whose attached files are processed
     * @param itemCollection2 an optional second item collection that is consulted
     *                        for the file content when the workitem's own file
     *                        data carries none (may be {@code null})
     * @param str             OCR strategy for this call (AUTO, NO_OCR, OCR_ONLY or
     *                        OCR_AND_TEXT_EXTRACTION); {@code null} or empty
     *                        selects the configured default strategy
     * @param list            optional Tika options in the form
     *                        {@code X-Tika...=value}; the list itself is never
     *                        modified (may be {@code null})
     * @param str2            optional regular expression; only files whose name
     *                        contains a match are processed (may be {@code null})
     * @param i               maximum number of PDF pages to scan; values
     *                        {@code <= 0} disable the limit
     * @throws PluginException if the strategy is invalid or a document cannot be
     *                         scanned
     */
    public void extractText(ItemCollection itemCollection, ItemCollection itemCollection2, String str, List<String> list, String str2, int i) throws PluginException {
        boolean debug = logger.isLoggable(Level.FINE);

        // Resolve the strategy per call. The injected default must not be
        // overwritten: this bean is pooled and an override would otherwise
        // leak into unrelated later invocations.
        String strategy = (str != null && !str.isEmpty()) ? str : this.ocrStategy;
        boolean validStrategy = OCR_STRATEGY_AUTO.equals(strategy)
                || OCR_STRATEGY_NO_OCR.equals(strategy)
                || OCR_STRATEGY_OCR_ONLY.equals(strategy)
                || OCR_STRATEGY_OCR_AND_TEXT_EXTRACTION.equals(strategy);
        if (!validStrategy) {
            throw new PluginException(TikaService.class.getSimpleName(), PLUGIN_ERROR, "Invalid TIKA_OCR_MODE - expected one of the following options: AUTO | NO_OCR | OCR_ONLY | OCR_AND_TEXT_EXTRACTION");
        }

        // Work on a private copy so the caller's list (which may be immutable
        // or reused for another call) is never modified.
        List<String> options = (list == null) ? new ArrayList<>() : new ArrayList<>(list);
        final String strategyPrefix = "X-Tika-PDFOcrStrategy=";
        boolean strategyOptionGiven = options.stream()
                .anyMatch(option -> option.regionMatches(true, 0, strategyPrefix, 0, strategyPrefix.length()));
        if (!strategyOptionGiven) {
            options.add(strategyPrefix + strategy);
        }
        if (debug) {
            for (String option : options) {
                logger.info("......  Tika Option = " + option);
            }
        }

        Pattern filePattern = (str2 != null && !str2.isEmpty()) ? Pattern.compile(str2) : null;
        long start = System.currentTimeMillis();
        for (FileData fileData : itemCollection.getFileData()) {
            if (filePattern != null && !filePattern.matcher(fileData.getName()).find()) {
                continue;
            }
            if (hasOCRContent(fileData)) {
                // text was already extracted earlier
                continue;
            }
            FileData origin = fetchOriginFileData(fileData, itemCollection2);
            if (origin == null) {
                continue;
            }
            if (debug) {
                logger.fine("...text extraction '" + origin.getName() + "'...");
            }
            byte[] content = origin.getContent();
            if (content != null && content.length > this.ocrMaxFileSize) {
                logger.warning("The file size '" + fileData.getName() + "' excided the allowed max size of " + this.ocrMaxFileSize + " bytes (file size=" + content.length + ")");
                continue;
            }

            String text;
            try {
                text = doORCProcessing(origin, options, i);
            } catch (IOException e) {
                throw new PluginException(TikaService.class.getSimpleName(), PLUGIN_ERROR, "Unable to scan attached document '" + fileData.getName() + "'", e);
            }
            if (text == null) {
                logger.warning("Unable to extract text-content for '" + fileData.getName() + "'");
                text = "";
            }
            // the text is always stored on the workitem's file data, even if
            // the content was taken from the second item collection
            List<Object> textValues = new ArrayList<>();
            textValues.add(text);
            fileData.setAttribute(FILE_ATTRIBUTE_TEXT, textValues);
        }
        if (debug) {
            logger.fine("...extracted textual information in " + (System.currentTimeMillis() - start) + "ms");
        }
    }

    /**
     * Sends the content of a file to the Tika server endpoint via HTTP PUT and
     * returns the plain text contained in the response.
     * <p>
     * If a page limit is given and the file is a PDF with more pages, only the
     * first {@code i} pages are sent. The given file data itself is left
     * untouched; the shortened document is only used for the request.
     *
     * @param fileData the file to be scanned
     * @param list     optional request options in the form {@code key=value};
     *                 only keys starting with {@code X-Tika} are sent as request
     *                 headers, all other entries are reported and ignored (may be
     *                 {@code null})
     * @param i        maximum number of PDF pages to send; values {@code <= 0}
     *                 disable the limit
     * @return the text returned by the Tika server, or {@code null} if no
     *         endpoint is configured, the content type is not accepted, the file
     *         has no content or the server answered with a non 2xx status
     * @throws IOException if the PDF cannot be processed or the communication
     *                     with the Tika server fails
     */
    public String doORCProcessing(FileData fileData, List<String> list, int i) throws IOException {
        boolean debug = logger.isLoggable(Level.FINE);
        if (this.serviceEndpoint == null || !this.serviceEndpoint.isPresent() || this.serviceEndpoint.get().isEmpty()) {
            logger.severe("OCR_SERVICE_ENDPOINT is missing - OCR processing not supported without a valid tika server endpoint!");
            return null;
        }
        if (debug) {
            logger.fine("...ocr scanning....");
        }

        String contentType = adaptContentType(fileData);
        if (!acceptContentType(contentType)) {
            if (debug) {
                logger.fine("contentType '" + contentType + " is not supported by Tika Server");
            }
            return null;
        }

        byte[] payload = fileData.getContent();
        if (payload == null) {
            logger.warning("no content found for fileData '" + fileData.getName() + "'!");
            return null;
        }

        if (i > 0 && "application/pdf".equals(contentType)) {
            // try-with-resources ensures the document is released even when
            // no page has to be removed or saving fails
            try (PDDocument pdf = PDDocument.load(payload)) {
                if (pdf.getNumberOfPages() > i) {
                    logger.info("......pdf document '" + fileData.getName() + "' has to many pages (max allowed=" + i + ")");
                    while (pdf.getNumberOfPages() > i) {
                        logger.info("......removing page " + pdf.getNumberOfPages());
                        pdf.removePage(pdf.getNumberOfPages() - 1);
                    }
                    ByteArrayOutputStream shortened = new ByteArrayOutputStream();
                    pdf.save(shortened);
                    // only the request payload is shortened - the caller's
                    // file data keeps the complete document
                    payload = shortened.toByteArray();
                }
            }
        }

        HttpURLConnection connection = (HttpURLConnection) new URL(this.serviceEndpoint.get()).openConnection();
        connection.setRequestMethod("PUT");
        connection.setDoOutput(true);
        connection.setDoInput(true);
        connection.setAllowUserInteraction(false);
        connection.setRequestProperty("Content-Type", contentType + "; charset=UTF-8");
        connection.setRequestProperty("Accept", "text/plain");
        if (list != null) {
            for (String option : list) {
                int separator = option.indexOf("=");
                if (separator < 0) {
                    logger.warning("Invalid tika option : '" + option + "'  character '=' expeced!");
                    continue;
                }
                String key = option.substring(0, separator);
                if (key.startsWith("X-Tika")) {
                    connection.setRequestProperty(key, option.substring(separator + 1));
                } else {
                    logger.warning("Invalid tika option : '" + option + "'  key must start with 'X-Tika'");
                }
            }
        }
        connection.setRequestProperty("Content-Length", String.valueOf(payload.length));

        try (OutputStream out = connection.getOutputStream()) {
            out.write(payload);
            out.flush();
        }

        int responseCode = connection.getResponseCode();
        if (responseCode < 200 || responseCode > 299) {
            logger.warning("Tika server returned HTTP status " + responseCode + " for '" + fileData.getName() + "'");
            return null;
        }
        return readResponse(connection, DEFAULT_ENCODING);
    }

    /**
     * Tells whether the given file data already carries extracted text, i.e.
     * whether its {@code text} attribute is a non-empty list whose first entry
     * is not {@code null}.
     *
     * @param fileData the file data to inspect (may be {@code null})
     * @return {@code true} if a text value is present, {@code false} otherwise
     */
    private boolean hasOCRContent(FileData fileData) {
        if (fileData == null) {
            return false;
        }
        List<?> textValues = (List<?>) fileData.getAttribute(FILE_ATTRIBUTE_TEXT);
        if (textValues == null || textValues.isEmpty()) {
            return false;
        }
        return textValues.get(0) != null;
    }

    /**
     * Locates the file data object that actually holds the content of a file.
     * <p>
     * The given file data wins if it carries more than one byte of content.
     * Otherwise the file with the same name is looked up in the given item
     * collection and returned if it carries more than one byte. If neither
     * has content, a warning is logged.
     *
     * @param fileData       the file data attached to the workitem
     * @param itemCollection item collection used as fallback source (may be
     *                       {@code null})
     * @return the file data holding the content, or {@code null} if none was
     *         found
     */
    private FileData fetchOriginFileData(FileData fileData, ItemCollection itemCollection) {
        byte[] ownBytes = fileData.getContent();
        boolean ownContentUsable = ownBytes != null && ownBytes.length > 1;
        if (ownContentUsable) {
            return fileData;
        }

        if (itemCollection != null) {
            FileData candidate = itemCollection.getFileData(fileData.getName());
            if (candidate != null) {
                byte[] candidateBytes = candidate.getContent();
                if (candidateBytes != null && candidateBytes.length > 1) {
                    return candidate;
                }
            }
        }

        logger.warning("no content found for fileData '" + fileData.getName() + "'!");
        return null;
    }

    /**
     * Reads the response body of the given connection line by line and returns
     * it as a string in which every line is terminated by {@code '\n'}.
     * <p>
     * The character set is taken from the {@code charset} parameter of the
     * response's Content-Type header. The Content-Encoding header is not used
     * for this purpose because it names a content coding such as gzip, not a
     * character set. If the response declares no charset, the given default is
     * used; if that is missing as well, the platform default applies.
     * <p>
     * Reading is best effort: if an I/O error occurs, it is logged and the
     * text read so far is returned.
     *
     * @param uRLConnection the connection to read from
     * @param str           default character set name (may be {@code null} or
     *                      empty)
     * @return the response text, possibly empty or incomplete after an I/O error
     * @throws IOException declared for compatibility; read errors are logged
     *                     and not propagated
     */
    private String readResponse(URLConnection uRLConnection, String str) throws IOException {
        boolean debug = logger.isLoggable(Level.FINE);
        if (debug) {
            logger.finest("......readResponse....");
        }

        // look for "...; charset=xyz" in the Content-Type header
        String encoding = null;
        String contentType = uRLConnection.getContentType();
        if (contentType != null) {
            final String charsetKey = "charset=";
            for (String parameter : contentType.split(";")) {
                String candidate = parameter.trim();
                if (candidate.regionMatches(true, 0, charsetKey, 0, charsetKey.length())) {
                    encoding = candidate.substring(charsetKey.length()).replace("\"", "").trim();
                    break;
                }
            }
        }
        if ((encoding == null || encoding.isEmpty()) && str != null && !str.isEmpty()) {
            encoding = str;
        }

        StringBuilder response = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                (encoding == null || encoding.isEmpty())
                        ? new InputStreamReader(uRLConnection.getInputStream())
                        : new InputStreamReader(uRLConnection.getInputStream(), encoding))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (debug) {
                    logger.finest("......" + line);
                }
                response.append(line).append("\n");
            }
        } catch (IOException e) {
            // best effort - keep what has been read so far
            logger.log(Level.WARNING, "Failed to read response from tika server: " + e.getMessage(), e);
        }
        return response.toString();
    }

    /**
     * Decides whether a content type is sent to the Tika server. Missing or
     * empty content types and {@code application/octet-stream} (compared
     * case-insensitively) are rejected.
     *
     * @param str the content type to check (may be {@code null})
     * @return {@code true} if the content type is accepted
     */
    private boolean acceptContentType(String str) {
        if (str == null || str.isEmpty()) {
            return false;
        }
        return !"application/octet-stream".equalsIgnoreCase(str);
    }

    /**
     * Determines the content type to be used for a file. A declared content
     * type is returned as is unless it is missing, empty or the wildcard
     * type; in that case the type is derived from the file name:
     * {@code application/pdf} for names ending with ".pdf" (case-insensitive),
     * {@code application/xml} for everything else.
     *
     * @param fileData the file whose content type is requested
     * @return the declared or derived content type
     */
    private String adaptContentType(FileData fileData) {
        String declaredType = fileData.getContentType();
        boolean declaredTypeUsable = declaredType != null
                && !declaredType.isEmpty()
                && !"*/*".equals(declaredType);
        if (declaredTypeUsable) {
            return declaredType;
        }
        if (fileData.getName().toLowerCase().endsWith(".pdf")) {
            return "application/pdf";
        }
        return "application/xml";
    }
}
