package org.elasticsearch.xpack.core.ml.inference.preprocessing.customwordembedding;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.regex.Pattern;
import org.elasticsearch.xpack.core.common.notifications.AbstractAuditor;

/* loaded from: input_file:org/elasticsearch/xpack/core/ml/inference/preprocessing/customwordembedding/FeatureUtils.class */
/**
 * Static helpers for normalising text: byte-bounded truncation that never leaves a
 * partial UTF-8 sequence, and a clean/lower-case pass that keeps only letters, marks
 * and single spaces.
 *
 * <p>The class is stateless and cannot be instantiated.
 */
public final class FeatureUtils {
    /**
     * Matches every character that {@link #cleanAndLowerText(String)} blanks out.
     *
     * <p>Three alternatives:
     * <ul>
     *   <li>{@code [^\p{L}|\p{M}|\s]} - anything that is not a letter, a mark, a {@code \s}
     *       whitespace character or a literal pipe (inside a character class {@code |} is an
     *       ordinary member, not alternation);</li>
     *   <li>{@code \|} - the pipe itself, which the first alternative therefore lets through;</li>
     *   <li>{@code \p{InSpacing_Modifier_Letters}} - the whole Spacing Modifier Letters block,
     *       regardless of the general category of its characters.</li>
     * </ul>
     */
    private static final Pattern NOT_UNICODE_OR_IS_SPECIAL = Pattern.compile("[^\\p{L}|\\p{M}|\\s]|\\||\\p{InSpacing_Modifier_Letters}");

    /** Matches a run of one or more characters with the Unicode White_Space property. */
    private static final Pattern ONE_OR_MORE_WHITESPACE = Pattern.compile("\\p{IsWhite_Space}+");

    /** Matches U+0130 (LATIN CAPITAL LETTER I WITH DOT ABOVE). */
    private static final Pattern TURKISH_I = Pattern.compile("\\u0130");

    private FeatureUtils() {
    }

    /**
     * Truncates {@code str} so that its UTF-8 encoding is at most {@code i} bytes long,
     * without ever keeping a partially cut multi-byte sequence.
     *
     * @param str the text to truncate; may be {@code null}
     * @param i   the maximum number of UTF-8 bytes to keep
     * @return {@code null} if {@code str} is {@code null}; {@code str} itself if its UTF-8
     *         encoding already fits in {@code i} bytes; otherwise the text decoded from the
     *         first {@code i} bytes, with any malformed (cut off) bytes dropped
     */
    public static String truncateToNumValidBytes(String str, int i) {
        if (str == null) {
            return null;
        }
        byte[] utf8 = str.getBytes(StandardCharsets.UTF_8);
        if (utf8.length <= i) {
            return str;
        }
        // Decode only the first i bytes. A multi-byte sequence cut by the limit shows up as
        // malformed input, which the decoder is told to skip rather than report or replace.
        CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder();
        ByteBuffer head = ByteBuffer.wrap(utf8, 0, i);
        // Each decoded char consumes at least one byte, so i chars is always enough room.
        CharBuffer decoded = CharBuffer.allocate(i);
        decoder.onMalformedInput(CodingErrorAction.IGNORE);
        decoder.decode(head, decoded, true);
        decoder.flush(decoded);
        return new String(decoded.array(), 0, decoded.position());
    }

    /**
     * Normalises {@code str} to lower-case letters and marks separated by single spaces,
     * with exactly one leading and one trailing space.
     *
     * <p>Steps, in order:
     * <ol>
     *   <li>replace every character matched by {@code NOT_UNICODE_OR_IS_SPECIAL} with a space;</li>
     *   <li>add a leading and a trailing space unless {@code str} already starts or ends with one;</li>
     *   <li>collapse each run of whitespace into a single space;</li>
     *   <li>replace U+0130 with a plain {@code I};</li>
     *   <li>lower-case using {@link Locale#ROOT}.</li>
     * </ol>
     *
     * @param str the text to normalise; must not be {@code null}
     * @return the normalised text
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public static String cleanAndLowerText(String str) {
        // NOTE(review): the "no padding" case used to be written as
        // AbstractAuditor.All_RESOURCES_ID, presumably the empty string - confirm. Padding is
        // now expressed directly so this helper no longer depends on the auditor class.
        StringBuilder padded = new StringBuilder(str.length() + 2);
        if (str.startsWith(" ") == false) {
            padded.append(' ');
        }
        padded.append(NOT_UNICODE_OR_IS_SPECIAL.matcher(str).replaceAll(" "));
        if (str.endsWith(" ") == false) {
            padded.append(' ');
        }

        String singleSpaced = ONE_OR_MORE_WHITESPACE.matcher(padded).replaceAll(" ");
        String withoutDottedI = TURKISH_I.matcher(singleSpaced).replaceAll("I");
        return withoutDottedI.toLowerCase(Locale.ROOT);
    }
}
