package org.apache.spark.examples.mllib;

import java.util.Locale;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.examples.mllib.LDAExample;
import org.apache.spark.mllib.clustering.DistributedLDAModel;
import org.apache.spark.mllib.clustering.EMLDAOptimizer;
import org.apache.spark.mllib.clustering.LDA;
import org.apache.spark.mllib.clustering.OnlineLDAOptimizer;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.rdd.RDD;
import org.apache.spark.rdd.RDD$;
import scala.Array$;
import scala.MatchError;
import scala.Predef$;
import scala.StringContext;
import scala.Tuple2;
import scala.Tuple3;
import scala.collection.Seq;
import scala.collection.immutable.Map;
import scala.collection.immutable.Nil$;
import scala.collection.mutable.StringBuilder;
import scala.math.Numeric$LongIsIntegral$;
import scala.math.Ordering$Long$;
import scala.math.Ordering$String$;
import scala.reflect.ClassTag$;
import scala.runtime.BoxesRunTime;
import scala.runtime.ScalaRunTime$;
import scopt.OptionParser;
import scopt.Read$;

/* compiled from: LDAExample.scala */
/* loaded from: input_file:org/apache/spark/examples/mllib/LDAExample$.class */
public final class LDAExample$ {
    public static final LDAExample$ MODULE$ = null;

    static {
        new LDAExample$();
    }

    public void main(String[] strArr) {
        final LDAExample.Params params = new LDAExample.Params(LDAExample$Params$.MODULE$.apply$default$1(), LDAExample$Params$.MODULE$.apply$default$2(), LDAExample$Params$.MODULE$.apply$default$3(), LDAExample$Params$.MODULE$.apply$default$4(), LDAExample$Params$.MODULE$.apply$default$5(), LDAExample$Params$.MODULE$.apply$default$6(), LDAExample$Params$.MODULE$.apply$default$7(), LDAExample$Params$.MODULE$.apply$default$8(), LDAExample$Params$.MODULE$.apply$default$9(), LDAExample$Params$.MODULE$.apply$default$10());
        OptionParser<LDAExample.Params> optionParser = new OptionParser<LDAExample.Params>(params) { // from class: org.apache.spark.examples.mllib.LDAExample$$anon$1
            {
                super("LDAExample");
                head(Predef$.MODULE$.wrapRefArray(new String[]{"LDAExample: an example LDA app for plain text data."}));
                opt("k", Read$.MODULE$.intRead()).text(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"number of topics. default: ", ""})).s(Predef$.MODULE$.genericWrapArray(new Object[]{BoxesRunTime.boxToInteger(params.k())}))).action(new LDAExample$$anon$1$$anonfun$3(this));
                opt("maxIterations", Read$.MODULE$.intRead()).text(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"number of iterations of learning. default: ", ""})).s(Predef$.MODULE$.genericWrapArray(new Object[]{BoxesRunTime.boxToInteger(params.maxIterations())}))).action(new LDAExample$$anon$1$$anonfun$4(this));
                opt("docConcentration", Read$.MODULE$.doubleRead()).text(new StringBuilder().append(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"amount of topic smoothing to use (> 1.0) (-1=auto)."})).s(Nil$.MODULE$)).append(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"  default: ", ""})).s(Predef$.MODULE$.genericWrapArray(new Object[]{BoxesRunTime.boxToDouble(params.docConcentration())}))).toString()).action(new LDAExample$$anon$1$$anonfun$5(this));
                opt("topicConcentration", Read$.MODULE$.doubleRead()).text(new StringBuilder().append(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"amount of term (word) smoothing to use (> 1.0) (-1=auto)."})).s(Nil$.MODULE$)).append(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"  default: ", ""})).s(Predef$.MODULE$.genericWrapArray(new Object[]{BoxesRunTime.boxToDouble(params.topicConcentration())}))).toString()).action(new LDAExample$$anon$1$$anonfun$6(this));
                opt("vocabSize", Read$.MODULE$.intRead()).text(new StringBuilder().append(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"number of distinct word types to use, chosen by frequency. (-1=all)"})).s(Nil$.MODULE$)).append(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"  default: ", ""})).s(Predef$.MODULE$.genericWrapArray(new Object[]{BoxesRunTime.boxToInteger(params.vocabSize())}))).toString()).action(new LDAExample$$anon$1$$anonfun$7(this));
                opt("stopwordFile", Read$.MODULE$.stringRead()).text(new StringBuilder().append(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"filepath for a list of stopwords. Note: This must fit on a single machine."})).s(Nil$.MODULE$)).append(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"  default: ", ""})).s(Predef$.MODULE$.genericWrapArray(new Object[]{params.stopwordFile()}))).toString()).action(new LDAExample$$anon$1$$anonfun$8(this));
                opt("algorithm", Read$.MODULE$.stringRead()).text(new StringBuilder().append(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"inference algorithm to use. em and online are supported."})).s(Nil$.MODULE$)).append(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{" default: ", ""})).s(Predef$.MODULE$.genericWrapArray(new Object[]{params.algorithm()}))).toString()).action(new LDAExample$$anon$1$$anonfun$9(this));
                opt("checkpointDir", Read$.MODULE$.stringRead()).text(new StringBuilder().append(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"Directory for checkpointing intermediate results."})).s(Nil$.MODULE$)).append(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"  Checkpointing helps with recovery and eliminates temporary shuffle files on disk."})).s(Nil$.MODULE$)).append(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"  default: ", ""})).s(Predef$.MODULE$.genericWrapArray(new Object[]{params.checkpointDir()}))).toString()).action(new LDAExample$$anon$1$$anonfun$10(this));
                opt("checkpointInterval", Read$.MODULE$.intRead()).text(new StringBuilder().append(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"Iterations between each checkpoint.  Only used if checkpointDir is set."})).s(Nil$.MODULE$)).append(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{" default: ", ""})).s(Predef$.MODULE$.genericWrapArray(new Object[]{BoxesRunTime.boxToInteger(params.checkpointInterval())}))).toString()).action(new LDAExample$$anon$1$$anonfun$11(this));
                arg("<input>...", Read$.MODULE$.stringRead()).text("input paths (directories) to plain text corpora.  Each text file line should hold 1 document.").unbounded().required().action(new LDAExample$$anon$1$$anonfun$12(this));
            }
        };
        optionParser.parse(Predef$.MODULE$.wrapRefArray(strArr), params).map(new LDAExample$$anonfun$main$1()).getOrElse(new LDAExample$$anonfun$main$2(optionParser));
    }

    public void org$apache$spark$examples$mllib$LDAExample$$run(LDAExample.Params params) {
        EMLDAOptimizer miniBatchFraction;
        SparkContext sparkContext = new SparkContext(new SparkConf().setAppName(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"LDAExample with ", ""})).s(Predef$.MODULE$.genericWrapArray(new Object[]{params}))));
        Logger.getRootLogger().setLevel(Level.WARN);
        long nanoTime = System.nanoTime();
        Tuple3<RDD<Tuple2<Object, Vector>>, String[], Object> preprocess = preprocess(sparkContext, params.input(), params.vocabSize(), params.stopwordFile());
        if (preprocess == null) {
            throw new MatchError(preprocess);
        }
        Tuple3 tuple3 = new Tuple3((RDD) preprocess._1(), (String[]) preprocess._2(), BoxesRunTime.boxToLong(BoxesRunTime.unboxToLong(preprocess._3())));
        RDD rdd = (RDD) tuple3._1();
        String[] strArr = (String[]) tuple3._2();
        long unboxToLong = BoxesRunTime.unboxToLong(tuple3._3());
        rdd.cache();
        long count = rdd.count();
        int size = Predef$.MODULE$.refArrayOps(strArr).size();
        Predef$.MODULE$.println();
        Predef$.MODULE$.println(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"Corpus summary:"})).s(Nil$.MODULE$));
        Predef$.MODULE$.println(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"\\t Training set size: ", " documents"})).s(Predef$.MODULE$.genericWrapArray(new Object[]{BoxesRunTime.boxToLong(count)})));
        Predef$.MODULE$.println(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"\\t Vocabulary size: ", " terms"})).s(Predef$.MODULE$.genericWrapArray(new Object[]{BoxesRunTime.boxToInteger(size)})));
        Predef$.MODULE$.println(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"\\t Training set size: ", " tokens"})).s(Predef$.MODULE$.genericWrapArray(new Object[]{BoxesRunTime.boxToLong(unboxToLong)})));
        Predef$.MODULE$.println(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"\\t Preprocessing time: ", " sec"})).s(Predef$.MODULE$.genericWrapArray(new Object[]{BoxesRunTime.boxToDouble((System.nanoTime() - nanoTime) / 1.0E9d)})));
        Predef$.MODULE$.println();
        LDA lda = new LDA();
        String lowerCase = params.algorithm().toLowerCase();
        if ("em" != 0 ? "em".equals(lowerCase) : lowerCase == null) {
            miniBatchFraction = new EMLDAOptimizer();
        } else {
            if ("online" != 0 ? !"online".equals(lowerCase) : lowerCase != null) {
                throw new IllegalArgumentException(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"Only em, online are supported but got ", "."})).s(Predef$.MODULE$.genericWrapArray(new Object[]{params.algorithm()})));
            }
            miniBatchFraction = new OnlineLDAOptimizer().setMiniBatchFraction(0.05d + (1.0d / count));
        }
        lda.setOptimizer(miniBatchFraction).setK(params.k()).setMaxIterations(params.maxIterations()).setDocConcentration(params.docConcentration()).setTopicConcentration(params.topicConcentration()).setCheckpointInterval(params.checkpointInterval());
        if (params.checkpointDir().nonEmpty()) {
            sparkContext.setCheckpointDir((String) params.checkpointDir().get());
        }
        long nanoTime2 = System.nanoTime();
        DistributedLDAModel run = lda.run(rdd);
        Predef$.MODULE$.println(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"Finished training LDA model.  Summary:"})).s(Nil$.MODULE$));
        Predef$.MODULE$.println(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"\\t Training time: ", " sec"})).s(Predef$.MODULE$.genericWrapArray(new Object[]{BoxesRunTime.boxToDouble((System.nanoTime() - nanoTime2) / 1.0E9d)})));
        if (run instanceof DistributedLDAModel) {
            Predef$.MODULE$.println(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"\\t Training data average log likelihood: ", ""})).s(Predef$.MODULE$.genericWrapArray(new Object[]{BoxesRunTime.boxToDouble(run.logLikelihood() / count)})));
            Predef$.MODULE$.println();
        }
        Tuple2[][] tuple2Arr = (Tuple2[][]) Predef$.MODULE$.refArrayOps(run.describeTopics(10)).map(new LDAExample$$anonfun$13(strArr), Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(ScalaRunTime$.MODULE$.arrayClass(Tuple2.class))));
        Predef$.MODULE$.println(new StringContext(Predef$.MODULE$.wrapRefArray(new String[]{"", " topics:"})).s(Predef$.MODULE$.genericWrapArray(new Object[]{BoxesRunTime.boxToInteger(params.k())})));
        Predef$.MODULE$.refArrayOps((Object[]) Predef$.MODULE$.refArrayOps(tuple2Arr).zipWithIndex(Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(Tuple2.class)))).foreach(new LDAExample$$anonfun$org$apache$spark$examples$mllib$LDAExample$$run$1());
        sparkContext.stop();
    }

    private Tuple3<RDD<Tuple2<Object, Vector>>, String[], Object> preprocess(SparkContext sparkContext, Seq<String> seq, int i, String str) {
        RDD map = sparkContext.textFile(seq.mkString(","), sparkContext.textFile$default$2()).zipWithIndex().map(new LDAExample$$anonfun$14(new SimpleTokenizer(sparkContext, str)), ClassTag$.MODULE$.apply(Tuple2.class));
        map.cache();
        RDD reduceByKey = RDD$.MODULE$.rddToPairRDDFunctions(map.flatMap(new LDAExample$$anonfun$15(), ClassTag$.MODULE$.apply(Tuple2.class)), ClassTag$.MODULE$.apply(String.class), ClassTag$.MODULE$.Long(), Ordering$String$.MODULE$).reduceByKey(new LDAExample$$anonfun$1());
        reduceByKey.cache();
        Tuple2[] tuple2Arr = (i == -1 || reduceByKey.count() <= ((long) i)) ? (Tuple2[]) Predef$.MODULE$.refArrayOps((Object[]) reduceByKey.collect()).sortBy(new LDAExample$$anonfun$16(), Ordering$Long$.MODULE$) : (Tuple2[]) reduceByKey.sortBy(new LDAExample$$anonfun$17(), false, reduceByKey.sortBy$default$3(), Ordering$Long$.MODULE$, ClassTag$.MODULE$.Long()).take(i);
        Tuple2 tuple2 = new Tuple2(Predef$.MODULE$.refArrayOps((Object[]) Predef$.MODULE$.refArrayOps((Object[]) Predef$.MODULE$.refArrayOps(tuple2Arr).map(new LDAExample$$anonfun$18(), Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(String.class)))).zipWithIndex(Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.apply(Tuple2.class)))).toMap(Predef$.MODULE$.conforms()), Predef$.MODULE$.longArrayOps((long[]) Predef$.MODULE$.refArrayOps(tuple2Arr).map(new LDAExample$$anonfun$19(), Array$.MODULE$.canBuildFrom(ClassTag$.MODULE$.Long()))).sum(Numeric$LongIsIntegral$.MODULE$));
        if (tuple2 != null) {
            Map map2 = (Map) tuple2._1();
            long _2$mcJ$sp = tuple2._2$mcJ$sp();
            if (map2 != null) {
                Tuple2 tuple22 = new Tuple2(map2, BoxesRunTime.boxToLong(_2$mcJ$sp));
                Map map3 = (Map) tuple22._1();
                long _2$mcJ$sp2 = tuple22._2$mcJ$sp();
                RDD map4 = map.map(new LDAExample$$anonfun$20(map3), ClassTag$.MODULE$.apply(Tuple2.class));
                String[] strArr = new String[map3.size()];
                map3.foreach(new LDAExample$$anonfun$preprocess$1(strArr));
                return new Tuple3<>(map4, strArr, BoxesRunTime.boxToLong(_2$mcJ$sp2));
            }
        }
        throw new MatchError(tuple2);
    }

    private LDAExample$() {
        MODULE$ = this;
    }
}
