package ac.uk.susx.jack.tag.runner;

import ac.uk.susx.jack.tag.cluster.LDAClustering;
import ac.uk.susx.jack.tag.data.TFIDF;
import java.io.File;
import java.io.IOException;
import org.apache.spark.ml.clustering.LDAModel;
import org.apache.spark.ml.feature.CountVectorizer;
import org.apache.spark.sql.DataFrame;

/* loaded from: input_file:ac/uk/susx/jack/tag/runner/LDARunner.class */
/**
 * Static helpers that load a text corpus into a feature {@code DataFrame} and fit an LDA model on it.
 */
public class LDARunner {
    /**
     * Fits an LDA model through {@link LDAClustering} and applies the fitted model to the same data.
     *
     * <p>As a side effect, the model's log-likelihood and log-perplexity on {@code dataFrame} are
     * printed to standard output.
     *
     * @param dataFrame data used for fitting, for the printed metrics, and for the final transform
     * @param i forwarded unchanged as the second argument of {@code LDAClustering.train}
     *          (presumably the number of topics; confirm against LDAClustering)
     * @param i2 forwarded unchanged as the last argument of {@code LDAClustering.train}
     *           (presumably the iteration limit; confirm against LDAClustering)
     * @param d offset; {@code 1.0 + d} is passed as the third argument of {@code LDAClustering.train}
     * @param d2 offset; {@code 1.0 + d2} is passed as the fourth argument of {@code LDAClustering.train}
     * @return a {@code CompositeModel} wrapping the transformed data frame and the fitted model
     */
    public static CompositeModel<LDAModel> train(DataFrame dataFrame, int i, int i2, double d, double d2) {
        LDAClustering clustering = new LDAClustering();
        // Both double parameters are shifted by one before being handed to the trainer.
        double shiftedFirst = 1.0d + d;
        double shiftedSecond = 1.0d + d2;
        LDAModel model = clustering.train(dataFrame, i, shiftedFirst, shiftedSecond, i2);

        System.out.println("log-likelihood: " + model.logLikelihood(dataFrame));
        System.out.println("perplexity: " + model.logPerplexity(dataFrame));

        DataFrame transformed = model.transform(dataFrame);
        return new CompositeModel<>(transformed, model);
    }

    /**
     * Loads data from {@code file} via {@link TFIDF}, tokenises it, and converts the tokens into
     * count vectors stored in a {@code "features"} column.
     *
     * <p>The {@link CountVectorizer} is configured with minimum term frequency and minimum document
     * frequency of zero, reads from the {@code "text"} column, and is fitted on the same tokenised
     * frame it then transforms.
     *
     * @param file location handed to {@code TFIDF.readData} together with the literal {@code "txt"}
     *             (presumably a file-extension filter; confirm against TFIDF)
     * @param i vocabulary size given to the {@code CountVectorizer}
     * @return the vectorised data frame, or {@code null} if an {@link IOException} occurred
     *         (the stack trace is printed in that case)
     */
    public static DataFrame readData(File file, int i) {
        DataFrame vectorised;
        try {
            DataFrame tokenised = TFIDF.tokenise(
                    TFIDF.basicDataFrame(TFIDF.readData(file, "txt"), TFIDF.basicSchema()),
                    "document",
                    "text");
            CountVectorizer vectoriser = new CountVectorizer()
                    .setMinTF(0.0d)
                    .setMinDF(0.0d)
                    .setVocabSize(i)
                    .setInputCol("text")
                    .setOutputCol("features");
            vectorised = vectoriser.fit(tokenised).transform(tokenised);
        } catch (IOException e) {
            // Best-effort contract: report the failure and signal it to the caller with null.
            e.printStackTrace();
            return null;
        }
        return vectorised;
    }
}
