package ac.uk.susx.jack.tag.data;

import ac.uk.susx.jack.tag.context.Context;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.commons.io.FileUtils;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

/* loaded from: input_file:ac/uk/susx/jack/tag/data/AbstractDataReader.class */
/**
 * Static helpers for loading text files into Spark rows and turning them into
 * tokenised {@link DataFrame}s.
 *
 * <p>The class only exposes static methods; it is declared abstract, so it cannot
 * be instantiated directly.
 */
public abstract class AbstractDataReader {

    /**
     * Reads every matching file found directly inside a directory and distributes the
     * contents as an RDD of two-column rows: {@code (file name, file content)}.
     *
     * <p>All files are read fully into an in-memory list before the list is handed to
     * {@code Context.context().parallelize(...)}, so the whole corpus must fit in the
     * memory of the calling JVM.
     *
     * @param file   directory to scan; sub-directories are not descended into
     * @param strArr file extensions passed through to
     *               {@link FileUtils#iterateFiles(File, String[], boolean)} to select files
     * @return an RDD holding one row per file read
     * @throws IOException if any of the files cannot be read
     */
    public static JavaRDD<Row> readData(File file, String... strArr) throws IOException {
        ArrayList<Row> rows = new ArrayList<Row>();
        // "false" keeps the scan non-recursive: only direct children of the directory are visited.
        Iterator<File> files = FileUtils.iterateFiles(file, strArr, false);
        while (files.hasNext()) {
            File current = files.next();
            // Files are always decoded as UTF-8, independent of the platform default charset.
            String content = FileUtils.readFileToString(current, "UTF-8");
            rows.add(RowFactory.create(current.getName(), content));
        }
        return Context.context().parallelize(rows);
    }

    /**
     * Builds the schema matching the rows produced by {@link #readData(File, String...)}:
     * two non-nullable string columns named {@code "name"} and {@code "document"}.
     *
     * @return the two-column schema
     */
    public static StructType basicSchema() {
        StructField nameField =
                new StructField("name", DataTypes.StringType, false, Metadata.empty());
        StructField documentField =
                new StructField("document", DataTypes.StringType, false, Metadata.empty());
        return new StructType(new StructField[] {nameField, documentField});
    }

    /**
     * Creates a {@link DataFrame} from an RDD of rows using the given schema, via
     * {@code Context.sqlContext()}.
     *
     * @param javaRDD    the rows to wrap
     * @param structType the schema describing the columns of {@code javaRDD}
     * @return the resulting data frame
     */
    public static DataFrame basicDataFrame(JavaRDD<Row> javaRDD, StructType structType) {
        return Context.sqlContext().createDataFrame(javaRDD, structType);
    }

    /**
     * Applies a Spark ML {@link Tokenizer} to one column of a data frame.
     *
     * @param dataFrame the data frame to transform
     * @param str       name of the input column to tokenise
     * @param str2      name of the output column that receives the tokens
     * @return the data frame returned by the tokenizer's {@code transform} call
     */
    public static DataFrame tokenise(DataFrame dataFrame, String str, String str2) {
        Tokenizer tokenizer = new Tokenizer().setInputCol(str).setOutputCol(str2);
        return tokenizer.transform(dataFrame);
    }
}
