package net.sansa_stack.ml.spark.featureExtraction;

import net.sansa_stack.rdf.common.io.riot.error.ErrorParseMode$;
import net.sansa_stack.rdf.common.io.riot.error.WarningParseMode$;
import net.sansa_stack.rdf.spark.io.NTripleReader$;
import net.sansa_stack.rdf.spark.model.package$;
import org.apache.jena.sys.JenaSystem;
import org.apache.spark.ml.feature.StopWordsRemover;
import org.apache.spark.ml.feature.StringIndexer;
import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.ml.feature.VectorAssembler;
import org.apache.spark.ml.feature.Word2Vec;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.SparkSession$;
import org.apache.spark.sql.functions$;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DoubleType$;
import scala.Predef$;
import scala.Predef$ArrowAssoc$;
import scala.Tuple2;
import scala.collection.ArrayOps$;
import scala.collection.SetOps;
import scala.collection.StringOps$;
import scala.collection.immutable.List;
import scala.collection.immutable.Nil$;
import scala.collection.immutable.Seq;
import scala.collection.mutable.Map;
import scala.collection.mutable.Map$;
import scala.reflect.ClassTag$;
import scala.runtime.BoxedUnit;
import scala.runtime.BoxesRunTime;
import scala.runtime.ObjectRef;
import scala.runtime.ScalaRunTime$;

/* compiled from: FeatureTypeIdentifier.scala */
/* loaded from: input_file:net/sansa_stack/ml/spark/featureExtraction/FeatureTypeIdentifier$.class */
public final class FeatureTypeIdentifier$ {
    /**
     * The single instance of this class, following the Scala-object {@code MODULE$} convention.
     * The constructor of this class is private, so no other instance is created outside it.
     */
    public static final FeatureTypeIdentifier$ MODULE$ = new FeatureTypeIdentifier$();

    public void main(String[] strArr) {
        System.nanoTime();
        Predef$.MODULE$.println("\nSETUP SPARK SESSION");
        SparkSession orCreate = SparkSession$.MODULE$.builder().appName("SampleFeatureExtractionPipeline").config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").config("spark.kryo.registrator", String.join(", ", "net.sansa_stack.rdf.spark.io.JenaKryoRegistrator", "net.sansa_stack.query.spark.sparqlify.KryoRegistratorSparqlify")).getOrCreate();
        orCreate.sparkContext().setLogLevel("ERROR");
        JenaSystem.init();
        Predef$.MODULE$.println("\nREAD IN DATA");
        Dataset<?> persist = package$.MODULE$.TripleOperations(NTripleReader$.MODULE$.load(orCreate, strArr[0], ErrorParseMode$.MODULE$.SKIP(), WarningParseMode$.MODULE$.IGNORE(), NTripleReader$.MODULE$.load$default$5(), NTripleReader$.MODULE$.load$default$6())).toDS().persist();
        Predef$.MODULE$.println("\nCREATE FEATURE EXTRACTING SPARQL");
        String stripMargin$extension = StringOps$.MODULE$.stripMargin$extension(Predef$.MODULE$.augmentString("\n        | SELECT\n        | ?movie\n        | ?movie__down_date\n        | ?movie__down_title\n        | ?movie__down_runtime\n        | ?movie__down_actor__down_actor_name\n        | ?movie__down_genre__down_film_genre_name\n        | ?movie__down_country__down_country_name\n        | ?movie__down_country__down_country_languages\n        | ?movie__down_country__down_country_areaInSqKm\n        |\n        |WHERE {\n        |\t# this fixes the entities, in this sample to be a movie\n        | ?movie <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://data.linkedmdb.org/movie/film> .\n        |\n        | # this is a optional block to gain only a smaller subset of entities, like Superhero-movies\n        | # ?movie <http://data.linkedmdb.org/movie/genre> ?movie__down_genre .\n        | # ?movie__down_genre <http://data.linkedmdb.org/movie/film_genre_name> \"Superhero\"\n        |\n        | # From here on we collect our Features\n        |\tOPTIONAL {\n        |\t\t?movie <http://purl.org/dc/terms/date> ?movie__down_date .\n        |\t}\n        |\n        |\tOPTIONAL {\n        |\t\t?movie <http://purl.org/dc/terms/title> ?movie__down_title .\n        |\t}\n        |\n        |\tOPTIONAL {\n        |\t\t?movie <http://data.linkedmdb.org/movie/runtime> ?movie__down_runtime .\n        |\t}\n        |\n        | OPTIONAL {\n        |\t\t?movie <http://data.linkedmdb.org/movie/actor> ?movie__down_actor .\n        |\t\t?movie__down_actor <http://data.linkedmdb.org/movie/actor_name> ?movie__down_actor__down_actor_name .\n        | }\n        |\n        | OPTIONAL {\n        |\t\t?movie <http://data.linkedmdb.org/movie/genre> ?movie__down_genre .\n        |\t\t?movie__down_genre <http://data.linkedmdb.org/movie/film_genre_name> ?movie__down_genre__down_film_genre_name .\n        |\t}\n        |\n        | OPTIONAL {\n        |\t\t?movie 
<http://data.linkedmdb.org/movie/country> ?movie__down_country .\n        |\t\t?movie__down_country <http://data.linkedmdb.org/movie/country_name> ?movie__down_country__down_country_name .\n        |\t}\n        |\n        | OPTIONAL {\n        |\t\t?movie <http://data.linkedmdb.org/movie/country> ?movie__down_country .\n        |\t\t?movie__down_country <http://data.linkedmdb.org/movie/country_languages> ?movie__down_country__down_country_languages .\n        |\t}\n        |\n        | OPTIONAL {\n        |\t\t?movie <http://data.linkedmdb.org/movie/country> ?movie__down_country .\n        |\t\t?movie__down_country <http://data.linkedmdb.org/movie/country_areaInSqKm> ?movie__down_country__down_country_areaInSqKm .\n        |\t}\n        |}\n    "));
        String str = strArr[1];
        String str2 = (str != null ? !str.equals("") : "" != 0) ? strArr[1] : stripMargin$extension;
        Predef$.MODULE$.println(str2);
        Predef$.MODULE$.println("\nFEATURE EXTRACTION OVER SPARQL");
        Dataset persist2 = new SparqlFrame().setSparqlQuery(str2).transform(persist).persist();
        persist2.show(false);
        Predef$.MODULE$.println("\nCOLLAPS COLUMNS & IDENTIFY FEATURE CHARACTERISTICS");
        String str3 = "movie";
        Seq seq = ((List) scala.package$.MODULE$.List().apply(Predef$.MODULE$.copyArrayToImmutableIndexedSeq(persist2.columns()))).filter(str4 -> {
            return BoxesRunTime.boxToBoolean($anonfun$main$1(str3, str4));
        }).toSeq();
        ObjectRef create = ObjectRef.create(persist2.select("movie", Nil$.MODULE$).dropDuplicates().persist());
        persist2.unpersist();
        long count = ((Dataset) create.elem).count();
        Predef$.MODULE$.println(new StringBuilder(24).append("Number distinct ids is: ").append(count).toString());
        ObjectRef create2 = ObjectRef.create((Map) Map$.MODULE$.apply(Nil$.MODULE$));
        seq.foreach(str5 -> {
            $anonfun$main$2(persist2, str3, count, create2, create, str5);
            return BoxedUnit.UNIT;
        });
        Predef$.MODULE$.println("\nCOLLAPSED DATAFRAME");
        ((Dataset) create.elem).show(false);
        Predef$.MODULE$.println("\nFEATURE CHARACTERISTICS");
        ((Map) create2.elem).foreach(tuple2 -> {
            $anonfun$main$3(tuple2);
            return BoxedUnit.UNIT;
        });
        Predef$.MODULE$.println("\nDIGITIZE FEATURES");
        Seq seq2 = ((List) scala.package$.MODULE$.List().apply(Predef$.MODULE$.copyArrayToImmutableIndexedSeq(((Dataset) create.elem).columns()))).filter(str6 -> {
            return BoxesRunTime.boxToBoolean($anonfun$main$4(str3, str6));
        }).toSeq();
        ObjectRef create3 = ObjectRef.create(((Dataset) create.elem).select("movie", Nil$.MODULE$).persist());
        ((Dataset) create.elem).unpersist();
        seq2.foreach(str7 -> {
            $anonfun$main$5(create, str3, create3, str7);
            return BoxedUnit.UNIT;
        });
        String[] columns = ((Dataset) create3.elem).columns();
        String[] strArr2 = (String[]) ArrayOps$.MODULE$.filter$extension(Predef$.MODULE$.refArrayOps(columns), str8 -> {
            return BoxesRunTime.boxToBoolean(str8.contains("(notDigitizedYet)"));
        });
        String[] strArr3 = (String[]) ArrayOps$.MODULE$.diff$extension(Predef$.MODULE$.refArrayOps(columns), Predef$.MODULE$.wrapRefArray(strArr2));
        if (ArrayOps$.MODULE$.size$extension(Predef$.MODULE$.refArrayOps(strArr2)) > 0) {
            Predef$.MODULE$.println(new StringBuilder(41).append("we drop following non digitized columns:\n").append(Predef$.MODULE$.wrapRefArray(strArr2).mkString("\n")).toString());
        }
        Dataset select = ((Dataset) create3.elem).select(Predef$.MODULE$.copyArrayToImmutableIndexedSeq(ArrayOps$.MODULE$.map$extension(Predef$.MODULE$.refArrayOps(strArr3), str9 -> {
            return functions$.MODULE$.col(str9);
        }, ClassTag$.MODULE$.apply(Column.class))));
        ((Dataset) create3.elem).unpersist();
        select.show();
        Predef$.MODULE$.println("FIX FEATURE LENGTH");
        String[] strArr4 = (String[]) ArrayOps$.MODULE$.filter$extension(Predef$.MODULE$.refArrayOps(select.columns()), str10 -> {
            return BoxesRunTime.boxToBoolean(str10.contains("ListOf"));
        });
        ObjectRef create4 = ObjectRef.create(select.select(Predef$.MODULE$.copyArrayToImmutableIndexedSeq(ArrayOps$.MODULE$.map$extension(Predef$.MODULE$.refArrayOps((Object[]) ArrayOps$.MODULE$.diff$extension(Predef$.MODULE$.refArrayOps(select.columns()), Predef$.MODULE$.wrapRefArray(strArr4))), str11 -> {
            return functions$.MODULE$.col(str11);
        }, ClassTag$.MODULE$.apply(Column.class)))).persist());
        ArrayOps$.MODULE$.foreach$extension(Predef$.MODULE$.refArrayOps(strArr4), str12 -> {
            $anonfun$main$10(select, str3, create4, str12);
            return BoxedUnit.UNIT;
        });
        ((Dataset) create4.elem).show(false);
        Predef$.MODULE$.println("ASSEMBLE VECTOR");
        String[] strArr5 = (String[]) ArrayOps$.MODULE$.filterNot$extension(Predef$.MODULE$.refArrayOps(((Dataset) create4.elem).columns()), str13 -> {
            return BoxesRunTime.boxToBoolean($anonfun$main$11(str3, str13));
        });
        Predef$.MODULE$.println(new StringBuilder(21).append("columns to assemble:\n").append(Predef$.MODULE$.wrapRefArray(strArr5).mkString(", ")).toString());
        Dataset persist3 = new VectorAssembler().setInputCols(strArr5).setOutputCol("features").transform((Dataset) create4.elem).persist();
        ((Dataset) create4.elem).unpersist();
        persist3.select("movie", ScalaRunTime$.MODULE$.wrapRefArray(new String[]{"features"})).show(false);
        Predef$.MODULE$.println(new StringBuilder(40).append("assembled vector has number of samples: ").append(persist3.count()).toString());
    }

    /**
     * Tells whether a column is a feature column, i.e. is not the key column.
     *
     * <p>Uses a plain null-safe string comparison instead of allocating a one-element Scala
     * set per call; the result is the same for every input, including {@code null}.
     *
     * @param str  name of the key column
     * @param str2 name of the column being tested
     * @return {@code true} when {@code str2} differs from {@code str}
     */
    public static final /* synthetic */ boolean $anonfun$main$1(String str, String str2) {
        return str2 == null ? str != null : !str2.equals(str);
    }

    /**
     * Analyses one feature column, records its characteristics and joins it, renamed to
     * {@code <column>(<featureType>)}, onto the collapsed dataset.
     *
     * <p>The feature type is built as {@code ListOf_|Single_} + {@code Categorical_|NonCategorical_}
     * + the Spark data type name with its {@code Type} suffix removed. A feature counts as a
     * list when some key has more than one distinct value, and as categorical when the number
     * of distinct values is below 10% of {@code j}.
     *
     * @param dataset    SPARQL result holding the key column and all feature columns
     * @param str        name of the key column
     * @param j          number of distinct keys, used as denominator for the ratios
     * @param objectRef  holder of the mutable map receiving the description of this feature
     * @param objectRef2 holder of the collapsed dataset; replaced by the joined result
     * @param str2       name of the feature column to analyse
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    public static final /* synthetic */ void $anonfun$main$2(Dataset dataset, String str, long j, ObjectRef objectRef, ObjectRef objectRef2, String str2) {
        Predef$.MODULE$.println(str2);
        Dataset distinctPairs = dataset.select(str, str2).dropDuplicates();
        // One row per key: the feature values gathered into a list, plus the list length.
        Dataset valuesPerKey = distinctPairs.groupBy(str)
                .agg(functions$.MODULE$.collect_list(str2).as(str2))
                .withColumn("size", functions$.MODULE$.size(functions$.MODULE$.col(str2)));

        // Smallest and largest list length are fetched by a single aggregation job.
        Row sizeRange = (Row) valuesPerKey.agg(functions$.MODULE$.min("size"), functions$.MODULE$.max("size")).head();
        int minSize = sizeRange.getInt(0);
        int maxSize = sizeRange.getInt(1);

        boolean nullable = minSize == 0;
        boolean isListOfEntries = maxSize > 1;
        DataType dataType = distinctPairs.select(str2).schema().apply(0).dataType();
        int numberDistinctValues = (int) distinctPairs.select(str2).distinct().count();
        long keysWithValue = valuesPerKey.filter(functions$.MODULE$.col("size").gt(0)).count();
        // Divide as doubles: dividing the two longs would truncate the ratio to 0 or 1.
        double availability = (double) keysWithValue / (double) j;
        boolean isCategorical = ((double) numberDistinctValues) / ((double) j) < 0.1d;

        String featureType = (isListOfEntries ? "ListOf_" : "Single_")
                + (isCategorical ? "Categorical_" : "NonCategorical_")
                + dataType.toString().split("Type")[0];

        // NOTE(review): the key "avalability" is misspelled but kept as is, because it is
        // part of the printed output.
        Tuple2[] description = new Tuple2[]{
                new Tuple2<>("featureType", featureType),
                new Tuple2<>("name", str2),
                new Tuple2<>("nullable", nullable),
                new Tuple2<>("datatype", dataType),
                new Tuple2<>("numberDistinctValues", numberDistinctValues),
                new Tuple2<>("isListOfEntries", isListOfEntries),
                new Tuple2<>("avalability", availability)
        };
        ((Map) objectRef.elem).update(str2, (scala.collection.immutable.Map) Predef$.MODULE$.Map().apply(ScalaRunTime$.MODULE$.wrapRefArray(description)));

        // List features are joined in collapsed form, single-valued features as they are.
        Dataset featureColumn = (isListOfEntries ? valuesPerKey : distinctPairs)
                .select(str, str2)
                .withColumnRenamed(str2, str2 + "(" + featureType + ")");
        objectRef2.elem = ((Dataset) objectRef2.elem).join(featureColumn, str);
    }

    /**
     * Prints the given tuple on its own line through Scala's {@code Predef.println}.
     *
     * @param tuple2 the entry to print
     */
    public static final /* synthetic */ void $anonfun$main$3(Tuple2 tuple2) {
        final Object entry = tuple2;
        Predef$.MODULE$.println(entry);
    }

    /**
     * Tells whether a column of the collapsed dataset is a feature column, i.e. is not the
     * key column.
     *
     * <p>Uses a plain null-safe string comparison instead of allocating a one-element Scala
     * set per call; the result is the same for every input, including {@code null}.
     *
     * @param str  name of the key column
     * @param str2 name of the column being tested
     * @return {@code true} when {@code str2} differs from {@code str}
     */
    public static final /* synthetic */ boolean $anonfun$main$4(String str, String str2) {
        return str2 == null ? str != null : !str2.equals(str);
    }

    /**
     * Turns one typed feature column into a numeric representation and joins it onto the
     * digitized dataset.
     *
     * <p>The column name must have the form {@code <name>(<featureType>)}. Depending on the
     * feature type the column is embedded with Word2Vec (non-categorical strings), indexed
     * with a {@code StringIndexer} (categorical strings), or converted to double (types ending
     * in {@code Double}, {@code Integer} or {@code Boolean}). Any other type is passed through
     * unchanged and its output column is tagged {@code (notDigitizedYet)}.
     *
     * @param objectRef  holder of the collapsed dataset that contains the typed column
     * @param str        name of the key column
     * @param objectRef2 holder of the digitized dataset; replaced by the joined result
     * @param str2       name of the typed feature column, {@code <name>(<featureType>)}
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    public static final /* synthetic */ void $anonfun$main$5(ObjectRef objectRef, String str, ObjectRef objectRef2, String str2) {
        String[] nameAndType = str2.split("\\(");
        String featureType = nameAndType[1].split("\\)")[0];
        String featureName = nameAndType[0];
        Predef$.MODULE$.println(featureName);
        Predef$.MODULE$.println(featureType);

        Dataset keyAndFeature = ((Dataset) objectRef.elem).select(str, str2);
        Dataset digitized;
        String outputName;
        if ("Single_NonCategorical_String".equals(featureType)) {
            digitized = embedWithWord2Vec(keyAndFeature.na().fill(""), str2);
            outputName = featureName + "(Word2Vec)";
        } else if ("ListOf_NonCategorical_String".equals(featureType)) {
            // Join the list entries into one text so that it can be tokenized like a sentence.
            Dataset withSentences = keyAndFeature.withColumn("sentences", functions$.MODULE$.concat_ws(". ", ScalaRunTime$.MODULE$.wrapRefArray(new Column[]{functions$.MODULE$.col(str2)}))).na().fill("");
            digitized = embedWithWord2Vec(withSentences, "sentences");
            outputName = featureName + "(Word2Vec)";
        } else if ("Single_Categorical_String".equals(featureType)) {
            Dataset filled = keyAndFeature.na().fill("");
            digitized = new StringIndexer().setInputCol(str2).setOutputCol("output").fit(filled).transform(filled);
            outputName = featureName + "(IndexedString)";
        } else if ("ListOf_Categorical_String".equals(featureType)) {
            // Index each list entry on its own row, then gather the indices per key again.
            // explode_outer names its result column "col".
            Dataset exploded = keyAndFeature.select(functions$.MODULE$.col(str), functions$.MODULE$.explode_outer(functions$.MODULE$.col(str2))).na().fill("");
            digitized = new StringIndexer().setInputCol("col").setOutputCol("outputTmp").fit(exploded).transform(exploded)
                    .groupBy(str)
                    .agg(functions$.MODULE$.collect_set("outputTmp").as("output"))
                    .select(str, "output");
            outputName = featureName + "(ListOfIndexedString)";
        } else if (featureType.endsWith("Double")) {
            digitized = keyAndFeature.withColumnRenamed(str2, "output").na().fill(-1.0d);
            outputName = featureName + "(" + featureType + ")";
        } else if (featureType.endsWith("Integer") || featureType.endsWith("Boolean")) {
            // A list feature is an array column, which cannot be cast to a plain double;
            // cast its elements instead.
            Column source = functions$.MODULE$.col(str2);
            Column asDouble = featureType.startsWith("ListOf_") ? source.cast("array<double>") : source.cast(DoubleType$.MODULE$);
            digitized = keyAndFeature.withColumn("output", asDouble).na().fill(-1.0d);
            outputName = featureName + "(" + featureType + ")";
        } else {
            Predef$.MODULE$.println("transformation not possible yet");
            digitized = keyAndFeature.withColumnRenamed(str2, "output");
            outputName = featureName + "(notDigitizedYet)";
        }

        Dataset renamed = digitized.withColumnRenamed("output", outputName).select(str, outputName);
        objectRef2.elem = ((Dataset) objectRef2.elem).join(renamed, str);
    }

    /** Tokenizes {@code textColumn}, removes stop words and adds a 2-dimensional Word2Vec embedding as column {@code output}. */
    @SuppressWarnings("rawtypes")
    private static Dataset embedWithWord2Vec(Dataset input, String textColumn) {
        Dataset tokenized = new Tokenizer().setInputCol(textColumn).setOutputCol("words").transform(input);
        Dataset filtered = new StopWordsRemover().setInputCol("words").setOutputCol("filtered").transform(tokenized);
        return new Word2Vec().setInputCol("filtered").setOutputCol("output").setVectorSize(2).fit(filtered).transform(filtered);
    }

    /**
     * Replaces one list-valued column by four per-key statistics and joins them onto the
     * fixed-length dataset.
     *
     * <p>The list is exploded, then mean, min, max and stddev are computed per key and named
     * {@code <name>_mean}, {@code <name>_min}, {@code <name>_max} and {@code <name>_stddev},
     * where {@code <name>} is the column name up to its first {@code (}. Missing numeric
     * values in the result are filled with {@code -1}.
     *
     * @param dataset   dataset holding the key column and the list column
     * @param str       name of the key column
     * @param objectRef holder of the fixed-length dataset; replaced by the joined result
     * @param str2      name of the list column
     */
    public static final /* synthetic */ void $anonfun$main$10(Dataset dataset, String str, ObjectRef objectRef, String str2) {
        Predef$.MODULE$.println("Fix number of features in column: " + str2);
        String baseName = str2.split("\\(")[0];

        // explode_outer names its result column "col".
        Dataset exploded = dataset.select(str, str2)
                .select(functions$.MODULE$.col(str), functions$.MODULE$.explode_outer(functions$.MODULE$.col(str2)));
        Dataset statistics = exploded.groupBy(str)
                .agg(
                        functions$.MODULE$.mean("col").alias(baseName + "_mean"),
                        functions$.MODULE$.min("col").alias(baseName + "_min"),
                        functions$.MODULE$.max("col").alias(baseName + "_max"),
                        functions$.MODULE$.stddev("col").alias(baseName + "_stddev"))
                .na()
                .fill(-1L);

        objectRef.elem = ((Dataset) objectRef.elem).join(statistics, str);
    }

    /**
     * Null-safe equality test between a column name and the key column name.
     *
     * @param str  name of the key column
     * @param str2 name of the column being tested
     * @return {@code true} when both are {@code null} or {@code str2} equals {@code str}
     */
    public static final /* synthetic */ boolean $anonfun$main$11(String str, String str2) {
        if (str2 == null) {
            return str == null;
        }
        return str2.equals(str);
    }

    /** Not instantiable from outside this class; the only instance is {@link #MODULE$}. */
    private FeatureTypeIdentifier$() {}
}
