@@ -121,10 +121,9 @@ import org.apache.spark.ml.classification.RandomForestClassifier
 import org.apache.spark.ml.classification.RandomForestClassificationModel
 import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer}
 import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
-import org.apache.spark.mllib.util.MLUtils
 
 // Load and parse the data file, converting it to a DataFrame.
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
 
 // Index labels, adding metadata to the label column.
 // Fit on whole dataset to include all labels in index.
@@ -193,14 +192,11 @@ import org.apache.spark.ml.classification.RandomForestClassifier;
 import org.apache.spark.ml.classification.RandomForestClassificationModel;
 import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
 import org.apache.spark.ml.feature.*;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.DataFrame;
 
 // Load and parse the data file, converting it to a DataFrame.
-RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt");
-DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class);
+DataFrame data = jsql.read().format("libsvm")
+  .load("data/mllib/sample_libsvm_data.txt");
 
 // Index labels, adding metadata to the label column.
 // Fit on whole dataset to include all labels in index.
@@ -268,10 +264,9 @@ from pyspark.ml import Pipeline
 from pyspark.ml.classification import RandomForestClassifier
 from pyspark.ml.feature import StringIndexer, VectorIndexer
 from pyspark.ml.evaluation import MulticlassClassificationEvaluator
-from pyspark.mllib.util import MLUtils
 
 # Load and parse the data file, converting it to a DataFrame.
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
 
 # Index labels, adding metadata to the label column.
 # Fit on whole dataset to include all labels in index.
@@ -327,10 +322,9 @@ import org.apache.spark.ml.regression.RandomForestRegressor
 import org.apache.spark.ml.regression.RandomForestRegressionModel
 import org.apache.spark.ml.feature.VectorIndexer
 import org.apache.spark.ml.evaluation.RegressionEvaluator
-import org.apache.spark.mllib.util.MLUtils
 
 // Load and parse the data file, converting it to a DataFrame.
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
 
 // Automatically identify categorical features, and index them.
 // Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -387,14 +381,11 @@ import org.apache.spark.ml.feature.VectorIndexer;
 import org.apache.spark.ml.feature.VectorIndexerModel;
 import org.apache.spark.ml.regression.RandomForestRegressionModel;
 import org.apache.spark.ml.regression.RandomForestRegressor;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.DataFrame;
 
 // Load and parse the data file, converting it to a DataFrame.
-RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt");
-DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class);
+DataFrame data = jsql.read().format("libsvm")
  .load("data/mllib/sample_libsvm_data.txt");
 
 // Automatically identify categorical features, and index them.
 // Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -450,10 +441,9 @@ from pyspark.ml import Pipeline
 from pyspark.ml.regression import RandomForestRegressor
 from pyspark.ml.feature import VectorIndexer
 from pyspark.ml.evaluation import RegressionEvaluator
-from pyspark.mllib.util import MLUtils
 
 # Load and parse the data file, converting it to a DataFrame.
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
 
 # Automatically identify categorical features, and index them.
 # Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -576,10 +566,9 @@ import org.apache.spark.ml.classification.GBTClassifier
 import org.apache.spark.ml.classification.GBTClassificationModel
 import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer}
 import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
-import org.apache.spark.mllib.util.MLUtils
 
 // Load and parse the data file, converting it to a DataFrame.
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
 
 // Index labels, adding metadata to the label column.
 // Fit on whole dataset to include all labels in index.
@@ -648,14 +637,10 @@ import org.apache.spark.ml.classification.GBTClassifier;
 import org.apache.spark.ml.classification.GBTClassificationModel;
 import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator;
 import org.apache.spark.ml.feature.*;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.DataFrame;
 
 // Load and parse the data file, converting it to a DataFrame.
-RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt");
-DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class);
+DataFrame data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
 
 // Index labels, adding metadata to the label column.
 // Fit on whole dataset to include all labels in index.
@@ -724,10 +709,9 @@ from pyspark.ml import Pipeline
 from pyspark.ml.classification import GBTClassifier
 from pyspark.ml.feature import StringIndexer, VectorIndexer
 from pyspark.ml.evaluation import MulticlassClassificationEvaluator
-from pyspark.mllib.util import MLUtils
 
 # Load and parse the data file, converting it to a DataFrame.
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
 
 # Index labels, adding metadata to the label column.
 # Fit on whole dataset to include all labels in index.
@@ -783,10 +767,9 @@ import org.apache.spark.ml.regression.GBTRegressor
 import org.apache.spark.ml.regression.GBTRegressionModel
 import org.apache.spark.ml.feature.VectorIndexer
 import org.apache.spark.ml.evaluation.RegressionEvaluator
-import org.apache.spark.mllib.util.MLUtils
 
 // Load and parse the data file, converting it to a DataFrame.
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
 
 // Automatically identify categorical features, and index them.
 // Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -844,14 +827,10 @@ import org.apache.spark.ml.feature.VectorIndexer;
 import org.apache.spark.ml.feature.VectorIndexerModel;
 import org.apache.spark.ml.regression.GBTRegressionModel;
 import org.apache.spark.ml.regression.GBTRegressor;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.DataFrame;
 
 // Load and parse the data file, converting it to a DataFrame.
-RDD<LabeledPoint> rdd = MLUtils.loadLibSVMFile(sc.sc(), "data/mllib/sample_libsvm_data.txt");
-DataFrame data = jsql.createDataFrame(rdd, LabeledPoint.class);
+DataFrame data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
 
 // Automatically identify categorical features, and index them.
 // Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -908,10 +887,9 @@ from pyspark.ml import Pipeline
 from pyspark.ml.regression import GBTRegressor
 from pyspark.ml.feature import VectorIndexer
 from pyspark.ml.evaluation import RegressionEvaluator
-from pyspark.mllib.util import MLUtils
 
 # Load and parse the data file, converting it to a DataFrame.
-data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
+data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
 
 # Automatically identify categorical features, and index them.
 # Set maxCategories so features with > 4 distinct values are treated as continuous.
@@ -970,15 +948,14 @@ Refer to the [Scala API docs](api/scala/index.html#org.apache.spark.ml.classifie
 {% highlight scala %}
 import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest}
 import org.apache.spark.mllib.evaluation.MulticlassMetrics
-import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.sql.{Row, SQLContext}
 
 val sqlContext = new SQLContext(sc)
 
 // parse data into dataframe
-val data = MLUtils.loadLibSVMFile(sc,
-  "data/mllib/sample_multiclass_classification_data.txt")
-val Array(train, test) = data.toDF().randomSplit(Array(0.7, 0.3))
+val data = sqlContext.read.format("libsvm")
+  .load("data/mllib/sample_multiclass_classification_data.txt")
+val Array(train, test) = data.randomSplit(Array(0.7, 0.3))
 
 // instantiate multiclass learner and train
 val ovr = new OneVsRest().setClassifier(new LogisticRegression)
@@ -1016,20 +993,16 @@ import org.apache.spark.ml.classification.OneVsRest;
 import org.apache.spark.ml.classification.OneVsRestModel;
 import org.apache.spark.mllib.evaluation.MulticlassMetrics;
 import org.apache.spark.mllib.linalg.Matrix;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.DataFrame;
 import org.apache.spark.sql.SQLContext;
 
 SparkConf conf = new SparkConf().setAppName("JavaOneVsRestExample");
 JavaSparkContext jsc = new JavaSparkContext(conf);
 SQLContext jsql = new SQLContext(jsc);
 
-RDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(),
-  "data/mllib/sample_multiclass_classification_data.txt");
+DataFrame dataFrame = jsql.read().format("libsvm")
+  .load("data/mllib/sample_multiclass_classification_data.txt");
 
-DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
 DataFrame[] splits = dataFrame.randomSplit(new double[] {0.7, 0.3}, 12345);
 DataFrame train = splits[0];
 DataFrame test = splits[1];
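
Taken together, these hunks apply a single pattern: the RDD-based `MLUtils.loadLibSVMFile(...)` load, plus the `toDF()` / `createDataFrame` conversion, is replaced by the `libsvm` DataFrame data source introduced in Spark 1.6. Below is a minimal self-contained sketch of the new loading path, assuming a Spark 1.6-era `spark-shell` (where `sc` is predefined) and the sample data file shipped with the Spark distribution:

{% highlight scala %}
import org.apache.spark.sql.SQLContext

val sqlContext = new SQLContext(sc)

// The "libsvm" data source parses lines of the form "label index:value ..."
// and returns a DataFrame with a "label" (Double) column and a "features"
// (Vector) column, so no RDD[LabeledPoint] round-trip is needed.
val data = sqlContext.read.format("libsvm")
  .load("data/mllib/sample_libsvm_data.txt")

// The DataFrame plugs directly into ML Pipelines, e.g. a train/test split:
val Array(train, test) = data.randomSplit(Array(0.7, 0.3))
{% endhighlight %}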