diff --git a/docs/ml-features.md b/docs/ml-features.md
index 142afac2f3f9..70cbfe74e7f7 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -37,7 +37,7 @@ In the following code segment, we start with a set of sentences. We split each
Refer to the [HashingTF Scala docs](api/scala/index.html#org.apache.spark.ml.feature.HashingTF) and
the [IDF Scala docs](api/scala/index.html#org.apache.spark.ml.feature.IDF) for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/TfIdfExample.scala %}
+{% include_example scala/org/apache/spark/examples/ml/HashingTF.scala %}
@@ -45,7 +45,7 @@ the [IDF Scala docs](api/scala/index.html#org.apache.spark.ml.feature.IDF) for m
Refer to the [HashingTF Java docs](api/java/org/apache/spark/ml/feature/HashingTF.html) and the
[IDF Java docs](api/java/org/apache/spark/ml/feature/IDF.html) for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaTfIdfExample.java %}
+{% include_example java/org/apache/spark/examples/ml/JavaHashingTF.java %}
@@ -53,7 +53,24 @@ Refer to the [HashingTF Java docs](api/java/org/apache/spark/ml/feature/HashingT
Refer to the [HashingTF Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.HashingTF) and
the [IDF Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.IDF) for more details on the API.
-{% include_example python/ml/tf_idf_example.py %}
+{% highlight python %}
+from pyspark.ml.feature import HashingTF, IDF, Tokenizer
+
+sentenceData = sqlContext.createDataFrame([
+ (0, "Hi I heard about Spark"),
+ (0, "I wish Java could use case classes"),
+ (1, "Logistic regression models are neat")
+], ["label", "sentence"])
+tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
+wordsData = tokenizer.transform(sentenceData)
+hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
+featurizedData = hashingTF.transform(wordsData)
+idf = IDF(inputCol="rawFeatures", outputCol="features")
+idfModel = idf.fit(featurizedData)
+rescaledData = idfModel.transform(featurizedData)
+for features_label in rescaledData.select("features", "label").take(3):
+ print(features_label)
+{% endhighlight %}
@@ -74,7 +91,26 @@ In the following code segment, we start with a set of documents, each of which i
Refer to the [Word2Vec Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Word2Vec)
for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/Word2VecExample.scala %}
+{% highlight scala %}
+import org.apache.spark.ml.feature.Word2Vec
+
+// Input data: Each row is a bag of words from a sentence or document.
+val documentDF = sqlContext.createDataFrame(Seq(
+ "Hi I heard about Spark".split(" "),
+ "I wish Java could use case classes".split(" "),
+ "Logistic regression models are neat".split(" ")
+).map(Tuple1.apply)).toDF("text")
+
+// Learn a mapping from words to Vectors.
+val word2Vec = new Word2Vec()
+ .setInputCol("text")
+ .setOutputCol("result")
+ .setVectorSize(3)
+ .setMinCount(0)
+val model = word2Vec.fit(documentDF)
+val result = model.transform(documentDF)
+result.select("result").take(3).foreach(println)
+{% endhighlight %}
@@ -82,7 +118,43 @@ for more details on the API.
Refer to the [Word2Vec Java docs](api/java/org/apache/spark/ml/feature/Word2Vec.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaWord2VecExample.java %}
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.Word2Vec;
+import org.apache.spark.ml.feature.Word2VecModel;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.*;
+
+JavaSparkContext jsc = ...
+SQLContext sqlContext = ...
+
+// Input data: Each row is a bag of words from a sentence or document.
+JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
+ RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
+ RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))
+));
+StructType schema = new StructType(new StructField[]{
+ new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
+});
+DataFrame documentDF = sqlContext.createDataFrame(jrdd, schema);
+
+// Learn a mapping from words to Vectors.
+Word2Vec word2Vec = new Word2Vec()
+ .setInputCol("text")
+ .setOutputCol("result")
+ .setVectorSize(3)
+ .setMinCount(0);
+Word2VecModel model = word2Vec.fit(documentDF);
+DataFrame result = model.transform(documentDF);
+for (Row r: result.select("result").take(3)) {
+ System.out.println(r);
+}
+{% endhighlight %}
@@ -90,7 +162,22 @@ for more details on the API.
Refer to the [Word2Vec Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Word2Vec)
for more details on the API.
-{% include_example python/ml/word2vec_example.py %}
+{% highlight python %}
+from pyspark.ml.feature import Word2Vec
+
+# Input data: Each row is a bag of words from a sentence or document.
+documentDF = sqlContext.createDataFrame([
+ ("Hi I heard about Spark".split(" "), ),
+ ("I wish Java could use case classes".split(" "), ),
+ ("Logistic regression models are neat".split(" "), )
+], ["text"])
+# Learn a mapping from words to Vectors.
+word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
+model = word2Vec.fit(documentDF)
+result = model.transform(documentDF)
+for feature in result.select("result").take(3):
+ print(feature)
+{% endhighlight %}
@@ -138,7 +225,30 @@ Refer to the [CountVectorizer Scala docs](api/scala/index.html#org.apache.spark.
and the [CountVectorizerModel Scala docs](api/scala/index.html#org.apache.spark.ml.feature.CountVectorizerModel)
for more details on the API.
-{% include_example scala/org/apache/spark/examples/ml/CountVectorizerExample.scala %}
+{% highlight scala %}
+import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
+
+val df = sqlContext.createDataFrame(Seq(
+ (0, Array("a", "b", "c")),
+ (1, Array("a", "b", "b", "c", "a"))
+)).toDF("id", "words")
+
+// fit a CountVectorizerModel from the corpus
+val cvModel: CountVectorizerModel = new CountVectorizer()
+ .setInputCol("words")
+ .setOutputCol("features")
+ .setVocabSize(3)
+  .setMinDF(2) // a term must appear in at least 2 documents to be included in the vocabulary
+ .fit(df)
+
+// alternatively, define CountVectorizerModel with a-priori vocabulary
+val cvm = new CountVectorizerModel(Array("a", "b", "c"))
+ .setInputCol("words")
+ .setOutputCol("features")
+
+cvModel.transform(df).select("features").show()
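+
+// the a-priori model defined above can be applied the same way (a minimal sketch)
+cvm.transform(df).select("features").show()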
+{% endhighlight %}
@@ -147,7 +257,40 @@ Refer to the [CountVectorizer Java docs](api/java/org/apache/spark/ml/feature/Co
and the [CountVectorizerModel Java docs](api/java/org/apache/spark/ml/feature/CountVectorizerModel.html)
for more details on the API.
-{% include_example java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java %}
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.CountVectorizer;
+import org.apache.spark.ml.feature.CountVectorizerModel;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.*;
+
+// Input data: Each row is a bag of words from a sentence or document.
+JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(Arrays.asList("a", "b", "c")),
+ RowFactory.create(Arrays.asList("a", "b", "b", "c", "a"))
+));
+StructType schema = new StructType(new StructField[]{
+ new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
+});
+DataFrame df = sqlContext.createDataFrame(jrdd, schema);
+
+// fit a CountVectorizerModel from the corpus
+CountVectorizerModel cvModel = new CountVectorizer()
+ .setInputCol("text")
+ .setOutputCol("feature")
+ .setVocabSize(3)
+  .setMinDF(2) // a term must appear in at least 2 documents to be included in the vocabulary
+ .fit(df);
+
+// alternatively, define CountVectorizerModel with a-priori vocabulary
+CountVectorizerModel cvm = new CountVectorizerModel(new String[]{"a", "b", "c"})
+ .setInputCol("text")
+ .setOutputCol("feature");
+
+cvModel.transform(df).show();
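+
+// the a-priori model defined above can be applied the same way (a minimal sketch)
+cvm.transform(df).show();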
+{% endhighlight %}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizer.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizer.java
new file mode 100644
index 000000000000..e3e5a14f5d02
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBinarizer.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import java.util.Arrays;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.Binarizer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+/**
+ * An example demonstrating a Binarizer.
+ * Run with
+ *
+ * bin/run-example ml.JavaBinarizer
+ *
+ */
+public class JavaBinarizer {
+
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaBinarizer");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(0, 0.1),
+ RowFactory.create(1, 0.8),
+ RowFactory.create(2, 0.2)
+ ));
+ StructType schema = new StructType(new StructField[]{
+ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+ new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
+ });
+ DataFrame continuousDataFrame = jsql.createDataFrame(jrdd, schema);
+ Binarizer binarizer = new Binarizer()
+ .setInputCol("feature")
+ .setOutputCol("binarized_feature")
+ .setThreshold(0.5);
+ DataFrame binarizedDataFrame = binarizer.transform(continuousDataFrame);
+ DataFrame binarizedFeatures = binarizedDataFrame.select("binarized_feature");
+ for (Row r : binarizedFeatures.collect()) {
+      Double binarizedValue = r.getDouble(0);
+      System.out.println(binarizedValue);
+ }
+ }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizer.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizer.java
new file mode 100644
index 000000000000..f329e2d1caf9
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaBucketizer.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import java.util.Arrays;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.Bucketizer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+/**
+ * An example demonstrating a Bucketizer.
+ * Run with
+ *
+ * bin/run-example ml.JavaBucketizer
+ *
+ */
+public class JavaBucketizer {
+
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaBucketizer");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+ double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};
+
+    JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
+ RowFactory.create(-0.5),
+ RowFactory.create(-0.3),
+ RowFactory.create(0.0),
+ RowFactory.create(0.2)
+ ));
+ StructType schema = new StructType(new StructField[]{
+ new StructField("features", DataTypes.DoubleType, false, Metadata.empty())
+ });
+ DataFrame dataFrame = jsql.createDataFrame(data, schema);
+
+ Bucketizer bucketizer = new Bucketizer()
+ .setInputCol("features")
+ .setOutputCol("bucketedFeatures")
+ .setSplits(splits);
+
+ // Transform original data into its bucket index.
+ DataFrame bucketedData = bucketizer.transform(dataFrame);
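+    // Inspect the bucketed output (a minimal sketch; show() prints the bucket indices)
+    bucketedData.show();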
+
+ }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDCT.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDCT.java
new file mode 100644
index 000000000000..b71ef59c56e8
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDCT.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import java.util.Arrays;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.DCT;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+/**
+ * An example demonstrating a discrete cosine transform.
+ * Run with
+ *
+ * bin/run-example ml.JavaDCT
+ *
+ */
+public class JavaDCT {
+
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaDCT");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+    JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
+ RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
+ RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)),
+ RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0))
+ ));
+ StructType schema = new StructType(new StructField[]{
+ new StructField("features", new VectorUDT(), false, Metadata.empty()),
+ });
+ DataFrame df = jsql.createDataFrame(data, schema);
+ DCT dct = new DCT()
+ .setInputCol("features")
+ .setOutputCol("featuresDCT")
+ .setInverse(false);
+ DataFrame dctDf = dct.transform(df);
+ dctDf.select("featuresDCT").show(3);
+ }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProduct.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProduct.java
new file mode 100644
index 000000000000..61569a32c442
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaElementwiseProduct.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.ElementwiseProduct;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+/**
+ * An example demonstrating an ElementwiseProduct.
+ * Run with
+ *
+ * bin/run-example ml.JavaElementwiseProduct
+ *
+ */
+public class JavaElementwiseProduct {
+
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaElementwiseProduct");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+ // Create some vector data; also works for sparse vectors
+    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)),
+ RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0))
+ ));
+    List<StructField> fields = new ArrayList<>(2);
+    fields.add(DataTypes.createStructField("id", DataTypes.StringType, false));
+    fields.add(DataTypes.createStructField("vector", new VectorUDT(), false));
+ StructType schema = DataTypes.createStructType(fields);
+ DataFrame dataFrame = sqlContext.createDataFrame(jrdd, schema);
+ Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0);
+ ElementwiseProduct transformer = new ElementwiseProduct()
+ .setScalingVec(transformingVector)
+ .setInputCol("vector")
+ .setOutputCol("transformedVector");
+ // Batch transform the vectors to create new column:
+ transformer.transform(dataFrame).show();
+ }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaNGram.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaNGram.java
new file mode 100644
index 000000000000..a775b9ce911b
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaNGram.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.NGram;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.sql.SQLContext;
+
+/**
+ * An example demonstrating an n-gram.
+ * Run with
+ *
+ * bin/run-example ml.JavaNGram
+ *
+ */
+public class JavaNGram {
+
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaNGram");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(0.0, Arrays.asList("Hi", "I", "heard", "about", "Spark")),
+ RowFactory.create(1.0, Arrays.asList("I", "wish", "Java", "could", "use", "case", "classes")),
+ RowFactory.create(2.0, Arrays.asList("Logistic", "regression", "models", "are", "neat"))
+ ));
+ StructType schema = new StructType(new StructField[]{
+ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+ new StructField("words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
+ });
+ DataFrame wordDataFrame = sqlContext.createDataFrame(jrdd, schema);
+ NGram ngramTransformer = new NGram().setInputCol("words").setOutputCol("ngrams");
+ DataFrame ngramDataFrame = ngramTransformer.transform(wordDataFrame);
+ for (Row r : ngramDataFrame.select("ngrams", "label").take(3)) {
+      java.util.List<String> ngrams = r.getList(0);
+ for (String ngram : ngrams) System.out.print(ngram + " --- ");
+ System.out.println();
+ }
+ }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoder.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoder.java
new file mode 100644
index 000000000000..966f7f32a198
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaOneHotEncoder.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import java.util.Arrays;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.OneHotEncoder;
+import org.apache.spark.ml.feature.StringIndexer;
+import org.apache.spark.ml.feature.StringIndexerModel;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+/**
+ * An example demonstrating a one-hot encoding.
+ * Run with
+ *
+ * bin/run-example ml.JavaOneHotEncoder
+ *
+ */
+public class JavaOneHotEncoder {
+
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaOneHotEncoder");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(0, "a"),
+ RowFactory.create(1, "b"),
+ RowFactory.create(2, "c"),
+ RowFactory.create(3, "a"),
+ RowFactory.create(4, "a"),
+ RowFactory.create(5, "c")
+ ));
+ StructType schema = new StructType(new StructField[]{
+ new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
+ new StructField("category", DataTypes.StringType, false, Metadata.empty())
+ });
+ DataFrame df = sqlContext.createDataFrame(jrdd, schema);
+ StringIndexerModel indexer = new StringIndexer()
+ .setInputCol("category")
+ .setOutputCol("categoryIndex")
+ .fit(df);
+ DataFrame indexed = indexer.transform(df);
+
+ OneHotEncoder encoder = new OneHotEncoder()
+ .setInputCol("categoryIndex")
+ .setOutputCol("categoryVec");
+ DataFrame encoded = encoder.transform(indexed);
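+    // Inspect the encoded output (a minimal sketch; show() prints the encoded vectors)
+    encoded.show();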
+
+ }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java
new file mode 100644
index 000000000000..701b184c0c68
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPCAExample.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import java.util.Arrays;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.PCA;
+import org.apache.spark.ml.feature.PCAModel;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+/**
+ * An example demonstrating Principal Component Analysis (PCA).
+ * Run with
+ *
+ * bin/run-example ml.JavaPCAExample
+ *
+ */
+public class JavaPCAExample {
+
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaPCAExample");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+    JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
+ RowFactory.create(Vectors.sparse(5, new int[]{1, 3}, new double[]{1.0, 7.0})),
+ RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)),
+ RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))
+ ));
+ StructType schema = new StructType(new StructField[]{
+ new StructField("features", new VectorUDT(), false, Metadata.empty()),
+ });
+ DataFrame df = jsql.createDataFrame(data, schema);
+ PCAModel pca = new PCA()
+ .setInputCol("features")
+ .setOutputCol("pcaFeatures")
+ .setK(3)
+ .fit(df);
+ DataFrame result = pca.transform(df).select("pcaFeatures");
+ result.show();
+ }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansion.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansion.java
new file mode 100644
index 000000000000..60ff0ea20dbe
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaPolynomialExpansion.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import java.util.Arrays;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.PolynomialExpansion;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+/**
+ * An example demonstrating a polynomial expansion.
+ * Run with
+ *
+ * bin/run-example ml.JavaPolynomialExpansion
+ *
+ */
+public class JavaPolynomialExpansion {
+
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaPolynomialExpansion");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+ PolynomialExpansion polyExpansion = new PolynomialExpansion()
+ .setInputCol("features")
+ .setOutputCol("polyFeatures")
+ .setDegree(3);
+    JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
+ RowFactory.create(Vectors.dense(-2.0, 2.3)),
+ RowFactory.create(Vectors.dense(0.0, 0.0)),
+ RowFactory.create(Vectors.dense(0.6, -1.1))
+ ));
+ StructType schema = new StructType(new StructField[]{
+ new StructField("features", new VectorUDT(), false, Metadata.empty()),
+ });
+ DataFrame df = jsql.createDataFrame(data, schema);
+ DataFrame polyDF = polyExpansion.transform(df);
+    Row[] rows = polyDF.select("polyFeatures").take(3);
+    for (Row r : rows) {
+ System.out.println(r.get(0));
+ }
+ }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormula.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormula.java
new file mode 100644
index 000000000000..f7b90cc2f248
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRFormula.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import java.util.Arrays;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.RFormula;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.*;
+
+import static org.apache.spark.sql.types.DataTypes.*;
+
+/**
+ * An example demonstrating an RFormula.
+ * Run with
+ *
+ * bin/run-example ml.JavaRFormula
+ *
+ */
+public class JavaRFormula {
+
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaRFormula");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+ StructType schema = createStructType(new StructField[]{
+ createStructField("id", IntegerType, false),
+ createStructField("country", StringType, false),
+ createStructField("hour", IntegerType, false),
+ createStructField("clicked", DoubleType, false)
+ });
+
+    JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(7, "US", 18, 1.0),
+ RowFactory.create(8, "CA", 12, 0.0),
+ RowFactory.create(9, "NZ", 15, 0.0)
+ ));
+
+ DataFrame dataset = sqlContext.createDataFrame(rdd, schema);
+ RFormula formula = new RFormula()
+ .setFormula("clicked ~ country + hour")
+ .setFeaturesCol("features")
+ .setLabelCol("label");
+ DataFrame output = formula.fit(dataset).transform(dataset);
+ output.select("features", "label").show();
+ }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemover.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemover.java
new file mode 100644
index 000000000000..d31b076edc9a
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStopWordsRemover.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import java.util.Arrays;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.StopWordsRemover;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+/**
+ * An example demonstrating a stop words remover.
+ * Run with
+ *
+ * bin/run-example ml.JavaStopWordsRemover
+ *
+ */
+public class JavaStopWordsRemover {
+
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaStopWordsRemover");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+ StopWordsRemover remover = new StopWordsRemover()
+ .setInputCol("raw")
+ .setOutputCol("filtered");
+
+    JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")),
+ RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
+ ));
+ StructType schema = new StructType(new StructField[]{
+ new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty())
+ });
+ DataFrame dataset = jsql.createDataFrame(rdd, schema);
+ remover.transform(dataset).show();
+ }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexer.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexer.java
new file mode 100644
index 000000000000..81716d7b1d13
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaStringIndexer.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import java.util.Arrays;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.StringIndexer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+import static org.apache.spark.sql.types.DataTypes.*;
+
+/**
+ * An example demonstrating a string indexer.
+ * Run with
+ *
+ * bin/run-example ml.JavaStringIndexer
+ *
+ */
+public class JavaStringIndexer {
+
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaStringIndexer");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(0, "a"),
+ RowFactory.create(1, "b"),
+ RowFactory.create(2, "c"),
+ RowFactory.create(3, "a"),
+ RowFactory.create(4, "a"),
+ RowFactory.create(5, "c")
+ ));
+ StructType schema = new StructType(new StructField[]{
+ createStructField("id", DoubleType, false),
+ createStructField("category", StringType, false)
+ });
+ DataFrame df = sqlContext.createDataFrame(jrdd, schema);
+ StringIndexer indexer = new StringIndexer()
+ .setInputCol("category")
+ .setOutputCol("categoryIndex");
+ DataFrame indexed = indexer.fit(df).transform(df);
+ indexed.show();
+ }
+}
\ No newline at end of file
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizer.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizer.java
new file mode 100644
index 000000000000..ce0829e76e35
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaTokenizer.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import java.util.Arrays;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.RegexTokenizer;
+import org.apache.spark.ml.feature.Tokenizer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+/**
+ * An example demonstrating a tokenizer.
+ * Run with
+ *
+ * bin/run-example ml.JavaTokenizer
+ *
+ */
+public class JavaTokenizer {
+
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaTokenizer");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
+ RowFactory.create(0, "Hi I heard about Spark"),
+ RowFactory.create(1, "I wish Java could use case classes"),
+ RowFactory.create(2, "Logistic,regression,models,are,neat")
+ ));
+ StructType schema = new StructType(new StructField[]{
+ new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+ new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
+ });
+ DataFrame sentenceDataFrame = sqlContext.createDataFrame(jrdd, schema);
+ Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");
+ DataFrame wordsDataFrame = tokenizer.transform(sentenceDataFrame);
+    for (Row r : wordsDataFrame.select("words", "label").take(3)) {
+      java.util.List<String> words = r.getList(0);
+ for (String word : words) System.out.print(word + " ");
+ System.out.println();
+ }
+
+ RegexTokenizer regexTokenizer = new RegexTokenizer()
+ .setInputCol("sentence")
+ .setOutputCol("words")
+ .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false);
+ }
+}
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssembler.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssembler.java
new file mode 100644
index 000000000000..14f74276a012
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorAssembler.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import java.util.Arrays;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.VectorAssembler;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.*;
+
+import static org.apache.spark.sql.types.DataTypes.*;
+
+/**
+ * An example demonstrating a vector assembler.
+ * Run with
+ *
+ * bin/run-example ml.JavaVectorAssembler
+ *
+ */
+public class JavaVectorAssembler {
+
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaVectorAssembler");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext sqlContext = new SQLContext(jsc);
+
+ StructType schema = createStructType(new StructField[]{
+ createStructField("id", IntegerType, false),
+ createStructField("hour", IntegerType, false),
+ createStructField("mobile", DoubleType, false),
+ createStructField("userFeatures", new VectorUDT(), false),
+ createStructField("clicked", DoubleType, false)
+ });
+ Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
+    JavaRDD<Row> rdd = jsc.parallelize(Arrays.asList(row));
+ DataFrame dataset = sqlContext.createDataFrame(rdd, schema);
+
+ VectorAssembler assembler = new VectorAssembler()
+ .setInputCols(new String[]{"hour", "mobile", "userFeatures"})
+ .setOutputCol("features");
+
+ DataFrame output = assembler.transform(dataset);
+ System.out.println(output.select("features", "clicked").first());
+ }
+}
+
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicer.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicer.java
new file mode 100644
index 000000000000..24d9296d8460
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSlicer.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import com.google.common.collect.Lists;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.attribute.Attribute;
+import org.apache.spark.ml.attribute.AttributeGroup;
+import org.apache.spark.ml.attribute.NumericAttribute;
+import org.apache.spark.ml.feature.VectorSlicer;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.*;
+
+/**
+ * An example demonstrating a vector slicer.
+ * Run with
+ *
+ * bin/run-example ml.JavaVectorSlicer
+ *
+ */
+public class JavaVectorSlicer {
+
+ public static void main(String[] args) {
+ SparkConf conf = new SparkConf().setAppName("JavaVectorAssembler");
+ JavaSparkContext jsc = new JavaSparkContext(conf);
+ SQLContext jsql = new SQLContext(jsc);
+
+ Attribute[] attrs = new Attribute[]{
+ NumericAttribute.defaultAttr().withName("f1"),
+ NumericAttribute.defaultAttr().withName("f2"),
+ NumericAttribute.defaultAttr().withName("f3")
+ };
+ AttributeGroup group = new AttributeGroup("userFeatures", attrs);
+
+    JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
+ RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})),
+ RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0))
+ ));
+
+ DataFrame dataset = jsql.createDataFrame(jrdd, (new StructType()).add(group.toStructField()));
+
+ VectorSlicer vectorSlicer = new VectorSlicer()
+ .setInputCol("userFeatures").setOutputCol("features");
+
+ vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"});
+    // or vectorSlicer.setIndices(new int[]{1, 2}), or vectorSlicer.setNames(new String[]{"f2", "f3"})
+
+ DataFrame output = vectorSlicer.transform(dataset);
+
+ System.out.println(output.select("userFeatures", "features").first());
+ }
+}
+
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala
new file mode 100644
index 000000000000..4dacba9c6b59
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/BinarizerExample.scala
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml
+
+import org.apache.spark.{SparkContext, SparkConf}
+import org.apache.spark.ml.feature.Binarizer
+import org.apache.spark.sql.{SQLContext, DataFrame}
+
+/**
+ * An example runner for binarizer. Run with
+ * {{{
+ * ./bin/run-example ml.BinarizerExample [options]
+ * }}}
+ */
+object BinarizerExample {
+
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("BinarizerExample")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    val data = Array((0, 0.1), (1, 0.8), (2, 0.2))
+    val dataFrame: DataFrame = sqlContext.createDataFrame(data).toDF("label", "feature")
+
+    val binarizer: Binarizer = new Binarizer()
+      .setInputCol("feature")
+      .setOutputCol("binarized_feature")
+      .setThreshold(0.5)
+
+    val binarizedDataFrame = binarizer.transform(dataFrame)
+    val binarizedFeatures = binarizedDataFrame.select("binarized_feature")
+    binarizedFeatures.collect().foreach(println)
+
+    sc.stop()
+  }
+}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala
new file mode 100644
index 000000000000..dc592c875aad
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/BucketizerExample.scala
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml
+
+import org.apache.spark.{SparkContext, SparkConf}
+import org.apache.spark.ml.feature.Bucketizer
+import org.apache.spark.sql.{SQLContext, DataFrame}
+
+/**
+ * An example runner for bucketizer. Run with
+ * {{{
+ * ./bin/run-example ml.BucketizerExample [options]
+ * }}}
+ */
+object BucketizerExample {
+
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("BucketizerExample")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)
+
+    val data = Array(-0.5, -0.3, 0.0, 0.2)
+    val dataFrame = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+
+    val bucketizer = new Bucketizer()
+      .setInputCol("features")
+      .setOutputCol("bucketedFeatures")
+      .setSplits(splits)
+
+    // Transform original data into its bucket index.
+    val bucketedData = bucketizer.transform(dataFrame)
+    bucketedData.show()
+
+    sc.stop()
+  }
+}
+
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala
new file mode 100644
index 000000000000..1472cce070af
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DCTExample.scala
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml
+
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkContext, SparkConf}
+import org.apache.spark.ml.feature.DCT
+import org.apache.spark.mllib.linalg.Vectors
+
+/**
+ * An example runner for discrete cosine transform. Run with
+ * {{{
+ * ./bin/run-example ml.DCTExample [options]
+ * }}}
+ */
+object DCTExample {
+
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("DCTExample")
+    val sc = new SparkContext(conf)
+    val sqlContext = new SQLContext(sc)
+
+    val data = Seq(
+      Vectors.dense(0.0, 1.0, -2.0, 3.0),
+      Vectors.dense(-1.0, 2.0, 4.0, -7.0),
+      Vectors.dense(14.0, -2.0, -5.0, 1.0))
+    val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+
+    val dct = new DCT()
+      .setInputCol("features")
+      .setOutputCol("featuresDCT")
+      .setInverse(false)
+
+    val dctDf = dct.transform(df)
+    dctDf.select("featuresDCT").show(3)
+
+    sc.stop()
+  }
+}
+
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ElementWiseProductExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ElementWiseProductExample.scala
new file mode 100644
index 000000000000..ad5217278e39
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/ElementWiseProductExample.scala
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml
+
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.{SparkContext, SparkConf}
+import org.apache.spark.ml.feature.ElementwiseProduct
+import org.apache.spark.mllib.linalg.Vectors
+
+/**
+ * An example runner for element wise product. Run with
+ * {{{
+ * ./bin/run-example ml.ElementWiseProductExample [options]
+ * }}}
+ */
+object ElementWiseProductExample {
+
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("ElementWiseProductExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ // Create some vector data; also works for sparse vectors
+ val dataFrame = sqlContext.createDataFrame(Seq(
+ ("a", Vectors.dense(1.0, 2.0, 3.0)),
+ ("b", Vectors.dense(4.0, 5.0, 6.0)))).toDF("id", "vector")
+
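+ // Each output element is input(i) * scalingVec(i), i.e. the Hadamard product with the scaling vector.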
+ val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
+ val transformer = new ElementwiseProduct()
+ .setScalingVec(transformingVector)
+ .setInputCol("vector")
+ .setOutputCol("transformedVector")
+
+ // Batch transform the vectors to create new column:
+ transformer.transform(dataFrame).show()
+
+ sc.stop()
+ }
+}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala
new file mode 100644
index 000000000000..6334caa7c4df
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/MinMaxScalerExample.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.ml.feature.MinMaxScaler
+import org.apache.spark.sql.SQLContext
+
+/**
+ * An example runner for min-max scaler. Run with
+ * {{{
+ * ./bin/run-example ml.MinMaxScalerExample [options]
+ * }}}
+ */
+object MinMaxScalerExample {
+
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("MinMaxScalerExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ val dataFrame = sqlContext.read.format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt")
+ val scaler = new MinMaxScaler()
+ .setInputCol("features")
+ .setOutputCol("scaledFeatures")
+
+ // Compute summary statistics and generate MinMaxScalerModel
+ val scalerModel = scaler.fit(dataFrame)
+
+ // Rescale each feature to the range [min, max] (defaults to [0.0, 1.0]).
+ val scaledData = scalerModel.transform(dataFrame)
+ scaledData.show()
+
+ sc.stop()
+ }
+}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala
new file mode 100644
index 000000000000..6e56cde93803
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/NGramExample.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+
+package org.apache.spark.examples.ml
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.ml.feature.NGram
+import org.apache.spark.sql.SQLContext
+
+/**
+ * An example runner for n-gram. Run with
+ * {{{
+ * ./bin/run-example ml.NGramExample [options]
+ * }}}
+ */
+object NGramExample {
+
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("NGramExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ val wordDataFrame = sqlContext.createDataFrame(Seq(
+ (0, Array("Hi", "I", "heard", "about", "Spark")),
+ (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
+ (2, Array("Logistic", "regression", "models", "are", "neat"))
+ )).toDF("label", "words")
+
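+ // With the default n = 2, each output row contains the bigrams of its input words.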
+ val ngram = new NGram().setInputCol("words").setOutputCol("ngrams")
+ val ngramDataFrame = ngram.transform(wordDataFrame)
+ ngramDataFrame.take(3).map(_.getAs[Stream[String]]("ngrams").toList).foreach(println)
+
+ sc.stop()
+ }
+}
+
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala
new file mode 100644
index 000000000000..f07c9c83c66d
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/NormalizerExample.scala
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.ml.feature.Normalizer
+import org.apache.spark.sql.SQLContext
+
+/**
+ * An example runner for normalizer. Run with
+ * {{{
+ * ./bin/run-example ml.NormalizerExample [options]
+ * }}}
+ */
+object NormalizerExample {
+
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("NormalizerExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ val dataFrame = sqlContext.read.format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt")
+
+ // Normalize each Vector using $L^1$ norm.
+ val normalizer = new Normalizer()
+ .setInputCol("features")
+ .setOutputCol("normFeatures")
+ .setP(1.0)
+ val l1NormData = normalizer.transform(dataFrame)
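+ // Show the rows rescaled to unit $L^1$ norm.
+ l1NormData.show()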
+
+ // Normalize each Vector using $L^\infty$ norm.
+ val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity)
+ lInfNormData.show()
+
+ sc.stop()
+ }
+}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala
new file mode 100644
index 000000000000..446fb03a222b
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/OneHotEncoderExample.scala
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}
+import org.apache.spark.sql.SQLContext
+
+/**
+ * An example runner for one-hot encoding. Run with
+ * {{{
+ * ./bin/run-example ml.OneHotEncoderExample [options]
+ * }}}
+ */
+object OneHotEncoderExample {
+
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("OneHotEncoderExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ val df = sqlContext.createDataFrame(Seq(
+ (0, "a"),
+ (1, "b"),
+ (2, "c"),
+ (3, "a"),
+ (4, "a"),
+ (5, "c")
+ )).toDF("id", "category")
+
+ val indexer = new StringIndexer()
+ .setInputCol("category")
+ .setOutputCol("categoryIndex")
+ .fit(df)
+ val indexed = indexer.transform(df)
+
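+ // Map each category index to a binary vector; by default the last category is dropped.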
+ val encoder = new OneHotEncoder()
+ .setInputCol("categoryIndex")
+ .setOutputCol("categoryVec")
+ val encoded = encoder.transform(indexed)
+ encoded.select("id", "categoryVec").show()
+
+ sc.stop()
+ }
+}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala
new file mode 100644
index 000000000000..c2e9a2f2057b
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/PCAExample.scala
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.ml.feature.PCA
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.sql.SQLContext
+
+/**
+ * An example runner for principal component analysis (PCA). Run with
+ * {{{
+ * ./bin/run-example ml.PCAExample [options]
+ * }}}
+ */
+object PCAExample {
+
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("PCAExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ val data = Array(
+ Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
+ Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
+ Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
+ )
+ val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
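+
+ // Project the 5-dimensional feature vectors onto their first 3 principal components.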
+ val pca = new PCA()
+ .setInputCol("features")
+ .setOutputCol("pcaFeatures")
+ .setK(3)
+ .fit(df)
+ val pcaDF = pca.transform(df)
+ val result = pcaDF.select("pcaFeatures")
+ result.show()
+
+ sc.stop()
+ }
+}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala
new file mode 100644
index 000000000000..4fa16b6ef491
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/PolynomialExpansionExample.scala
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+
+package org.apache.spark.examples.ml
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.ml.feature.PolynomialExpansion
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.sql.SQLContext
+
+/**
+ * An example runner for polynomial expansion. Run with
+ * {{{
+ * ./bin/run-example ml.PolynomialExpansionExample [options]
+ * }}}
+ */
+object PolynomialExpansionExample {
+
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("PolynomialExpansionExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ val data = Array(
+ Vectors.dense(-2.0, 2.3),
+ Vectors.dense(0.0, 0.0),
+ Vectors.dense(0.6, -1.1)
+ )
+ val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
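+
+ // Expand each 2-dimensional vector into the degree-3 polynomial feature space.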
+ val polynomialExpansion = new PolynomialExpansion()
+ .setInputCol("features")
+ .setOutputCol("polyFeatures")
+ .setDegree(3)
+ val polyDF = polynomialExpansion.transform(df)
+ polyDF.select("polyFeatures").take(3).foreach(println)
+
+ sc.stop()
+ }
+}
+
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala
new file mode 100644
index 000000000000..e50e2f07403e
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/RFormulaExample.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.ml.feature.RFormula
+import org.apache.spark.sql.SQLContext
+
+/**
+ * An example runner for R-formula. Run with
+ * {{{
+ * ./bin/run-example ml.RFormulaExample [options]
+ * }}}
+ */
+object RFormulaExample {
+
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("RFormulaExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ val dataset = sqlContext.createDataFrame(Seq(
+ (7, "US", 18, 1.0),
+ (8, "CA", 12, 0.0),
+ (9, "NZ", 15, 0.0)
+ )).toDF("id", "country", "hour", "clicked")
+ val formula = new RFormula()
+ .setFormula("clicked ~ country + hour")
+ .setFeaturesCol("features")
+ .setLabelCol("label")
+ val output = formula.fit(dataset).transform(dataset)
+ output.select("features", "label").show()
+
+ sc.stop()
+ }
+}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala
new file mode 100644
index 000000000000..e2150001682d
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/StandardScalerExample.scala
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.ml.feature.StandardScaler
+import org.apache.spark.sql.SQLContext
+
+/**
+ * An example runner for standard scaler. Run with
+ * {{{
+ * ./bin/run-example ml.StandardScalerExample [options]
+ * }}}
+ */
+object StandardScalerExample {
+
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("StandardScalerExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ val dataFrame = sqlContext.read.format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt")
+ val scaler = new StandardScaler()
+ .setInputCol("features")
+ .setOutputCol("scaledFeatures")
+ .setWithStd(true)
+ .setWithMean(false)
+
+ // Compute summary statistics by fitting the StandardScaler
+ val scalerModel = scaler.fit(dataFrame)
+
+ // Normalize each feature to have unit standard deviation.
+ val scaledData = scalerModel.transform(dataFrame)
+ scaledData.show()
+
+ sc.stop()
+ }
+}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
new file mode 100644
index 000000000000..2109a5ebc146
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/StopWordsRemoverExample.scala
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.ml.feature.StopWordsRemover
+import org.apache.spark.sql.SQLContext
+
+/**
+ * An example runner for stop words remover. Run with
+ * {{{
+ * ./bin/run-example ml.StopWordsRemoverExample [options]
+ * }}}
+ */
+object StopWordsRemoverExample {
+
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("StopWordsRemoverExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
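+ // Drop the default English stop words from the "raw" column.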
+ val remover = new StopWordsRemover()
+ .setInputCol("raw")
+ .setOutputCol("filtered")
+ val dataSet = sqlContext.createDataFrame(Seq(
+ (0, Seq("I", "saw", "the", "red", "baloon")),
+ (1, Seq("Mary", "had", "a", "little", "lamb"))
+ )).toDF("id", "raw")
+
+ remover.transform(dataSet).show()
+
+ sc.stop()
+ }
+}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
new file mode 100644
index 000000000000..e858f64d52fd
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/StringIndexerExample.scala
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.ml.feature.StringIndexer
+import org.apache.spark.sql.SQLContext
+
+/**
+ * An example runner for string indexer. Run with
+ * {{{
+ * ./bin/run-example ml.StringIndexerExample [options]
+ * }}}
+ */
+object StringIndexerExample {
+
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("StringIndexerExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ val df = sqlContext.createDataFrame(
+ Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"))
+ ).toDF("id", "category")
+ val indexer = new StringIndexer()
+ .setInputCol("category")
+ .setOutputCol("categoryIndex")
+ val indexed = indexer.fit(df).transform(df)
+ indexed.show()
+
+ sc.stop()
+ }
+}
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
new file mode 100644
index 000000000000..b3b28791fac5
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/TokenizerExample.scala
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
+import org.apache.spark.sql.SQLContext
+
+/**
+ * An example runner for tokenizer. Run with
+ * {{{
+ * ./bin/run-example ml.TokenizerExample [options]
+ * }}}
+ */
+object TokenizerExample {
+
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("TokenizerExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ val sentenceDataFrame = sqlContext.createDataFrame(Seq(
+ (0, "Hi I heard about Spark"),
+ (1, "I wish Java could use case classes"),
+ (2, "Logistic,regression,models,are,neat")
+ )).toDF("label", "sentence")
+
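+ // Tokenizer lowercases and splits on whitespace; RegexTokenizer splits on the given regex.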
+ val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
+ val regexTokenizer = new RegexTokenizer()
+ .setInputCol("sentence")
+ .setOutputCol("words")
+ .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)
+
+ val tokenized = tokenizer.transform(sentenceDataFrame)
+ tokenized.select("words", "label").take(3).foreach(println)
+ val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
+ regexTokenized.select("words", "label").take(3).foreach(println)
+
+ sc.stop()
+ }
+}
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala
new file mode 100644
index 000000000000..bae230a2aedb
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorAssemblerExample.scala
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+
+package org.apache.spark.examples.ml
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.ml.feature.VectorAssembler
+import org.apache.spark.mllib.linalg.Vectors
+import org.apache.spark.sql.SQLContext
+
+/**
+ * An example runner for vector assembler. Run with
+ * {{{
+ * ./bin/run-example ml.VectorAssemblerExample [options]
+ * }}}
+ */
+object VectorAssemblerExample {
+
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("VectorAssemblerExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ val dataset = sqlContext.createDataFrame(
+ Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0))
+ ).toDF("id", "hour", "mobile", "userFeatures", "clicked")
+ val assembler = new VectorAssembler()
+ .setInputCols(Array("hour", "mobile", "userFeatures"))
+ .setOutputCol("features")
+ val output = assembler.transform(dataset)
+ println(output.select("features", "clicked").first())
+
+ sc.stop()
+ }
+}
+
+// scalastyle:on println
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala
new file mode 100644
index 000000000000..480bac0d4092
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+
+package org.apache.spark.examples.ml
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.ml.feature.VectorIndexer
+import org.apache.spark.sql.SQLContext
+
+/**
+ * An example runner for vector indexer. Run with
+ * {{{
+ * ./bin/run-example ml.VectorIndexerExample [options]
+ * }}}
+ */
+object VectorIndexerExample {
+
+ def main(args: Array[String]): Unit = {
+ val conf = new SparkConf().setAppName("VectorIndexerExample")
+ val sc = new SparkContext(conf)
+ val sqlContext = new SQLContext(sc)
+
+ val data = sqlContext.read.format("libsvm")
+ .load("data/mllib/sample_libsvm_data.txt")
+ val indexer = new VectorIndexer()
+ .setInputCol("features")
+ .setOutputCol("indexed")
+ .setMaxCategories(10)
+ val indexerModel = indexer.fit(data)
+ val categoricalFeatures: Set[Int] = indexerModel.categoryMaps.keys.toSet
+ println(s"Chose ${categoricalFeatures.size} categorical features: " +
+ categoricalFeatures.mkString(", "))
+
+ // Create new column "indexed" with categorical values transformed to indices
+ val indexedData = indexerModel.transform(data)
+
+ sc.stop()
+ }
+}
+
+// scalastyle:on println