159 changes: 151 additions & 8 deletions docs/ml-features.md
@@ -37,23 +37,40 @@ In the following code segment, we start with a set of sentences. We split each sentence into words using `Tokenizer`.
Refer to the [HashingTF Scala docs](api/scala/index.html#org.apache.spark.ml.feature.HashingTF) and
the [IDF Scala docs](api/scala/index.html#org.apache.spark.ml.feature.IDF) for more details on the API.

{% include_example scala/org/apache/spark/examples/ml/TfIdfExample.scala %}
{% include_example scala/org/apache/spark/examples/ml/HashingTF.scala %}
Contributor comment:
There is no need to change TfIdfExample to HashingTF. You can use the snippets in the Feature Extractors section as a reference, since TF-IDF, Word2Vec, and CountVectorizer have already been written with include_example.
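For reference, the Scala snippet that `include_example` pulls in lives in the examples module and is delimited by `$example on$` / `$example off$` markers. The sketch below approximates what `scala/org/apache/spark/examples/ml/TfIdfExample.scala` covers, assuming a `sqlContext` is in scope; it is not the verbatim file.

{% highlight scala %}
// Approximate contents of TfIdfExample.scala (a sketch, not the verbatim example file).
// $example on$
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}

val sentenceData = sqlContext.createDataFrame(Seq(
  (0, "Hi I heard about Spark"),
  (0, "I wish Java could use case classes"),
  (1, "Logistic regression models are neat")
)).toDF("label", "sentence")

// Split each sentence into words.
val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
val wordsData = tokenizer.transform(sentenceData)

// Hash each bag of words into a fixed-length raw feature vector.
val hashingTF = new HashingTF()
  .setInputCol("words")
  .setOutputCol("rawFeatures")
  .setNumFeatures(20)
val featurizedData = hashingTF.transform(wordsData)

// Rescale raw term frequencies by inverse document frequency.
val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
val idfModel = idf.fit(featurizedData)
val rescaledData = idfModel.transform(featurizedData)
rescaledData.select("features", "label").take(3).foreach(println)
// $example off$
{% endhighlight %}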

</div>

<div data-lang="java" markdown="1">

Refer to the [HashingTF Java docs](api/java/org/apache/spark/ml/feature/HashingTF.html) and the
[IDF Java docs](api/java/org/apache/spark/ml/feature/IDF.html) for more details on the API.

{% include_example java/org/apache/spark/examples/ml/JavaTfIdfExample.java %}
{% include_example java/org/apache/spark/examples/ml/JavaHashingTF.java %}
</div>

<div data-lang="python" markdown="1">

Refer to the [HashingTF Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.HashingTF) and
the [IDF Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.IDF) for more details on the API.

{% include_example python/ml/tf_idf_example.py %}
{% highlight python %}
Contributor comment:
Here, do not change the example imported by include_example back to raw code. The JIRA issue aims to replace all raw code snippets with include_example.

from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sentenceData = sqlContext.createDataFrame([
    (0, "Hi I heard about Spark"),
    (0, "I wish Java could use case classes"),
    (1, "Logistic regression models are neat")
], ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
for features_label in rescaledData.select("features", "label").take(3):
    print(features_label)
{% endhighlight %}
</div>
</div>

@@ -74,23 +91,93 @@ In the following code segment, we start with a set of documents, each of which is represented as a sequence of words.
Refer to the [Word2Vec Scala docs](api/scala/index.html#org.apache.spark.ml.feature.Word2Vec)
for more details on the API.

{% include_example scala/org/apache/spark/examples/ml/Word2VecExample.scala %}
{% highlight scala %}
import org.apache.spark.ml.feature.Word2Vec

// Input data: Each row is a bag of words from a sentence or document.
val documentDF = sqlContext.createDataFrame(Seq(
  "Hi I heard about Spark".split(" "),
  "I wish Java could use case classes".split(" "),
  "Logistic regression models are neat".split(" ")
).map(Tuple1.apply)).toDF("text")

// Learn a mapping from words to Vectors.
val word2Vec = new Word2Vec()
  .setInputCol("text")
  .setOutputCol("result")
  .setVectorSize(3)
  .setMinCount(0)
val model = word2Vec.fit(documentDF)
val result = model.transform(documentDF)
result.select("result").take(3).foreach(println)
{% endhighlight %}
</div>

<div data-lang="java" markdown="1">

Refer to the [Word2Vec Java docs](api/java/org/apache/spark/ml/feature/Word2Vec.html)
for more details on the API.

{% include_example java/org/apache/spark/examples/ml/JavaWord2VecExample.java %}
{% highlight java %}
import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.feature.Word2Vec;
import org.apache.spark.ml.feature.Word2VecModel;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.*;

JavaSparkContext jsc = ...
SQLContext sqlContext = ...

// Input data: Each row is a bag of words from a sentence or document.
JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
  RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
  RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
  RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))
));
StructType schema = new StructType(new StructField[]{
  new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
});
DataFrame documentDF = sqlContext.createDataFrame(jrdd, schema);

// Learn a mapping from words to Vectors.
Word2Vec word2Vec = new Word2Vec()
  .setInputCol("text")
  .setOutputCol("result")
  .setVectorSize(3)
  .setMinCount(0);
Word2VecModel model = word2Vec.fit(documentDF);
DataFrame result = model.transform(documentDF);
for (Row r : result.select("result").take(3)) {
  System.out.println(r);
}
{% endhighlight %}
</div>

<div data-lang="python" markdown="1">

Refer to the [Word2Vec Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.Word2Vec)
for more details on the API.

{% include_example python/ml/word2vec_example.py %}
{% highlight python %}
from pyspark.ml.feature import Word2Vec

# Input data: Each row is a bag of words from a sentence or document.
documentDF = sqlContext.createDataFrame([
    ("Hi I heard about Spark".split(" "), ),
    ("I wish Java could use case classes".split(" "), ),
    ("Logistic regression models are neat".split(" "), )
], ["text"])

# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)
result = model.transform(documentDF)
for feature in result.select("result").take(3):
    print(feature)
{% endhighlight %}
</div>
</div>

@@ -138,7 +225,30 @@ Refer to the [CountVectorizer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.CountVectorizer)
and the [CountVectorizerModel Scala docs](api/scala/index.html#org.apache.spark.ml.feature.CountVectorizerModel)
for more details on the API.

{% include_example scala/org/apache/spark/examples/ml/CountVectorizerExample.scala %}
{% highlight scala %}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}

val df = sqlContext.createDataFrame(Seq(
  (0, Array("a", "b", "c")),
  (1, Array("a", "b", "b", "c", "a"))
)).toDF("id", "words")

// fit a CountVectorizerModel from the corpus
val cvModel: CountVectorizerModel = new CountVectorizer()
  .setInputCol("words")
  .setOutputCol("features")
  .setVocabSize(3)
  .setMinDF(2) // a term must appear in at least 2 documents to be included in the vocabulary
  .fit(df)

// alternatively, define CountVectorizerModel with a-priori vocabulary
val cvm = new CountVectorizerModel(Array("a", "b", "c"))
  .setInputCol("words")
  .setOutputCol("features")

cvModel.transform(df).select("features").show()
{% endhighlight %}
</div>

<div data-lang="java" markdown="1">
@@ -147,7 +257,40 @@ Refer to the [CountVectorizer Java docs](api/java/org/apache/spark/ml/feature/CountVectorizer.html)
and the [CountVectorizerModel Java docs](api/java/org/apache/spark/ml/feature/CountVectorizerModel.html)
for more details on the API.

{% include_example java/org/apache/spark/examples/ml/JavaCountVectorizerExample.java %}
{% highlight java %}
import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.feature.CountVectorizer;
import org.apache.spark.ml.feature.CountVectorizerModel;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.*;

JavaSparkContext jsc = ...
SQLContext sqlContext = ...

// Input data: Each row is a bag of words from a sentence or document.
JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
  RowFactory.create(Arrays.asList("a", "b", "c")),
  RowFactory.create(Arrays.asList("a", "b", "b", "c", "a"))
));
StructType schema = new StructType(new StructField[]{
  new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
});
DataFrame df = sqlContext.createDataFrame(jrdd, schema);

// fit a CountVectorizerModel from the corpus
CountVectorizerModel cvModel = new CountVectorizer()
  .setInputCol("text")
  .setOutputCol("feature")
  .setVocabSize(3)
  .setMinDF(2) // a term must appear in at least 2 documents to be included in the vocabulary
  .fit(df);

// alternatively, define CountVectorizerModel with a-priori vocabulary
CountVectorizerModel cvm = new CountVectorizerModel(new String[]{"a", "b", "c"})
  .setInputCol("text")
  .setOutputCol("feature");

cvModel.transform(df).show();
{% endhighlight %}
</div>
</div>

New file: JavaBinarizer.java
@@ -0,0 +1,70 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.examples.ml;

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.feature.Binarizer;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

/**
 * An example demonstrating a Binarizer.
 * Run with
 * <pre>
 * bin/run-example ml.JavaBinarizer
 * </pre>
 */
public class JavaBinarizer {

  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaBinarizer");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext jsql = new SQLContext(jsc);

    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
      RowFactory.create(0, 0.1),
      RowFactory.create(1, 0.8),
      RowFactory.create(2, 0.2)
    ));
    // The label values above are integers, so the schema declares IntegerType for them.
    StructType schema = new StructType(new StructField[]{
      new StructField("label", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
    });
    DataFrame continuousDataFrame = jsql.createDataFrame(jrdd, schema);

    Binarizer binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized_feature")
      .setThreshold(0.5);
    DataFrame binarizedDataFrame = binarizer.transform(continuousDataFrame);
    DataFrame binarizedFeatures = binarizedDataFrame.select("binarized_feature");
    for (Row r : binarizedFeatures.collect()) {
      Double binarized_value = r.getDouble(0);
      System.out.println(binarized_value);
    }

    jsc.stop();
  }
}
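
For context (not part of this diff), the Scala counterpart to this Binarizer usage in the ml-features guide looks roughly like the following sketch, assuming a `sqlContext` is already in scope:

{% highlight scala %}
import org.apache.spark.ml.feature.Binarizer

// Same toy data as in the Java example above.
val data = Array((0, 0.1), (1, 0.8), (2, 0.2))
val dataFrame = sqlContext.createDataFrame(data).toDF("label", "feature")

// Threshold of 0.5: feature values above 0.5 become 1.0, the rest become 0.0.
val binarizer: Binarizer = new Binarizer()
  .setInputCol("feature")
  .setOutputCol("binarized_feature")
  .setThreshold(0.5)

val binarizedDataFrame = binarizer.transform(dataFrame)
binarizedDataFrame.select("binarized_feature").collect().foreach(println)
{% endhighlight %}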
New file: JavaBucketizer.java
@@ -0,0 +1,73 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.examples.ml;

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.ml.feature.Bucketizer;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

/**
 * An example demonstrating a Bucketizer.
 * Run with
 * <pre>
 * bin/run-example ml.JavaBucketizer
 * </pre>
 */
public class JavaBucketizer {

  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaBucketizer");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext jsql = new SQLContext(jsc);

    double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};

    JavaRDD<Row> data = jsc.parallelize(Arrays.asList(
      RowFactory.create(-0.5),
      RowFactory.create(-0.3),
      RowFactory.create(0.0),
      RowFactory.create(0.2)
    ));
    StructType schema = new StructType(new StructField[]{
      new StructField("features", DataTypes.DoubleType, false, Metadata.empty())
    });
    DataFrame dataFrame = jsql.createDataFrame(data, schema);

    Bucketizer bucketizer = new Bucketizer()
      .setInputCol("features")
      .setOutputCol("bucketedFeatures")
      .setSplits(splits);

    // Transform original data into its bucket index and show the result.
    DataFrame bucketedData = bucketizer.transform(dataFrame);
    bucketedData.show();

    jsc.stop();
  }
}
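
Similarly (again not part of this diff), a rough Scala sketch of the same Bucketizer usage, assuming a `sqlContext` is in scope:

{% highlight scala %}
import org.apache.spark.ml.feature.Bucketizer

// Bucket boundaries; infinities are included so that every input value falls into some bucket,
// since values outside the outermost splits would otherwise be rejected.
val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)

val data = Array(-0.5, -0.3, 0.0, 0.2)
val dataFrame = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")

val bucketizer = new Bucketizer()
  .setInputCol("features")
  .setOutputCol("bucketedFeatures")
  .setSplits(splits)

// Transform original data into its bucket index.
val bucketedData = bucketizer.transform(dataFrame)
bucketedData.show()
{% endhighlight %}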

