@@ -434,15 +434,182 @@ This example follows the simple text document `Pipeline` illustrated in the figu
 <div class="codetabs">
 
 <div data-lang="scala">
-{% include_example scala/org/apache/spark/examples/ml/CrossValidatorExample.scala %}
+{% highlight scala %}
+import org.apache.spark.ml.Pipeline
+import org.apache.spark.ml.classification.LogisticRegression
+import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
+import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.sql.Row
+
+// Prepare training documents from a list of (id, text, label) tuples.
+val training = sqlContext.createDataFrame(Seq(
+  (0L, "a b c d e spark", 1.0),
+  (1L, "b d", 0.0),
+  (2L, "spark f g h", 1.0),
+  (3L, "hadoop mapreduce", 0.0)
+)).toDF("id", "text", "label")
+
+// Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
+val tokenizer = new Tokenizer()
+  .setInputCol("text")
+  .setOutputCol("words")
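+// HashingTF maps each word to a feature index by hashing and produces term-frequency feature vectors.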
+val hashingTF = new HashingTF()
+  .setNumFeatures(1000)
+  .setInputCol(tokenizer.getOutputCol)
+  .setOutputCol("features")
+val lr = new LogisticRegression()
+  .setMaxIter(10)
+  .setRegParam(0.01)
+val pipeline = new Pipeline()
+  .setStages(Array(tokenizer, hashingTF, lr))
+
+// Fit the pipeline to training documents.
+val model = pipeline.fit(training)
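+// The result is a PipelineModel: a Transformer whose stages are the fitted versions of the pipeline's stages.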
+
+// Prepare test documents, which are unlabeled (id, text) tuples.
+val test = sqlContext.createDataFrame(Seq(
+  (4L, "spark i j k"),
+  (5L, "l m n"),
+  (6L, "mapreduce spark"),
+  (7L, "apache hadoop")
+)).toDF("id", "text")
+
+// Make predictions on test documents.
+model.transform(test)
+  .select("id", "text", "probability", "prediction")
+  .collect()
+  .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
+    println(s"($id, $text) --> prob=$prob, prediction=$prediction")
+  }
+
+{% endhighlight %}
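+
+The fitted `PipelineModel` keeps a fitted transformer for each stage. As a minimal sketch (an
+optional check, not required by the example above), the final stage can be cast back to a
+`LogisticRegressionModel` and inspected:
+
+{% highlight scala %}
+import org.apache.spark.ml.classification.LogisticRegressionModel
+
+// Illustrative only: the last stage of the fitted pipeline is the fitted logistic regression model.
+val lrModel = model.stages.last.asInstanceOf[LogisticRegressionModel]
+println(s"Fitted LogisticRegressionModel intercept: ${lrModel.intercept}")
+{% endhighlight %}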
 </div>
 
 <div data-lang="java">
-{% include_example java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java %}
+{% highlight java %}
+import java.io.Serializable;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.spark.ml.Pipeline;
+import org.apache.spark.ml.PipelineModel;
+import org.apache.spark.ml.PipelineStage;
+import org.apache.spark.ml.classification.LogisticRegression;
+import org.apache.spark.ml.feature.HashingTF;
+import org.apache.spark.ml.feature.Tokenizer;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+
+// Labeled and unlabeled instance types.
+// Spark SQL can infer schema from Java Beans.
+public class Document implements Serializable {
+  private long id;
+  private String text;
+
+  public Document(long id, String text) {
+    this.id = id;
+    this.text = text;
+  }
+
+  public long getId() { return this.id; }
+  public void setId(long id) { this.id = id; }
+
+  public String getText() { return this.text; }
+  public void setText(String text) { this.text = text; }
+}
+
+public class LabeledDocument extends Document implements Serializable {
+  private double label;
+
+  public LabeledDocument(long id, String text, double label) {
+    super(id, text);
+    this.label = label;
+  }
+
+  public double getLabel() { return this.label; }
+  public void setLabel(double label) { this.label = label; }
+}
+
+// Prepare training documents, which are labeled.
+DataFrame training = sqlContext.createDataFrame(Arrays.asList(
+  new LabeledDocument(0L, "a b c d e spark", 1.0),
+  new LabeledDocument(1L, "b d", 0.0),
+  new LabeledDocument(2L, "spark f g h", 1.0),
+  new LabeledDocument(3L, "hadoop mapreduce", 0.0)
+), LabeledDocument.class);
+
+// Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
+Tokenizer tokenizer = new Tokenizer()
+  .setInputCol("text")
+  .setOutputCol("words");
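+// HashingTF converts the token lists produced by the Tokenizer into term-frequency feature vectors.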
+HashingTF hashingTF = new HashingTF()
+  .setNumFeatures(1000)
+  .setInputCol(tokenizer.getOutputCol())
+  .setOutputCol("features");
+LogisticRegression lr = new LogisticRegression()
+  .setMaxIter(10)
+  .setRegParam(0.01);
+Pipeline pipeline = new Pipeline()
+  .setStages(new PipelineStage[] {tokenizer, hashingTF, lr});
+
+// Fit the pipeline to training documents.
+PipelineModel model = pipeline.fit(training);
+
+// Prepare test documents, which are unlabeled.
+DataFrame test = sqlContext.createDataFrame(Arrays.asList(
+  new Document(4L, "spark i j k"),
+  new Document(5L, "l m n"),
+  new Document(6L, "mapreduce spark"),
+  new Document(7L, "apache hadoop")
+), Document.class);
+
+// Make predictions on test documents.
+DataFrame predictions = model.transform(test);
+for (Row r: predictions.select("id", "text", "probability", "prediction").collect()) {
+  System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2)
+    + ", prediction=" + r.get(3));
+}
+
+{% endhighlight %}
 </div>
 
 <div data-lang="python">
-{% include_example python/ml/cross_validator.py %}
+{% highlight python %}
+from pyspark.ml import Pipeline
+from pyspark.ml.classification import LogisticRegression
+from pyspark.ml.feature import HashingTF, Tokenizer
+
+# Prepare training documents from a list of (id, text, label) tuples.
+training = sqlContext.createDataFrame([
+    (0L, "a b c d e spark", 1.0),
+    (1L, "b d", 0.0),
+    (2L, "spark f g h", 1.0),
+    (3L, "hadoop mapreduce", 0.0)], ["id", "text", "label"])
+
+# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
+tokenizer = Tokenizer(inputCol="text", outputCol="words")
+hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
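+# numFeatures is left at HashingTF's default here; the Scala and Java examples set it to 1000 explicitly.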
+lr = LogisticRegression(maxIter=10, regParam=0.01)
+pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
+
+# Fit the pipeline to training documents.
+model = pipeline.fit(training)
+
+# Prepare test documents, which are unlabeled (id, text) tuples.
+test = sqlContext.createDataFrame([
+    (4L, "spark i j k"),
+    (5L, "l m n"),
+    (6L, "mapreduce spark"),
+    (7L, "apache hadoop")], ["id", "text"])
+
+# Make predictions on test documents and print columns of interest.
+prediction = model.transform(test)
+selected = prediction.select("id", "text", "prediction")
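+# The "probability" column produced by LogisticRegression could also be selected here, as in the Scala and Java examples.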
+for row in selected.collect():
+    print(row)
+
+{% endhighlight %}
 </div>
 
 </div>