
Commit 12e40a0

[SPARK-11381][DOCS] Replace example code in mllib-linear-methods.md using include_example
This PR replaces the example code in mllib-linear-methods.md with `include_example` directives by doing the following:
* Extracts the example code (Scala, Java, Python) into files in the `examples` module.
* Merges some dialog-style examples into a single file.
* Hides redundant code in the rendered HTML, for consistency with the other docs.
* Moves the output directory to 'target/tmp'.
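For context, `include_example` works roughly like this (a sketch of the convention used across the Spark docs; the exact relative path below is an assumption, not taken from this diff): the markdown page references a source file under `examples/src/main` with a Jekyll tag, and only the region between the `$example on$` and `$example off$` markers in that file is rendered into the page.

<div data-lang="java" markdown="1">
{% include_example java/org/apache/spark/examples/mllib/JavaSVMWithSGDExample.java %}
</div>

Everything outside the markers (license header, SparkContext setup, and so on) stays in the runnable example file but is hidden from the generated docs.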
1 parent 230bbea commit 12e40a0

14 files changed: +907 −428 lines

docs/mllib-linear-methods.md

Lines changed: 19 additions & 428 deletions
Large diffs are not rendered by default.
examples/src/main/java/org/apache/spark/examples/mllib/JavaL1UpdaterExample.java (new file)

Lines changed: 84 additions & 0 deletions

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.mllib;

import scala.Tuple2;

import org.apache.spark.api.java.*;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.classification.*;
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics;
// $example on$
import org.apache.spark.mllib.optimization.L1Updater;
// $example off$
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;

public class JavaL1UpdaterExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaL1UpdaterExample");
    SparkContext sc = new SparkContext(conf);

    String path = "data/mllib/sample_libsvm_data.txt";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

    // Split initial RDD into two... [60% training data, 40% testing data].
    JavaRDD<LabeledPoint> training = data.sample(false, 0.6, 11L);
    training.cache();
    JavaRDD<LabeledPoint> test = data.subtract(training);

    // Run training algorithm to build the model.
    int numIterations = 100;
    // $example on$
    SVMWithSGD svmAlg = new SVMWithSGD();
    svmAlg.optimizer()
      .setNumIterations(200)
      .setRegParam(0.1)
      .setUpdater(new L1Updater());
    final SVMModel modelL1 = svmAlg.run(training.rdd());
    // $example off$

    // Clear the default threshold.
    modelL1.clearThreshold();

    // Compute raw scores on the test set.
    JavaRDD<Tuple2<Object, Object>> scoreAndLabels = test.map(
      new Function<LabeledPoint, Tuple2<Object, Object>>() {
        public Tuple2<Object, Object> call(LabeledPoint p) {
          Double score = modelL1.predict(p.features());
          return new Tuple2<Object, Object>(score, p.label());
        }
      }
    );

    // Get evaluation metrics.
    BinaryClassificationMetrics metrics =
      new BinaryClassificationMetrics(JavaRDD.toRDD(scoreAndLabels));
    double auROC = metrics.areaUnderROC();

    System.out.println("Area under ROC = " + auROC);

    // Save and load model
    modelL1.save(sc, "target/tmp/myL1UpdaterModel");
    SVMModel sameModel = SVMModel.load(sc, "target/tmp/myL1UpdaterModel");

    sc.stop();
  }
}
examples/src/main/java/org/apache/spark/examples/mllib/JavaLinearRegressionWithSGDExample.java (new file)

Lines changed: 93 additions & 0 deletions

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.mllib;

// $example on$
import scala.Tuple2;

import org.apache.spark.api.java.*;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.regression.LinearRegressionModel;
import org.apache.spark.mllib.regression.LinearRegressionWithSGD;
// $example off$

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;

/**
 * Example for LinearRegressionWithSGD.
 */
public class JavaLinearRegressionWithSGDExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaLinearRegressionWithSGDExample");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // $example on$
    // Load and parse the data
    String path = "data/mllib/ridge-data/lpsa.data";
    JavaRDD<String> data = sc.textFile(path);
    JavaRDD<LabeledPoint> parsedData = data.map(
      new Function<String, LabeledPoint>() {
        public LabeledPoint call(String line) {
          String[] parts = line.split(",");
          String[] features = parts[1].split(" ");
          double[] v = new double[features.length];
          for (int i = 0; i < features.length; i++) {
            v[i] = Double.parseDouble(features[i]);
          }
          return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v));
        }
      }
    );
    parsedData.cache();

    // Building the model
    int numIterations = 100;
    double stepSize = 0.00000001;
    final LinearRegressionModel model =
      LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations, stepSize);

    // Evaluate model on training examples and compute training error
    JavaRDD<Tuple2<Double, Double>> valuesAndPreds = parsedData.map(
      new Function<LabeledPoint, Tuple2<Double, Double>>() {
        public Tuple2<Double, Double> call(LabeledPoint point) {
          double prediction = model.predict(point.features());
          return new Tuple2<Double, Double>(prediction, point.label());
        }
      }
    );
    double MSE = new JavaDoubleRDD(valuesAndPreds.map(
      new Function<Tuple2<Double, Double>, Object>() {
        public Object call(Tuple2<Double, Double> pair) {
          return Math.pow(pair._1() - pair._2(), 2.0);
        }
      }
    ).rdd()).mean();
    System.out.println("training Mean Squared Error = " + MSE);

    // Save and load model
    model.save(sc.sc(), "target/tmp/myLinearRegressionWithSGDModel");
    LinearRegressionModel sameModel = LinearRegressionModel.load(sc.sc(),
      "target/tmp/myLinearRegressionWithSGDModel");
    // $example off$

    sc.stop();
  }
}
examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java (new file)

Lines changed: 79 additions & 0 deletions

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.mllib;

// $example on$
import scala.Tuple2;

import org.apache.spark.api.java.*;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.classification.LogisticRegressionModel;
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS;
import org.apache.spark.mllib.evaluation.MulticlassMetrics;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils;
// $example off$

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;

/**
 * Example for LogisticRegressionWithLBFGS.
 */
public class JavaLogisticRegressionWithLBFGSExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaLogisticRegressionWithLBFGSExample");
    SparkContext sc = new SparkContext(conf);
    // $example on$
    String path = "data/mllib/sample_libsvm_data.txt";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

    // Split initial RDD into two... [60% training data, 40% testing data].
    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L);
    JavaRDD<LabeledPoint> training = splits[0].cache();
    JavaRDD<LabeledPoint> test = splits[1];

    // Run training algorithm to build the model.
    final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
      .setNumClasses(10)
      .run(training.rdd());

    // Compute raw scores on the test set.
    JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
      new Function<LabeledPoint, Tuple2<Object, Object>>() {
        public Tuple2<Object, Object> call(LabeledPoint p) {
          Double prediction = model.predict(p.features());
          return new Tuple2<Object, Object>(prediction, p.label());
        }
      }
    );

    // Get evaluation metrics.
    MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());
    double precision = metrics.precision();
    System.out.println("Precision = " + precision);

    // Save and load model
    model.save(sc, "target/tmp/myLogisticRegressionWithLBFGSModel");
    LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc,
      "target/tmp/myLogisticRegressionWithLBFGSModel");
    // $example off$

    sc.stop();
  }
}
examples/src/main/java/org/apache/spark/examples/mllib/JavaSVMWithSGDExample.java (new file)

Lines changed: 80 additions & 0 deletions

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.mllib;

// $example on$
import scala.Tuple2;

import org.apache.spark.api.java.*;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.classification.*;
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils;
// $example off$
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;

/**
 * Example for SVMWithSGD.
 */
public class JavaSVMWithSGDExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaSVMWithSGDExample");
    SparkContext sc = new SparkContext(conf);
    // $example on$
    String path = "data/mllib/sample_libsvm_data.txt";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

    // Split initial RDD into two... [60% training data, 40% testing data].
    JavaRDD<LabeledPoint> training = data.sample(false, 0.6, 11L);
    training.cache();
    JavaRDD<LabeledPoint> test = data.subtract(training);

    // Run training algorithm to build the model.
    int numIterations = 100;
    final SVMModel model = SVMWithSGD.train(training.rdd(), numIterations);

    // Clear the default threshold.
    model.clearThreshold();

    // Compute raw scores on the test set.
    JavaRDD<Tuple2<Object, Object>> scoreAndLabels = test.map(
      new Function<LabeledPoint, Tuple2<Object, Object>>() {
        public Tuple2<Object, Object> call(LabeledPoint p) {
          Double score = model.predict(p.features());
          return new Tuple2<Object, Object>(score, p.label());
        }
      }
    );

    // Get evaluation metrics.
    BinaryClassificationMetrics metrics =
      new BinaryClassificationMetrics(JavaRDD.toRDD(scoreAndLabels));
    double auROC = metrics.areaUnderROC();

    System.out.println("Area under ROC = " + auROC);

    // Save and load model
    model.save(sc, "target/tmp/mySVMWithSGDModel");
    SVMModel sameModel = SVMModel.load(sc, "target/tmp/mySVMWithSGDModel");
    // $example off$

    sc.stop();
  }
}
examples/src/main/python/mllib/linear_regression_with_sgd_example.py (new file)

Lines changed: 54 additions & 0 deletions

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Linear Regression With SGD Example.
"""
from __future__ import print_function

from pyspark import SparkContext
# $example on$
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="PythonLinearRegressionWithSGDExample")

    # $example on$
    # Load and parse the data
    def parsePoint(line):
        values = [float(x) for x in line.replace(',', ' ').split(' ')]
        return LabeledPoint(values[0], values[1:])

    data = sc.textFile("data/mllib/ridge-data/lpsa.data")
    parsedData = data.map(parsePoint)

    # Build the model
    model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=0.00000001)

    # Evaluate the model on training data
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    MSE = valuesAndPreds \
        .map(lambda (v, p): (v - p)**2) \
        .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))

    # Save and load model
    model.save(sc, "target/tmp/myLinearRegressionWithSGDModel")
    sameModel = LinearRegressionModel.load(sc, "target/tmp/myLinearRegressionWithSGDModel")
    # $example off$
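
Assuming the files land in the usual locations in the `examples` module (the paths above are inferred from the package/class names and naming conventions, not shown in this view), the new examples can be run directly, for example:

bin/run-example mllib.JavaSVMWithSGDExample
bin/spark-submit examples/src/main/python/mllib/linear_regression_with_sgd_example.py

The saved models then end up under target/tmp/, per the commit message.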
