
Commit e2ec32d

HyukjinKwon authored and Nick Pentreath committed
[SPARK-15031][EXAMPLES][FOLLOW-UP] Make Python param example working with SparkSession
## What changes were proposed in this pull request?

It seems most of the Python examples were changed to use SparkSession by #12809. That PR noted that both examples below:

- `simple_params_example.py`
- `aft_survival_regression.py`

were left unchanged because they do not work. `aft_survival_regression.py` was since fixed by #13050, but `simple_params_example.py` was not. This PR corrects the example and makes it use SparkSession.

In more detail, `threshold` was replaced with `thresholds` here and there by 5a23213. However, when `lr.fit(training, paramMap)` is called, the param map overwrites the values set earlier: `threshold` was 0.5, while `thresholds` of `[0.45, 0.55]` implies a threshold of 0.55 (by `1 / (1 + thresholds(0) / thresholds(1))`). According to the check at https://github.com/apache/spark/blob/354f8f11bd4b20fa99bd67a98da3525fd3d75c81/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala#L58-L61, such an inconsistency is not allowed. So, this PR sets an equivalent value, `[0.5, 0.5]`, so that the example does not throw an exception (a short arithmetic sketch follows the file summary below).

## How was this patch tested?

Manually (`mvn package -DskipTests && spark-submit simple_params_example.py`).

Author: hyukjinkwon <[email protected]>

Closes #13135 from HyukjinKwon/SPARK-15031.
1 parent 661c210 · commit e2ec32d

File tree: 3 files changed (+13, −15 lines)
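Why `[0.5, 0.5]` is the safe replacement: a minimal sketch of the consistency rule described above. The helper below simply mirrors the formula quoted in the commit message; it is illustrative, not Spark's actual implementation.

```python
def threshold_from_thresholds(thresholds):
    # Binary case: the threshold implied by a two-element thresholds array,
    # per the formula 1 / (1 + thresholds(0) / thresholds(1)).
    return 1.0 / (1.0 + thresholds[0] / thresholds[1])

print(threshold_from_thresholds([0.45, 0.55]))  # 0.55 -> clashes with threshold = 0.5
print(threshold_from_thresholds([0.5, 0.5]))    # 0.5  -> consistent, no exception
```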

examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java

Lines changed: 1 addition & 1 deletion
```diff
@@ -77,7 +77,7 @@ public static void main(String[] args) {
     ParamMap paramMap = new ParamMap();
     paramMap.put(lr.maxIter().w(20)); // Specify 1 Param.
     paramMap.put(lr.maxIter(), 30); // This overwrites the original maxIter.
-    double[] thresholds = {0.45, 0.55};
+    double[] thresholds = {0.5, 0.5};
     paramMap.put(lr.regParam().w(0.1), lr.thresholds().w(thresholds)); // Specify multiple Params.
 
     // One can also combine ParamMaps.
```

examples/src/main/python/ml/simple_params_example.py

Lines changed: 11 additions & 13 deletions
```diff
@@ -20,11 +20,10 @@
 import pprint
 import sys
 
-from pyspark import SparkContext
 from pyspark.ml.classification import LogisticRegression
 from pyspark.mllib.linalg import DenseVector
 from pyspark.mllib.regression import LabeledPoint
-from pyspark.sql import SQLContext
+from pyspark.sql import SparkSession
 
 """
 A simple example demonstrating ways to specify parameters for Estimators and Transformers.
@@ -33,21 +32,20 @@
 """
 
 if __name__ == "__main__":
-    if len(sys.argv) > 1:
-        print("Usage: simple_params_example", file=sys.stderr)
-        exit(1)
-    sc = SparkContext(appName="PythonSimpleParamsExample")
-    sqlContext = SQLContext(sc)
+    spark = SparkSession \
+        .builder \
+        .appName("SimpleTextClassificationPipeline") \
+        .getOrCreate()
 
     # prepare training data.
     # We create an RDD of LabeledPoints and convert them into a DataFrame.
     # A LabeledPoint is an Object with two fields named label and features
     # and Spark SQL identifies these fields and creates the schema appropriately.
-    training = sc.parallelize([
+    training = spark.createDataFrame([
         LabeledPoint(1.0, DenseVector([0.0, 1.1, 0.1])),
         LabeledPoint(0.0, DenseVector([2.0, 1.0, -1.0])),
         LabeledPoint(0.0, DenseVector([2.0, 1.3, 1.0])),
-        LabeledPoint(1.0, DenseVector([0.0, 1.2, -0.5]))]).toDF()
+        LabeledPoint(1.0, DenseVector([0.0, 1.2, -0.5]))])
 
     # Create a LogisticRegression instance with maxIter = 10.
     # This instance is an Estimator.
@@ -70,18 +68,18 @@
 
     # We may alternatively specify parameters using a parameter map.
     # paramMap overrides all lr parameters set earlier.
-    paramMap = {lr.maxIter: 20, lr.thresholds: [0.45, 0.55], lr.probabilityCol: "myProbability"}
+    paramMap = {lr.maxIter: 20, lr.thresholds: [0.5, 0.5], lr.probabilityCol: "myProbability"}
 
     # Now learn a new model using the new parameters.
     model2 = lr.fit(training, paramMap)
     print("Model 2 was fit using parameters:\n")
     pprint.pprint(model2.extractParamMap())
 
     # prepare test data.
-    test = sc.parallelize([
+    test = spark.createDataFrame([
         LabeledPoint(1.0, DenseVector([-1.0, 1.5, 1.3])),
         LabeledPoint(0.0, DenseVector([3.0, 2.0, -0.1])),
-        LabeledPoint(0.0, DenseVector([0.0, 2.2, -1.5]))]).toDF()
+        LabeledPoint(0.0, DenseVector([0.0, 2.2, -1.5]))])
 
     # Make predictions on test data using the Transformer.transform() method.
     # LogisticRegressionModel.transform will only use the 'features' column.
@@ -95,4 +93,4 @@
     print("features=%s,label=%s -> prob=%s, prediction=%s"
           % (row.features, row.label, row.myProbability, row.prediction))
 
-    sc.stop()
+    spark.stop()
```
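For context on the `SparkContext`/`SQLContext` → `SparkSession` change above, here is a minimal self-contained sketch of the Spark 2.x entry-point pattern; the application name is a placeholder, not from the patch:

```python
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql import SparkSession

# SparkSession is the single Spark 2.x entry point; "MigrationSketch" is a
# placeholder application name.
spark = SparkSession.builder.appName("MigrationSketch").getOrCreate()

# Spark 1.x built DataFrames via sc.parallelize(rows).toDF(); in Spark 2.x,
# createDataFrame() turns a local list of LabeledPoints into a DataFrame directly.
training = spark.createDataFrame([
    LabeledPoint(1.0, DenseVector([0.0, 1.1, 0.1])),
    LabeledPoint(0.0, DenseVector([2.0, 1.0, -1.0]))])
training.show()

spark.stop()
```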

examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -70,7 +70,7 @@ object SimpleParamsExample {
     // which supports several methods for specifying parameters.
     val paramMap = ParamMap(lr.maxIter -> 20)
     paramMap.put(lr.maxIter, 30) // Specify 1 Param. This overwrites the original maxIter.
-    paramMap.put(lr.regParam -> 0.1, lr.thresholds -> Array(0.45, 0.55)) // Specify multiple Params.
+    paramMap.put(lr.regParam -> 0.1, lr.thresholds -> Array(0.5, 0.5)) // Specify multiple Params.
 
     // One can also combine ParamMaps.
     val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability") // Change output column name
```
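On the "One can also combine ParamMaps" context line above: in the Python API a param map is a plain dict keyed by `Param` objects, so combining maps is ordinary dict merging. A minimal sketch, assuming `lr` and `training` from the Python example above:

```python
paramMap = {lr.maxIter: 20, lr.thresholds: [0.5, 0.5]}
paramMap2 = {lr.probabilityCol: "myProbability"}  # change the output column name

# Merge the two maps; on a key collision the entry applied by update() wins.
paramMapCombined = paramMap.copy()
paramMapCombined.update(paramMap2)
model2 = lr.fit(training, paramMapCombined)
```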
