20 | 20 | import pprint |
21 | 21 | import sys |
22 | 22 |
23 | | -from pyspark import SparkContext |
24 | 23 | from pyspark.ml.classification import LogisticRegression |
25 | 24 | from pyspark.mllib.linalg import DenseVector |
26 | 25 | from pyspark.mllib.regression import LabeledPoint |
27 | | -from pyspark.sql import SQLContext |
| 26 | +from pyspark.sql import Row, SparkSession |
28 | 27 |
29 | 28 | """ |
30 | 29 | A simple example demonstrating ways to specify parameters for Estimators and Transformers. |
|
36 | 35 | if len(sys.argv) > 1: |
37 | 36 | print("Usage: simple_params_example", file=sys.stderr) |
38 | 37 | exit(1) |
39 | | - sc = SparkContext(appName="PythonSimpleParamsExample") |
40 | | - sqlContext = SQLContext(sc) |
| 38 | + spark = SparkSession \ |
| 39 | + .builder \ |
| 40 | + .appName("SimpleTextClassificationPipeline") \ |
| 41 | + .getOrCreate() |
41 | 42 |
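A minimal, self-contained sketch of the entry point this change switches to (the app name below is illustrative, not from the patch): in Spark 2.0+ a single SparkSession replaces the separate SparkContext/SQLContext pair, and the underlying SparkContext is still reachable when RDD APIs are needed.

```python
from pyspark.sql import SparkSession

# Single entry point for DataFrame/SQL work in Spark 2.0+;
# builder reuses an existing session if one is already running.
spark = SparkSession \
    .builder \
    .appName("SessionSketch") \
    .getOrCreate()

# The underlying SparkContext is still available from the session
# for code paths that need RDDs.
sc = spark.sparkContext

spark.stop()
```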
42 | 43 | # prepare training data. |
43 | 44 | # We create an RDD of LabeledPoints and convert them into a DataFrame. |
44 | 45 | # A LabeledPoint is an Object with two fields named label and features |
45 | 46 | # and Spark SQL identifies these fields and creates the schema appropriately. |
46 | | - training = sc.parallelize([ |
| 47 | + training = spark.createDataFrame([ |
47 | 48 | LabeledPoint(1.0, DenseVector([0.0, 1.1, 0.1])), |
48 | 49 | LabeledPoint(0.0, DenseVector([2.0, 1.0, -1.0])), |
49 | 50 | LabeledPoint(0.0, DenseVector([2.0, 1.3, 1.0])), |
50 | | - LabeledPoint(1.0, DenseVector([0.0, 1.2, -0.5]))]).toDF() |
| 51 | + LabeledPoint(1.0, DenseVector([0.0, 1.2, -0.5]))]) |
51 | 52 |
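A short sketch of the pattern used above, assuming a SparkSession named `spark` as created earlier: createDataFrame accepts a local list of LabeledPoints directly and infers a (label, features) schema from the object fields, so the old parallelize(...).toDF() round trip is no longer needed.

```python
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.regression import LabeledPoint

# Schema is inferred from the LabeledPoint fields:
# 'label' (double) and 'features' (vector).
training = spark.createDataFrame([
    LabeledPoint(1.0, DenseVector([0.0, 1.1, 0.1])),
    LabeledPoint(0.0, DenseVector([2.0, 1.0, -1.0]))])

training.printSchema()
training.show()
```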
52 | 53 | # Create a LogisticRegression instance with maxIter = 10. |
53 | 54 | # This instance is an Estimator. |
70 | 71 |
71 | 72 | # We may alternatively specify parameters using a parameter map. |
72 | 73 | # paramMap overrides all lr parameters set earlier. |
73 | | - paramMap = {lr.maxIter: 20, lr.thresholds: [0.45, 0.55], lr.probabilityCol: "myProbability"} |
| 74 | + paramMap = {lr.maxIter: 20, lr.thresholds: [0.5, 0.5], lr.probabilityCol: "myProbability"} |
74 | 75 |
75 | 76 | # Now learn a new model using the new parameters. |
76 | 77 | model2 = lr.fit(training, paramMap) |
77 | 78 | print("Model 2 was fit using parameters:\n") |
78 | 79 | pprint.pprint(model2.extractParamMap()) |
79 | 80 |
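A sketch of how the param-map override works, assuming `lr` (created above with maxIter = 10) and `training` as in this file: fit() takes an optional dict keyed by the estimator's Param objects, and values in that dict override whatever was set on the estimator, for that call only. Maps can also be merged with a plain dict update, with later entries winning on clashes.

```python
# Overrides apply only to this fit() call; lr itself keeps maxIter = 10.
paramMap = {lr.maxIter: 20}

# Extra overrides can be merged in; on key clashes the update wins.
paramMapCombined = dict(paramMap)
paramMapCombined.update({lr.probabilityCol: "myProbability"})

model2 = lr.fit(training, paramMapCombined)
pprint.pprint(model2.extractParamMap())
```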
80 | 81 | # prepare test data. |
81 | | - test = sc.parallelize([ |
| 82 | + test = spark.createDataFrame([ |
82 | 83 | LabeledPoint(1.0, DenseVector([-1.0, 1.5, 1.3])), |
83 | 84 | LabeledPoint(0.0, DenseVector([3.0, 2.0, -0.1])), |
84 | | - LabeledPoint(0.0, DenseVector([0.0, 2.2, -1.5]))]).toDF() |
| 85 | + LabeledPoint(0.0, DenseVector([0.0, 2.2, -1.5]))]) |
85 | 86 |
86 | 87 | # Make predictions on test data using the Transformer.transform() method. |
87 | 88 | # LogisticRegressionModel.transform will only use the 'features' column. |
95 | 96 | print("features=%s,label=%s -> prob=%s, prediction=%s" |
96 | 97 | % (row.features, row.label, row.myProbability, row.prediction)) |
97 | 98 |
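For reference, a sketch of the prediction step that produces the rows printed above (the exact lines are collapsed in this diff), assuming `model2` and `test` as defined in this file: transform() reads only the 'features' column and appends prediction columns, and the probability column comes out as 'myProbability' because probabilityCol was overridden in the param map.

```python
# Append rawPrediction / myProbability / prediction columns to the test rows.
result = model2.transform(test)

# Keep only the columns we want to inspect, then pull them to the driver.
selected = result.select("features", "label", "myProbability", "prediction")
for row in selected.collect():
    print("features=%s, label=%s -> prob=%s, prediction=%s"
          % (row.features, row.label, row.myProbability, row.prediction))
```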
98 | | - sc.stop() |
| 99 | + spark.stop() |