@@ -115,22 +115,31 @@ used for evaluation and prediction.
115115
116116Note that the Python API does not yet support model save/load but will in the future.
117117
118- <!-- TODO: Make Python's example consistent with Scala's and Java's. -->
119118{% highlight python %}
120- from pyspark.mllib.regression import LabeledPoint
121119from pyspark.mllib.classification import NaiveBayes
120+ from pyspark.mllib.linalg import Vectors
121+ from pyspark.mllib.regression import LabeledPoint
122+
123+ data = sc.textFile("data/mllib/sample_naive_bayes_data.txt")
124+
125+ # Preprocessing
126+ splitData = data.map(lambda line: line.split(','))
127+ parsedData = splitData.map(
128+ lambda parts: LabeledPoint(
129+ float(parts[ 0] ),
130+ Vectors.dense(map(lambda x: float(x), parts[ 1] .split(' ')))
131+ )
132+ )
122133
123- # an RDD of LabeledPoint
124- data = sc.parallelize([
125- LabeledPoint(0.0, [ 0.0, 0.0] )
126- ... # more labeled points
127- ] )
134+ # Split data into training (60%) and test (40%)
135+ training, test = parsedData.randomSplit([ 0.6, 0.4] , seed = 0)
128136
129137# Train a naive Bayes model.
130- model = NaiveBayes.train(data , 1.0)
138+ model = NaiveBayes.train(training , 1.0)
131139
132- # Make prediction.
133- prediction = model.predict([ 0.0, 0.0] )
140+ # Make prediction and test accuracy.
141+ predictionAndLabel = test.map(lambda p : (model.predict(p.features), p.label))
142+ accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
134143{% endhighlight %}
135144
136145</div >
0 commit comments