2525from pyspark .mllib ._common import _convert_vector , _serialize_double_vector , \
2626 _deserialize_double_vector , _dot , _squared_distance
2727from pyspark .mllib .linalg import SparseVector
28+ from pyspark .mllib .regression import LabeledPoint
2829from pyspark .tests import PySparkTestCase
2930
3031
@@ -41,16 +42,21 @@ class VectorTests(unittest.TestCase):
4142 def test_serialize (self ):
4243 sv = SparseVector (4 , {1 : 1 , 3 : 2 })
4344 dv = array ([1. , 2. , 3. , 4. ])
45+ lst = [1 , 2 , 3 , 4 ]
4446 self .assertTrue (sv is _convert_vector (sv ))
4547 self .assertTrue (dv is _convert_vector (dv ))
48+ self .assertTrue (array_equal (dv , _convert_vector (lst )))
4649 self .assertEquals (sv ,
4750 _deserialize_double_vector (_serialize_double_vector (sv )))
4851 self .assertTrue (array_equal (dv ,
4952 _deserialize_double_vector (_serialize_double_vector (dv ))))
53+ self .assertTrue (array_equal (dv ,
54+ _deserialize_double_vector (_serialize_double_vector (lst ))))
5055
5156 def test_dot (self ):
5257 sv = SparseVector (4 , {1 : 1 , 3 : 2 })
5358 dv = array ([1. , 2. , 3. , 4. ])
59+ lst = [1 , 2 , 3 , 4 ]
5460 mat = array ([[1. , 2. , 3. , 4. ],
5561 [1. , 2. , 3. , 4. ],
5662 [1. , 2. , 3. , 4. ],
@@ -59,10 +65,109 @@ def test_dot(self):
5965 self .assertTrue (array_equal (array ([3. , 6. , 9. , 12. ]), _dot (sv , mat )))
6066 self .assertEquals (30.0 , _dot (dv , dv ))
6167 self .assertTrue (array_equal (array ([10. , 20. , 30. , 40. ]), _dot (dv , mat )))
68+ self .assertEquals (30.0 , _dot (lst , dv ))
69+ self .assertTrue (array_equal (array ([10. , 20. , 30. , 40. ]), _dot (lst , mat )))
70+
71+ def test_squared_distance (self ):
72+ sv = SparseVector (4 , {1 : 1 , 3 : 2 })
73+ dv = array ([1. , 2. , 3. , 4. ])
74+ lst = [4 , 3 , 2 , 1 ]
75+ self .assertEquals (15.0 , _squared_distance (sv , dv ))
76+ self .assertEquals (25.0 , _squared_distance (sv , lst ))
77+ self .assertEquals (20.0 , _squared_distance (dv , lst ))
78+ self .assertEquals (15.0 , _squared_distance (dv , sv ))
79+ self .assertEquals (25.0 , _squared_distance (lst , sv ))
80+ self .assertEquals (20.0 , _squared_distance (lst , dv ))
81+ self .assertEquals (0.0 , _squared_distance (sv , sv ))
82+ self .assertEquals (0.0 , _squared_distance (dv , dv ))
83+ self .assertEquals (0.0 , _squared_distance (lst , lst ))
84+
85+
86+ class ListTests (PySparkTestCase ):
87+ """
88+ Test MLlib algorithms on plain lists, to make sure they're passed through
89+ as NumPy arrays.
90+ """
91+
92+ def test_clustering (self ):
93+ from pyspark .mllib .clustering import KMeans
94+ data = [
95+ [0 , 1.1 ],
96+ [0 , 1.2 ],
97+ [1.1 , 0 ],
98+ [1.2 , 0 ],
99+ ]
100+ clusters = KMeans .train (self .sc .parallelize (data ), 2 , initializationMode = "k-means||" )
101+ self .assertEquals (clusters .predict (data [0 ]), clusters .predict (data [1 ]))
102+ self .assertEquals (clusters .predict (data [2 ]), clusters .predict (data [3 ]))
103+
104+ def test_classification (self ):
105+ from pyspark .mllib .classification import LogisticRegressionWithSGD , SVMWithSGD , NaiveBayes
106+ data = [
107+ LabeledPoint (0.0 , [1 , 0 ]),
108+ LabeledPoint (1.0 , [0 , 1 ]),
109+ LabeledPoint (0.0 , [2 , 0 ]),
110+ LabeledPoint (1.0 , [0 , 2 ])
111+ ]
112+ rdd = self .sc .parallelize (data )
113+ features = [p .features .tolist () for p in data ]
114+
115+ lr_model = LogisticRegressionWithSGD .train (rdd )
116+ self .assertTrue (lr_model .predict (features [0 ]) <= 0 )
117+ self .assertTrue (lr_model .predict (features [1 ]) > 0 )
118+ self .assertTrue (lr_model .predict (features [2 ]) <= 0 )
119+ self .assertTrue (lr_model .predict (features [3 ]) > 0 )
120+
121+ svm_model = SVMWithSGD .train (rdd )
122+ self .assertTrue (svm_model .predict (features [0 ]) <= 0 )
123+ self .assertTrue (svm_model .predict (features [1 ]) > 0 )
124+ self .assertTrue (svm_model .predict (features [2 ]) <= 0 )
125+ self .assertTrue (svm_model .predict (features [3 ]) > 0 )
126+
127+ nb_model = NaiveBayes .train (rdd )
128+ self .assertTrue (nb_model .predict (features [0 ]) <= 0 )
129+ self .assertTrue (nb_model .predict (features [1 ]) > 0 )
130+ self .assertTrue (nb_model .predict (features [2 ]) <= 0 )
131+ self .assertTrue (nb_model .predict (features [3 ]) > 0 )
132+
133+ def test_regression (self ):
134+ from pyspark .mllib .regression import LinearRegressionWithSGD , LassoWithSGD , \
135+ RidgeRegressionWithSGD
136+ data = [
137+ LabeledPoint (- 1.0 , [0 , - 1 ]),
138+ LabeledPoint (1.0 , [0 , 1 ]),
139+ LabeledPoint (- 1.0 , [0 , - 2 ]),
140+ LabeledPoint (1.0 , [0 , 2 ])
141+ ]
142+ rdd = self .sc .parallelize (data )
143+ features = [p .features .tolist () for p in data ]
144+
145+ lr_model = LinearRegressionWithSGD .train (rdd )
146+ self .assertTrue (lr_model .predict (features [0 ]) <= 0 )
147+ self .assertTrue (lr_model .predict (features [1 ]) > 0 )
148+ self .assertTrue (lr_model .predict (features [2 ]) <= 0 )
149+ self .assertTrue (lr_model .predict (features [3 ]) > 0 )
150+
151+ lasso_model = LassoWithSGD .train (rdd )
152+ self .assertTrue (lasso_model .predict (features [0 ]) <= 0 )
153+ self .assertTrue (lasso_model .predict (features [1 ]) > 0 )
154+ self .assertTrue (lasso_model .predict (features [2 ]) <= 0 )
155+ self .assertTrue (lasso_model .predict (features [3 ]) > 0 )
156+
157+ rr_model = RidgeRegressionWithSGD .train (rdd )
158+ self .assertTrue (rr_model .predict (features [0 ]) <= 0 )
159+ self .assertTrue (rr_model .predict (features [1 ]) > 0 )
160+ self .assertTrue (rr_model .predict (features [2 ]) <= 0 )
161+ self .assertTrue (rr_model .predict (features [3 ]) > 0 )
62162
63163
64164@unittest .skipIf (not _have_scipy , "SciPy not installed" )
65165class SciPyTests (PySparkTestCase ):
166+ """
167+ Test both vector operations and MLlib algorithms with SciPy sparse matrices,
168+ if SciPy is available.
169+ """
170+
66171 def test_serialize (self ):
67172 from scipy .sparse import lil_matrix
68173 lil = lil_matrix ((4 , 1 ))
@@ -132,63 +237,61 @@ def test_clustering(self):
132237 def test_classification (self ):
133238 from pyspark .mllib .classification import LogisticRegressionWithSGD , SVMWithSGD , NaiveBayes
134239 data = [
135- self .scipy_matrix (3 , {0 : 0.0 , 2 : 0.0 } ),
136- self .scipy_matrix (3 , {0 : 1.0 , 2 : 1.0 }),
137- self .scipy_matrix (3 , {0 : 0.0 , 2 : 0 .0 }),
138- self .scipy_matrix (3 , {0 : 1.0 , 2 : 2.0 })
240+ LabeledPoint ( 0.0 , self .scipy_matrix (2 , {0 : 1.0 }) ),
241+ LabeledPoint ( 1.0 , self .scipy_matrix (2 , {1 : 1.0 }) ),
242+ LabeledPoint ( 0.0 , self .scipy_matrix (2 , {0 : 2 .0 }) ),
243+ LabeledPoint ( 1.0 , self .scipy_matrix (2 , {1 : 2.0 }) )
139244 ]
140245 rdd = self .sc .parallelize (data )
141- features = [
142- self .scipy_matrix (2 , {1 : 0.0 }),
143- self .scipy_matrix (2 , {1 : 1.0 }),
144- self .scipy_matrix (2 , {1 : 2.0 }),
145- ]
246+ features = [p .features for p in data ]
146247
147248 lr_model = LogisticRegressionWithSGD .train (rdd )
148249 self .assertTrue (lr_model .predict (features [0 ]) <= 0 )
149250 self .assertTrue (lr_model .predict (features [1 ]) > 0 )
150- self .assertTrue (lr_model .predict (features [2 ]) > 0 )
251+ self .assertTrue (lr_model .predict (features [2 ]) <= 0 )
252+ self .assertTrue (lr_model .predict (features [3 ]) > 0 )
151253
152254 svm_model = SVMWithSGD .train (rdd )
153255 self .assertTrue (svm_model .predict (features [0 ]) <= 0 )
154256 self .assertTrue (svm_model .predict (features [1 ]) > 0 )
155- self .assertTrue (svm_model .predict (features [2 ]) > 0 )
257+ self .assertTrue (svm_model .predict (features [2 ]) <= 0 )
258+ self .assertTrue (svm_model .predict (features [3 ]) > 0 )
156259
157260 nb_model = NaiveBayes .train (rdd )
158261 self .assertTrue (nb_model .predict (features [0 ]) <= 0 )
159262 self .assertTrue (nb_model .predict (features [1 ]) > 0 )
160- self .assertTrue (nb_model .predict (features [2 ]) > 0 )
263+ self .assertTrue (nb_model .predict (features [2 ]) <= 0 )
264+ self .assertTrue (nb_model .predict (features [3 ]) > 0 )
161265
162266 def test_regression (self ):
163267 from pyspark .mllib .regression import LinearRegressionWithSGD , LassoWithSGD , \
164268 RidgeRegressionWithSGD
165269 data = [
166- self . scipy_matrix ( 3 , { 0 : - 1.0 , 2 : - 1.0 }),
167- self .scipy_matrix (3 , {0 : 1.0 , 2 : 1.0 }),
168- self . scipy_matrix ( 3 , { 0 : - 1.0 , 2 : - 2.0 }),
169- self .scipy_matrix (3 , {0 : 1.0 , 2 : 2.0 })
270+ LabeledPoint ( - 1.0 , self . scipy_matrix ( 2 , { 1 : - 1.0 }) ),
271+ LabeledPoint ( 1.0 , self .scipy_matrix (2 , {1 : 1.0 }) ),
272+ LabeledPoint ( - 1.0 , self . scipy_matrix ( 2 , { 1 : - 2.0 }) ),
273+ LabeledPoint ( 1.0 , self .scipy_matrix (2 , {1 : 2.0 }) )
170274 ]
171275 rdd = self .sc .parallelize (data )
172- features = [
173- self .scipy_matrix (2 , {1 : - 1.0 }),
174- self .scipy_matrix (2 , {1 : 1.0 }),
175- self .scipy_matrix (2 , {1 : 2.0 }),
176- ]
276+ features = [p .features for p in data ]
177277
178278 lr_model = LinearRegressionWithSGD .train (rdd )
179279 self .assertTrue (lr_model .predict (features [0 ]) <= 0 )
180280 self .assertTrue (lr_model .predict (features [1 ]) > 0 )
181- self .assertTrue (lr_model .predict (features [2 ]) > 0 )
281+ self .assertTrue (lr_model .predict (features [2 ]) <= 0 )
282+ self .assertTrue (lr_model .predict (features [3 ]) > 0 )
182283
183284 lasso_model = LassoWithSGD .train (rdd )
184285 self .assertTrue (lasso_model .predict (features [0 ]) <= 0 )
185286 self .assertTrue (lasso_model .predict (features [1 ]) > 0 )
186- self .assertTrue (lasso_model .predict (features [2 ]) > 0 )
287+ self .assertTrue (lasso_model .predict (features [2 ]) <= 0 )
288+ self .assertTrue (lasso_model .predict (features [3 ]) > 0 )
187289
188290 rr_model = RidgeRegressionWithSGD .train (rdd )
189291 self .assertTrue (rr_model .predict (features [0 ]) <= 0 )
190292 self .assertTrue (rr_model .predict (features [1 ]) > 0 )
191- self .assertTrue (rr_model .predict (features [2 ]) > 0 )
293+ self .assertTrue (rr_model .predict (features [2 ]) <= 0 )
294+ self .assertTrue (rr_model .predict (features [3 ]) > 0 )
192295
193296
194297if __name__ == "__main__" :
0 commit comments