Skip to content

Commit c48e85a

Browse files
committed
Added some tests for passing plain Python lists as input, and added
python/pyspark/mllib/tests.py to the run-tests script.
1 parent a07ba10 commit c48e85a

File tree

2 files changed

+128
-24
lines changed

2 files changed

+128
-24
lines changed

python/pyspark/mllib/tests.py

Lines changed: 127 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from pyspark.mllib._common import _convert_vector, _serialize_double_vector, \
2626
_deserialize_double_vector, _dot, _squared_distance
2727
from pyspark.mllib.linalg import SparseVector
28+
from pyspark.mllib.regression import LabeledPoint
2829
from pyspark.tests import PySparkTestCase
2930

3031

@@ -41,16 +42,21 @@ class VectorTests(unittest.TestCase):
4142
def test_serialize(self):
    """Vectors given as SparseVector, NumPy array, or plain list must all
    round-trip through _serialize_double_vector/_deserialize_double_vector.

    Fix: `assertEquals` is a deprecated alias of `assertEqual` (removed in
    Python 3.12); use the canonical name.
    """
    sv = SparseVector(4, {1: 1, 3: 2})
    dv = array([1., 2., 3., 4.])
    lst = [1, 2, 3, 4]
    # _convert_vector must pass SparseVector and ndarray through unchanged...
    self.assertTrue(sv is _convert_vector(sv))
    self.assertTrue(dv is _convert_vector(dv))
    # ...but convert a plain list into an equivalent NumPy array.
    self.assertTrue(array_equal(dv, _convert_vector(lst)))
    # Serialization round-trips for each input representation.
    self.assertEqual(sv,
                     _deserialize_double_vector(_serialize_double_vector(sv)))
    self.assertTrue(array_equal(dv,
                    _deserialize_double_vector(_serialize_double_vector(dv))))
    self.assertTrue(array_equal(dv,
                    _deserialize_double_vector(_serialize_double_vector(lst))))
5055

5156
def test_dot(self):
5257
sv = SparseVector(4, {1: 1, 3: 2})
5358
dv = array([1., 2., 3., 4.])
59+
lst = [1, 2, 3, 4]
5460
mat = array([[1., 2., 3., 4.],
5561
[1., 2., 3., 4.],
5662
[1., 2., 3., 4.],
@@ -59,10 +65,109 @@ def test_dot(self):
5965
self.assertTrue(array_equal(array([3., 6., 9., 12.]), _dot(sv, mat)))
6066
self.assertEquals(30.0, _dot(dv, dv))
6167
self.assertTrue(array_equal(array([10., 20., 30., 40.]), _dot(dv, mat)))
68+
self.assertEquals(30.0, _dot(lst, dv))
69+
self.assertTrue(array_equal(array([10., 20., 30., 40.]), _dot(lst, mat)))
70+
71+
def test_squared_distance(self):
    """_squared_distance must accept every ordered pair drawn from
    {SparseVector, ndarray, list} and be symmetric, with zero self-distance.

    Fix: replace the deprecated `assertEquals` alias (removed in
    Python 3.12) with `assertEqual`.
    """
    sv = SparseVector(4, {1: 1, 3: 2})
    dv = array([1., 2., 3., 4.])
    lst = [4, 3, 2, 1]
    # Mixed-type pairs, both argument orders.
    self.assertEqual(15.0, _squared_distance(sv, dv))
    self.assertEqual(25.0, _squared_distance(sv, lst))
    self.assertEqual(20.0, _squared_distance(dv, lst))
    self.assertEqual(15.0, _squared_distance(dv, sv))
    self.assertEqual(25.0, _squared_distance(lst, sv))
    self.assertEqual(20.0, _squared_distance(lst, dv))
    # Distance from a vector to itself is always zero.
    self.assertEqual(0.0, _squared_distance(sv, sv))
    self.assertEqual(0.0, _squared_distance(dv, dv))
    self.assertEqual(0.0, _squared_distance(lst, lst))
84+
85+
86+
class ListTests(PySparkTestCase):

    """
    Test MLlib algorithms on plain lists, to make sure they're passed through
    as NumPy arrays.

    Fix: `assertEquals` is a deprecated alias of `assertEqual` (removed in
    Python 3.12); use the canonical name.
    """

    def test_clustering(self):
        """KMeans must accept an RDD of plain Python lists."""
        from pyspark.mllib.clustering import KMeans
        data = [
            [0, 1.1],
            [0, 1.2],
            [1.1, 0],
            [1.2, 0],
        ]
        clusters = KMeans.train(self.sc.parallelize(data), 2, initializationMode="k-means||")
        # Points 0/1 and 2/3 form two tight clusters; each pair must be
        # assigned to the same cluster (cluster ids themselves are arbitrary).
        self.assertEqual(clusters.predict(data[0]), clusters.predict(data[1]))
        self.assertEqual(clusters.predict(data[2]), clusters.predict(data[3]))

    def test_classification(self):
        """Classifiers must train on LabeledPoints whose features are lists
        and predict correctly on list inputs."""
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        data = [
            LabeledPoint(0.0, [1, 0]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(0.0, [2, 0]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        # Predict on plain lists (tolist), not NumPy arrays, on purpose.
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

    def test_regression(self):
        """Regressors must train on LabeledPoints whose features are lists
        and predict the right sign on list inputs."""
        from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
            RidgeRegressionWithSGD
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        # Predict on plain lists (tolist), not NumPy arrays, on purpose.
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)
62162

63163

64164
@unittest.skipIf(not _have_scipy, "SciPy not installed")
65165
class SciPyTests(PySparkTestCase):
166+
"""
167+
Test both vector operations and MLlib algorithms with SciPy sparse matrices,
168+
if SciPy is available.
169+
"""
170+
66171
def test_serialize(self):
67172
from scipy.sparse import lil_matrix
68173
lil = lil_matrix((4, 1))
@@ -132,63 +237,61 @@ def test_clustering(self):
132237
def test_classification(self):
    """Every MLlib classifier must accept LabeledPoints whose features are
    SciPy sparse matrices, both for training and for prediction."""
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    data = [
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    # Train each classifier in turn (same order as before: logistic
    # regression, SVM, naive Bayes) and check the predicted sign on every
    # training point: class 0 examples must score <= 0, class 1 examples > 0.
    for trainer in (LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes):
        model = trainer.train(rdd)
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)
161265

162266
def test_regression(self):
    """Every MLlib regressor must accept LabeledPoints whose features are
    SciPy sparse matrices, both for training and for prediction."""
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    data = [
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    # Train each regressor in turn (same order as before: linear, lasso,
    # ridge) and check the predicted sign on every training point:
    # negative-label examples must score <= 0, positive-label examples > 0.
    for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
        model = trainer.train(rdd)
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)
192295

193296

194297
if __name__ == "__main__":

python/run-tests

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ run_test "pyspark/mllib/clustering.py"
6060
run_test "pyspark/mllib/linalg.py"
6161
run_test "pyspark/mllib/recommendation.py"
6262
run_test "pyspark/mllib/regression.py"
63+
run_test "pyspark/mllib/tests.py"
6364

6465
if [[ $FAILED == 0 ]]; then
6566
echo -en "\033[32m" # Green

0 commit comments

Comments
 (0)