Skip to content

Commit affceb9

Browse files
committed
* Fixed the doc tests in pyspark/mllib/util.py, which were broken by a change in loadLibSVMFile behavior: it used to threshold labels at 0 to make them 0/1, but it now leaves them as they are, so the expected label for the "-1" example rows is now -1.0 instead of 0.0.
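* Fixed a small bug in loadLibSVMFile: if no row in a data file had any features, loadLibSVMFile would infer numFeatures = 1 (a single all-zero feature) instead of 0. Rows without features now contribute -1 rather than 0 as their largest index, so the computed maximum index plus one is 0 in that case.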
1 parent 67a29bc commit affceb9

File tree

1 file changed

+3
-6
lines changed

1 file changed

+3
-6
lines changed

python/pyspark/mllib/util.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,6 @@ def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None):
107107
>>> tempFile.write("+1 1:1.0 3:2.0 5:3.0\\n-1\\n-1 2:4.0 4:5.0 6:6.0")
108108
>>> tempFile.flush()
109109
>>> examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
110-
>>> multiclass_examples = MLUtils.loadLibSVMFile(sc, tempFile.name).collect()
111110
>>> tempFile.close()
112111
>>> type(examples[0]) == LabeledPoint
113112
True
@@ -116,20 +115,18 @@ def loadLibSVMFile(sc, path, numFeatures=-1, minPartitions=None):
116115
>>> type(examples[1]) == LabeledPoint
117116
True
118117
>>> print examples[1]
119-
(0.0,(6,[],[]))
118+
(-1.0,(6,[],[]))
120119
>>> type(examples[2]) == LabeledPoint
121120
True
122121
>>> print examples[2]
123-
(0.0,(6,[1,3,5],[4.0,5.0,6.0]))
124-
>>> multiclass_examples[1].label
125-
-1.0
122+
(-1.0,(6,[1,3,5],[4.0,5.0,6.0]))
126123
"""
127124

128125
lines = sc.textFile(path, minPartitions)
129126
parsed = lines.map(lambda l: MLUtils._parse_libsvm_line(l))
130127
if numFeatures <= 0:
131128
parsed.cache()
132-
numFeatures = parsed.map(lambda x: 0 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
129+
numFeatures = parsed.map(lambda x: -1 if x[1].size == 0 else x[1][-1]).reduce(max) + 1
133130
return parsed.map(lambda x: LabeledPoint(x[0], Vectors.sparse(numFeatures, x[1], x[2])))
134131

135132
@staticmethod

0 commit comments

Comments
 (0)