
Commit 1d2f28f

Author: VinceShieh (committed)

include changes made by SPARK-11569

Signed-off-by: VinceShieh <[email protected]>

1 parent b4bb765 · commit 1d2f28f

File tree: 1 file changed, +5 −5 lines

python/pyspark/ml/feature.py

Lines changed: 5 additions & 5 deletions
@@ -1936,7 +1936,7 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Ja
     ...     key=lambda x: x[0])
     [(0, 'a'), (1, 'b'), (2, 'c'), (3, 'a'), (4, 'a'), (5, 'c')]
     >>> testData2 = sc.parallelize([Row(id=0, label="a"), Row(id=1, label="d"),
-    ...                             Row(id=2, label="e")], 2)
+    ...                             Row(id=2, label=None)], 2)
     >>> dfKeep = spark.createDataFrame(testData2)
     >>> modelKeep = stringIndexer.setHandleInvalid("keep").fit(stringIndDf)
     >>> tdK = modelKeep.transform(dfKeep)
@@ -1962,10 +1962,10 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Ja
     .. versionadded:: 1.4.0
     """

-    handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle unseen labels. " +
-                          "Options are 'skip' (filter out rows with unseen labels), " +
-                          "error (throw an error), or 'keep' (put unseen labels in a special " +
-                          "additional bucket, at index numLabels).",
+    handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid data (unseen " +
+                          "labels or NULL values). Options are 'skip' (filter out rows with " +
+                          "invalid data), error (throw an error), or 'keep' (put invalid data " +
+                          "in a special additional bucket, at index numLabels).",
                           typeConverter=TypeConverters.toString)

     @keyword_only
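
For context, a minimal sketch of how the documented handleInvalid behavior can be exercised from PySpark. The DataFrames, column names, and the SparkSession variable spark used below are illustrative assumptions, not part of this commit:

from pyspark.ml.feature import StringIndexer

# Assumed setup: an active SparkSession bound to the name `spark`.
train = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c")], ["id", "label"])
test = spark.createDataFrame([(0, "a"), (1, "d"), (2, None)], ["id", "label"])

# Per the updated Param description, handleInvalid="keep" keeps rows whose
# label is unseen ("d") or NULL and maps them to the extra bucket at index
# numLabels; "skip" would filter those rows out, and "error" would raise.
indexer = StringIndexer(inputCol="label", outputCol="labelIndex",
                        handleInvalid="keep")
model = indexer.fit(train)
model.transform(test).show()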
