Skip to content

Commit 1dc30f1

Browse files
BryanCutler authored and rxin committed
[DOC][MINOR] ml.feature Scala and Python API sync
I reviewed the Scala and Python APIs for ml.feature and corrected discrepancies. Testing: built docs locally and ran style checks. Author: Bryan Cutler <[email protected]>. Closes #13159 from BryanCutler/ml.feature-api-sync. (cherry picked from commit b1bc5eb). Signed-off-by: Reynold Xin <[email protected]>
1 parent dcf36ad commit 1dc30f1

File tree

5 files changed

+36
-19
lines changed

5 files changed

+36
-19
lines changed

mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -38,12 +38,12 @@ import org.apache.spark.sql.types.StructType
3838
private[feature] trait IDFBase extends Params with HasInputCol with HasOutputCol {
3939

4040
/**
41-
* The minimum of documents in which a term should appear.
41+
* The minimum number of documents in which a term should appear.
4242
* Default: 0
4343
* @group param
4444
*/
4545
final val minDocFreq = new IntParam(
46-
this, "minDocFreq", "minimum of documents in which a term should appear for filtering")
46+
this, "minDocFreq", "minimum number of documents in which a term should appear for filtering")
4747

4848
setDefault(minDocFreq -> 0)
4949

mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,8 @@ private[feature] trait PCAParams extends Params with HasInputCol with HasOutputC
5353

5454
/**
5555
* :: Experimental ::
56-
* PCA trains a model to project vectors to a low-dimensional space using PCA.
56+
* PCA trains a model to project vectors to a lower dimensional space of the top [[PCA!.k]]
57+
* principal components.
5758
*/
5859
@Experimental
5960
class PCA (override val uid: String) extends Estimator[PCAModel] with PCAParams
@@ -106,7 +107,7 @@ object PCA extends DefaultParamsReadable[PCA] {
106107

107108
/**
108109
* :: Experimental ::
109-
* Model fitted by [[PCA]].
110+
* Model fitted by [[PCA]]. Transforms vectors to a lower dimensional space.
110111
*
111112
* @param pc A principal components Matrix. Each column is one principal component.
112113
* @param explainedVariance A vector of proportions of variance explained by

mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -194,7 +194,9 @@ object RFormula extends DefaultParamsReadable[RFormula] {
194194

195195
/**
196196
* :: Experimental ::
197-
* A fitted RFormula. Fitting is required to determine the factor levels of formula terms.
197+
* Model fitted by [[RFormula]]. Fitting is required to determine the factor levels of
198+
* formula terms.
199+
*
198200
* @param resolvedFormula the fitted R formula.
199201
* @param pipelineModel the fitted feature model, including factor to index mappings.
200202
*/

mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -240,7 +240,8 @@ object VectorIndexer extends DefaultParamsReadable[VectorIndexer] {
240240

241241
/**
242242
* :: Experimental ::
243-
* Transform categorical features to use 0-based indices instead of their original values.
243+
* Model fitted by [[VectorIndexer]]. Transform categorical features to use 0-based indices
244+
* instead of their original values.
244245
* - Categorical features are mapped to indices.
245246
* - Continuous features (columns) are left unchanged.
246247
* This also appends metadata to the output column, marking features as Numeric (continuous),

python/pyspark/ml/feature.py

Lines changed: 26 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -352,7 +352,7 @@ class CountVectorizerModel(JavaModel, JavaMLReadable, JavaMLWritable):
352352
"""
353353
.. note:: Experimental
354354
355-
Model fitted by CountVectorizer.
355+
Model fitted by :py:class:`CountVectorizer`.
356356
357357
.. versionadded:: 1.6.0
358358
"""
@@ -609,7 +609,7 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab
609609
"""
610610

611611
minDocFreq = Param(Params._dummy(), "minDocFreq",
612-
"minimum of documents in which a term should appear for filtering",
612+
"minimum number of documents in which a term should appear for filtering",
613613
typeConverter=TypeConverters.toInt)
614614

615615
@keyword_only
@@ -655,7 +655,7 @@ class IDFModel(JavaModel, JavaMLReadable, JavaMLWritable):
655655
"""
656656
.. note:: Experimental
657657
658-
Model fitted by IDF.
658+
Model fitted by :py:class:`IDF`.
659659
660660
.. versionadded:: 1.4.0
661661
"""
@@ -1302,7 +1302,8 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable,
13021302

13031303
minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)",
13041304
typeConverter=TypeConverters.toInt)
1305-
gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens")
1305+
gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens " +
1306+
"(False)")
13061307
pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing",
13071308
typeConverter=TypeConverters.toString)
13081309
toLowercase = Param(Params._dummy(), "toLowercase", "whether to convert all characters to " +
@@ -1549,7 +1550,7 @@ class StandardScalerModel(JavaModel, JavaMLReadable, JavaMLWritable):
15491550
"""
15501551
.. note:: Experimental
15511552
1552-
Model fitted by StandardScaler.
1553+
Model fitted by :py:class:`StandardScaler`.
15531554
15541555
.. versionadded:: 1.4.0
15551556
"""
@@ -1641,7 +1642,7 @@ class StringIndexerModel(JavaModel, JavaMLReadable, JavaMLWritable):
16411642
"""
16421643
.. note:: Experimental
16431644
1644-
Model fitted by StringIndexer.
1645+
Model fitted by :py:class:`StringIndexer`.
16451646
16461647
.. versionadded:: 1.4.0
16471648
"""
@@ -1907,7 +1908,7 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Ja
19071908
"""
19081909
.. note:: Experimental
19091910
1910-
Class for indexing categorical feature columns in a dataset of [[Vector]].
1911+
Class for indexing categorical feature columns in a dataset of `Vector`.
19111912
19121913
This has 2 usage modes:
19131914
- Automatically identify categorical features (default behavior)
@@ -2023,7 +2024,17 @@ class VectorIndexerModel(JavaModel, JavaMLReadable, JavaMLWritable):
20232024
"""
20242025
.. note:: Experimental
20252026
2026-
Model fitted by VectorIndexer.
2027+
Model fitted by :py:class:`VectorIndexer`.
2028+
2029+
Transform categorical features to use 0-based indices instead of their original values.
2030+
- Categorical features are mapped to indices.
2031+
- Continuous features (columns) are left unchanged.
2032+
2033+
This also appends metadata to the output column, marking features as Numeric (continuous),
2034+
Nominal (categorical), or Binary (either continuous or categorical).
2035+
Non-ML metadata is not carried over from the input to the output column.
2036+
2037+
This maintains vector sparsity.
20272038
20282039
.. versionadded:: 1.4.0
20292040
"""
@@ -2296,7 +2307,7 @@ class Word2VecModel(JavaModel, JavaMLReadable, JavaMLWritable):
22962307
"""
22972308
.. note:: Experimental
22982309
2299-
Model fitted by Word2Vec.
2310+
Model fitted by :py:class:`Word2Vec`.
23002311
23012312
.. versionadded:: 1.4.0
23022313
"""
@@ -2327,7 +2338,8 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab
23272338
"""
23282339
.. note:: Experimental
23292340
2330-
PCA trains a model to project vectors to a low-dimensional space using PCA.
2341+
PCA trains a model to project vectors to a lower dimensional space of the
2342+
top :py:attr:`k` principal components.
23312343
23322344
>>> from pyspark.ml.linalg import Vectors
23332345
>>> data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
@@ -2401,7 +2413,7 @@ class PCAModel(JavaModel, JavaMLReadable, JavaMLWritable):
24012413
"""
24022414
.. note:: Experimental
24032415
2404-
Model fitted by PCA.
2416+
Model fitted by :py:class:`PCA`. Transforms vectors to a lower dimensional space.
24052417
24062418
.. versionadded:: 1.5.0
24072419
"""
@@ -2532,7 +2544,8 @@ class RFormulaModel(JavaModel, JavaMLReadable, JavaMLWritable):
25322544
"""
25332545
.. note:: Experimental
25342546
2535-
Model fitted by :py:class:`RFormula`.
2547+
Model fitted by :py:class:`RFormula`. Fitting is required to determine the
2548+
factor levels of formula terms.
25362549
25372550
.. versionadded:: 1.5.0
25382551
"""
@@ -2624,7 +2637,7 @@ class ChiSqSelectorModel(JavaModel, JavaMLReadable, JavaMLWritable):
26242637
"""
26252638
.. note:: Experimental
26262639
2627-
Model fitted by ChiSqSelector.
2640+
Model fitted by :py:class:`ChiSqSelector`.
26282641
26292642
.. versionadded:: 2.0.0
26302643
"""

0 commit comments

Comments
 (0)