-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-9679][ML][PYSPARK] Add Python API for Stop Words Remover #8118
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
78c0362
7a65dc3
c634fa1
ec6baab
b98bb1d
d8e3672
12fb73e
c366fd6
84bc507
acfc9fe
53f97b7
7767df0
e36703e
345bde2
c6dee1b
62b821a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,15 +22,15 @@ | |
| from pyspark.rdd import ignore_unicode_prefix | ||
| from pyspark.ml.param.shared import * | ||
| from pyspark.ml.util import keyword_only | ||
| from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer | ||
| from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer, _jvm | ||
| from pyspark.mllib.common import inherit_doc | ||
| from pyspark.mllib.linalg import _convert_to_vector | ||
|
|
||
| __all__ = ['Binarizer', 'Bucketizer', 'DCT', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', | ||
| 'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', | ||
| 'SQLTransformer', 'StandardScaler', 'StandardScalerModel', 'StringIndexer', | ||
| 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', | ||
| 'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel'] | ||
| 'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel', 'StopWordsRemover'] | ||
|
|
||
|
|
||
| @inherit_doc | ||
|
|
@@ -933,6 +933,75 @@ class StringIndexerModel(JavaModel): | |
| """ | ||
|
|
||
|
|
||
| class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol): | ||
| """ | ||
| .. note:: Experimental | ||
|
|
||
| A feature transformer that filters out stop words from input. | ||
| Note: null values in the input array are preserved unless null is explicitly added to stopWords. | ||
| """ | ||
| # a placeholder to make the stopwords show up in generated doc | ||
| stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we also provide
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sure.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks! |
||
| caseSensitive = Param(Params._dummy(), "caseSensitive", "whether to do a case sensitive " + | ||
| "comparison over the stop words") | ||
|
|
||
| @keyword_only | ||
| def __init__(self, inputCol=None, outputCol=None, stopWords=None, | ||
| caseSensitive=False): | ||
| """ | ||
| __init__(self, inputCol=None, outputCol=None, stopWords=None,\ | ||
| caseSensitive=False) | ||
| """ | ||
| super(StopWordsRemover, self).__init__() | ||
| self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover", | ||
| self.uid) | ||
| self.stopWords = Param(self, "stopWords", "The words to be filtered out") | ||
| self.caseSensitive = Param(self, "caseSensitive", "whether to do a case " + | ||
| "sensitive comparison over the stop words") | ||
| stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWords | ||
| defaultStopWords = stopWordsObj.English() | ||
| self._setDefault(stopWords=defaultStopWords) | ||
| kwargs = self.__init__._input_kwargs | ||
| self.setParams(**kwargs) | ||
|
|
||
| @keyword_only | ||
| def setParams(self, inputCol=None, outputCol=None, stopWords=None, | ||
| caseSensitive=False): | ||
| """ | ||
| setParams(self, inputCol=None, outputCol=None, stopWords=None,\ | ||
| caseSensitive=False) | ||
| Sets params for this StopWordsRemover. | ||
| """ | ||
| kwargs = self.setParams._input_kwargs | ||
| return self._set(**kwargs) | ||
|
|
||
| def setStopWords(self, value): | ||
| """ | ||
| Specify the stopwords to be filtered. | ||
| """ | ||
| self._paramMap[self.stopWords] = value | ||
| return self | ||
|
|
||
| def getStopWords(self): | ||
| """ | ||
| Get the stopwords. | ||
| """ | ||
| return self.getOrDefault(self.stopWords) | ||
|
|
||
| def setCaseSensitive(self, value): | ||
| """ | ||
| Set whether to do a case sensitive comparison over the stop words. | ||
| """ | ||
| self._paramMap[self.caseSensitive] = value | ||
| return self | ||
|
|
||
| def getCaseSensitive(self): | ||
| """ | ||
| Get whether to do a case sensitive comparison over the stop words. | ||
| """ | ||
| return self.getOrDefault(self.caseSensitive) | ||
|
|
||
|
|
||
| @inherit_doc | ||
| @ignore_unicode_prefix | ||
| class Tokenizer(JavaTransformer, HasInputCol, HasOutputCol): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
style checker wants 2 blank lines