diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
index eeedbdad..f21bc0c3 100644
--- a/src/python/nimbusml.pyproj
+++ b/src/python/nimbusml.pyproj
@@ -92,6 +92,7 @@
+
@@ -102,6 +103,7 @@
+
@@ -306,6 +308,7 @@
+
@@ -451,6 +454,7 @@
+
@@ -649,6 +653,7 @@
+
@@ -685,6 +690,7 @@
+
diff --git a/src/python/nimbusml/examples/WordTokenizer.py b/src/python/nimbusml/examples/WordTokenizer.py
new file mode 100644
index 00000000..028d5d7e
--- /dev/null
+++ b/src/python/nimbusml/examples/WordTokenizer.py
@@ -0,0 +1,32 @@
+###############################################################################
+# WordTokenizer
+
+from nimbusml import Pipeline, FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.preprocessing.text import WordTokenizer
+
+# data input (as a FileDataStream)
+path = get_dataset("wiki_detox_train").as_filepath()
+
+data = FileDataStream.read_csv(path, sep='\t')
+print(data.head())
+#    Sentiment                                      SentimentText
+# 0          1  ==RUDE== Dude, you are rude upload that carl p...
+# 1          1  == OK! == IM GOING TO VANDALIZE WILD ONES WIK...
+# 2          1  Stop trolling, zapatancas, calling me a liar m...
+# 3          1  ==You're cool== You seem like a really cool g...
+# 4          1  ::::: Why are you threatening me? I'm not bein...
+
+tokenize = WordTokenizer(char_array_term_separators=[" "]) << {'wt': 'SentimentText'}
+pipeline = Pipeline([tokenize])
+
+tokenize.fit(data)
+y = tokenize.transform(data)
+
+print(y.drop(labels='SentimentText', axis=1).head())
+#    Sentiment    wt.000     wt.001       wt.002   wt.003       wt.004  wt.005  ...  wt.366  wt.367  wt.368  wt.369  wt.370  wt.371  wt.372
+# 0          1  ==RUDE==      Dude,          you      are         rude  upload  ...    None    None    None    None    None    None    None
+# 1          1        ==        OK!           ==       IM        GOING      TO  ...    None    None    None    None    None    None    None
+# 2          1      Stop  trolling,  zapatancas,  calling           me       a  ...    None    None    None    None    None    None    None
+# 3          1  ==You're     cool==          You     seem         like       a  ...    None    None    None    None    None    None    None
+# 4          1     :::::        Why          are      you  threatening     me?  ...    None    None    None    None    None    None    None
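For intuition, the space-separated tokenization shown above can be approximated with plain pandas. This is an illustrative sketch only (the example strings are taken from the output above; it is not part of this change):

import pandas

reviews = pandas.Series(["==RUDE== Dude, you are rude", "== OK! =="])
# expand=True gives one column per token and pads shorter rows (None/NaN),
# much like the wt.* columns printed above
print(reviews.str.split(" ", expand=True))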
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
new file mode 100644
index 00000000..31980567
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
@@ -0,0 +1,33 @@
+###############################################################################
+# WordTokenizer
+
+import pandas
+from nimbusml import Pipeline
+from nimbusml.preprocessing.text import WordTokenizer
+
+# create the data
+customer_reviews = pandas.DataFrame(data=dict(review=[
+    "I really did not like the taste of it",
+    "It was surprisingly quite good!",
+    "I will never ever ever go to that place again!!",
+    "The best ever!! It was amazingly good and super fast",
+    "I wish I had gone earlier, it was that great",
+    "somewhat dissapointing. I'd probably wont try again",
+    "Never visit again... rascals!"]))
+
+tokenize = WordTokenizer(char_array_term_separators=[" ", "n"]) << 'review'
+
+pipeline = Pipeline([tokenize])
+
+tokenize.fit(customer_reviews)
+y = tokenize.transform(customer_reviews)
+
+print(y)
+#   review.00  review.01  review.02  review.03  review.04  review.05  review.06  review.07  review.08  review.09  review.10  review.11
+# 0         I     really        did         ot       like        the      taste         of         it       None       None       None
+# 1        It        was   surprisi        gly      quite      good!       None       None       None       None       None       None
+# 2         I       will       ever       ever       ever         go         to       that      place       agai         !!       None
+# 3       The       best     ever!!         It        was      amazi        gly       good          a          d      super       fast
+# 4         I       wish          I        had         go          e   earlier,         it        was       that      great       None
+# 5  somewhat   dissapoi         ti         g.        I'd   probably         wo          t        try       agai       None       None
+# 6     Never      visit       agai        ...   rascals!       None       None       None       None       None       None       None
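Because every entry in char_array_term_separators is a single character, the split above behaves like a character-class regex split with empty tokens dropped; that is why "not" loses its leading "n" and survives as "ot". A quick stdlib check of that equivalence, illustrative only:

import re

text = "I really did not like the taste of it"
# split on either separator, then drop the empty strings that appear
# wherever two separators are adjacent (here, the space before the "n")
tokens = [t for t in re.split("[ n]", text) if t]
print(tokens)
# ['I', 'really', 'did', 'ot', 'like', 'the', 'taste', 'of', 'it']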
diff --git a/src/python/nimbusml/internal/core/preprocessing/text/wordtokenizer.py b/src/python/nimbusml/internal/core/preprocessing/text/wordtokenizer.py
new file mode 100644
index 00000000..66e06176
--- /dev/null
+++ b/src/python/nimbusml/internal/core/preprocessing/text/wordtokenizer.py
@@ -0,0 +1,89 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+WordTokenizer
+"""
+
+__all__ = ["WordTokenizer"]
+
+
+from ....entrypoints.transforms_wordtokenizer import transforms_wordtokenizer
+from ....utils.utils import trace
+from ...base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class WordTokenizer(BasePipelineItem, DefaultSignature):
+    """
+    **Description**
+        The input to this transform is text, and the output is a vector of
+        text containing the words (tokens) in the original text. The
+        separator is space, but can be specified as any other character
+        (or multiple characters) if needed.
+
+    :param char_array_term_separators: Array of single character term
+        separator(s). By default uses space character separator.
+
+    :param params: Additional arguments sent to compute engine.
+
+    """
+
+    @trace
+    def __init__(
+            self,
+            char_array_term_separators=None,
+            **params):
+        BasePipelineItem.__init__(
+            self, type='transform', **params)
+
+        self.char_array_term_separators = char_array_term_separators
+
+    @property
+    def _entrypoint(self):
+        return transforms_wordtokenizer
+
+    @trace
+    def _get_node(self, **all_args):
+
+        input_columns = self.input
+        if input_columns is None and 'input' in all_args:
+            input_columns = all_args['input']
+        if 'input' in all_args:
+            all_args.pop('input')
+
+        output_columns = self.output
+        if output_columns is None and 'output' in all_args:
+            output_columns = all_args['output']
+        if 'output' in all_args:
+            all_args.pop('output')
+
+        # validate input
+        if input_columns is None:
+            raise ValueError(
+                "'None' input passed when it cannot be none.")
+
+        if not isinstance(input_columns, list):
+            raise ValueError(
+                "input has to be a list of strings, instead got %s" %
+                type(input_columns))
+
+        # validate output
+        if output_columns is None:
+            output_columns = input_columns
+
+        if not isinstance(output_columns, list):
+            raise ValueError(
+                "output has to be a list of strings, instead got %s" %
+                type(output_columns))
+
+        algo_args = dict(
+            column=[
+                dict(Source=i, Name=o)
+                for i, o in zip(input_columns, output_columns)
+            ] if input_columns else None,
+            char_array_term_separators=self.char_array_term_separators)
+
+        all_args.update(algo_args)
+        return self._entrypoint(**all_args)
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_wordtokenizer.py b/src/python/nimbusml/internal/entrypoints/transforms_wordtokenizer.py
new file mode 100644
index 00000000..e7fac07a
--- /dev/null
+++ b/src/python/nimbusml/internal/entrypoints/transforms_wordtokenizer.py
@@ -0,0 +1,76 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+Transforms.WordTokenizer
+"""
+
+
+from ..utils.entrypoints import EntryPoint
+from ..utils.utils import try_set, unlist
+
+
+def transforms_wordtokenizer(
+        data,
+        output_data=None,
+        model=None,
+        column=None,
+        char_array_term_separators=None,
+        **params):
+    """
+    **Description**
+        The input to this transform is text, and the output is a vector of
+        text containing the words (tokens) in the original text. The
+        separator is space, but can be specified as any other character
+        (or multiple characters) if needed.
+
+    :param column: New column definition(s) (inputs).
+    :param data: Input dataset (inputs).
+    :param char_array_term_separators: Array of single character term
+        separator(s). By default uses space character separator.
+        (inputs).
+    :param output_data: Transformed dataset (outputs).
+    :param model: Transform model (outputs).
+    """
+
+    entrypoint_name = 'Transforms.WordTokenizer'
+    inputs = {}
+    outputs = {}
+
+    if column is not None:
+        inputs['Column'] = try_set(
+            obj=column,
+            none_acceptable=True,
+            is_of_type=list,
+            is_column=True)
+    if data is not None:
+        inputs['Data'] = try_set(
+            obj=data,
+            none_acceptable=False,
+            is_of_type=str)
+    if char_array_term_separators is not None:
+        inputs['CharArrayTermSeparators'] = try_set(
+            obj=char_array_term_separators,
+            none_acceptable=True,
+            is_of_type=list)
+    if output_data is not None:
+        outputs['OutputData'] = try_set(
+            obj=output_data,
+            none_acceptable=False,
+            is_of_type=str)
+    if model is not None:
+        outputs['Model'] = try_set(
+            obj=model,
+            none_acceptable=False,
+            is_of_type=str)
+
+    input_variables = {
+        x for x in unlist(inputs.values())
+        if isinstance(x, str) and x.startswith("$")}
+    output_variables = {
+        x for x in unlist(outputs.values())
+        if isinstance(x, str) and x.startswith("$")}
+
+    entrypoint = EntryPoint(
+        name=entrypoint_name, inputs=inputs, outputs=outputs,
+        input_variables=input_variables,
+        output_variables=output_variables)
+    return entrypoint
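The generated entrypoint above is a thin builder for one node of the execution graph. A minimal sketch of calling it directly, based only on the signature shown above; the "$input", "$output" and "$model" graph variable names are illustrative placeholders:

from nimbusml.internal.entrypoints.transforms_wordtokenizer import \
    transforms_wordtokenizer

# build the Transforms.WordTokenizer graph node; each Source/Name pair
# maps an input column to an output column
node = transforms_wordtokenizer(
    data="$input",
    output_data="$output",
    model="$model",
    column=[dict(Source="review", Name="review")],
    char_array_term_separators=[" ", "n"])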
diff --git a/src/python/nimbusml/preprocessing/text/__init__.py b/src/python/nimbusml/preprocessing/text/__init__.py
index b255f350..686e7c5a 100644
--- a/src/python/nimbusml/preprocessing/text/__init__.py
+++ b/src/python/nimbusml/preprocessing/text/__init__.py
@@ -1,5 +1,7 @@
 from .chartokenizer import CharTokenizer
+from .wordtokenizer import WordTokenizer
 
 __all__ = [
-    'CharTokenizer'
+    'CharTokenizer',
+    'WordTokenizer'
 ]
diff --git a/src/python/nimbusml/preprocessing/text/wordtokenizer.py b/src/python/nimbusml/preprocessing/text/wordtokenizer.py
new file mode 100644
index 00000000..54a0ffe0
--- /dev/null
+++ b/src/python/nimbusml/preprocessing/text/wordtokenizer.py
@@ -0,0 +1,55 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+WordTokenizer
+"""
+
+__all__ = ["WordTokenizer"]
+
+
+from sklearn.base import TransformerMixin
+
+from ...base_transform import BaseTransform
+from ...internal.core.preprocessing.text.wordtokenizer import \
+    WordTokenizer as core
+from ...internal.utils.utils import trace
+
+
+class WordTokenizer(core, BaseTransform, TransformerMixin):
+    """
+    **Description**
+        The input to this transform is text, and the output is a vector of
+        text containing the words (tokens) in the original text. The
+        separator is space, but can be specified as any other character
+        (or multiple characters) if needed.
+
+    :param columns: see `Columns `_.
+
+    :param char_array_term_separators: Array of single character term
+        separator(s). By default uses space character separator.
+
+    :param params: Additional arguments sent to compute engine.
+
+    """
+
+    @trace
+    def __init__(
+            self,
+            char_array_term_separators=None,
+            columns=None,
+            **params):
+
+        if columns:
+            params['columns'] = columns
+        BaseTransform.__init__(self, **params)
+        core.__init__(
+            self,
+            char_array_term_separators=char_array_term_separators,
+            **params)
+        self._columns = columns
+
+    def get_params(self, deep=False):
+        """
+        Get the parameters for this operator.
+        """
+        return core.get_params(self)
diff --git a/src/python/nimbusml/tests/preprocessing/text/test_wordtokenizer.py b/src/python/nimbusml/tests/preprocessing/text/test_wordtokenizer.py
new file mode 100644
index 00000000..a8c66016
--- /dev/null
+++ b/src/python/nimbusml/tests/preprocessing/text/test_wordtokenizer.py
@@ -0,0 +1,33 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+import unittest
+
+import pandas
+from nimbusml import Pipeline
+from nimbusml.preprocessing.text import WordTokenizer
+
+
+class TestWordTokenizer(unittest.TestCase):
+
+    def test_wordtokenizer(self):
+        customer_reviews = pandas.DataFrame(data=dict(review=[
+            "I really did not like the taste of it",
+            "It was surprisingly quite good!"]))
+
+        tokenize = WordTokenizer(char_array_term_separators=[" ", "n"]) << 'review'
+        pipeline = Pipeline([tokenize])
+
+        tokenize.fit(customer_reviews)
+        y = tokenize.transform(customer_reviews)
+
+        self.assertEqual(y.shape, (2, 9))
+
+        self.assertEqual(y.loc[0, 'review.3'], 'ot')
+        self.assertEqual(y.loc[1, 'review.3'], 'gly')
+        self.assertEqual(y.loc[1, 'review.6'], None)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/src/python/tests/test_estimator_checks.py b/src/python/tests/test_estimator_checks.py
index eb39246b..e47ce638 100644
--- a/src/python/tests/test_estimator_checks.py
+++ b/src/python/tests/test_estimator_checks.py
@@ -160,7 +160,7 @@
     'PixelExtractor, Loader, Resizer, \
GlobalContrastRowScaler, PcaTransformer, '
     'ColumnConcatenator, Sentiment, CharTokenizer, LightLda, '
-    'NGramFeaturizer, WordEmbedding, LpScaler',
+    'NGramFeaturizer, WordEmbedding, LpScaler, WordTokenizer',
     'check_transformer_data_not_an_array, check_pipeline_consistency, '
     'check_fit2d_1feature, check_estimators_fit_returns_self,\
check_fit2d_1sample, '
diff --git a/src/python/tools/entrypoint_compiler.py b/src/python/tools/entrypoint_compiler.py
index b2765691..ed829533 100644
--- a/src/python/tools/entrypoint_compiler.py
+++ b/src/python/tools/entrypoint_compiler.py
@@ -1457,7 +1457,7 @@ def parse_arg(argument, inout):
         assert not is_column
         arg_obj = NumericArrayArg(argument, inout)
     elif itemType in ["String", "DataView", "PredictorModel",
-                      "TransformModel", "Node"]:
+                      "TransformModel", "Node", "Char"]:
         arg_obj = StringArrayArg(argument, inout,
                                  is_column=is_column)
     elif isinstance(itemType, dict):
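Both examples and the test construct a Pipeline but then fit the transform directly; the two forms are interchangeable here. A sketch of the pipeline form, assuming fit_transform behaves as in the other nimbusml dataframe examples:

import pandas
from nimbusml import Pipeline
from nimbusml.preprocessing.text import WordTokenizer

df = pandas.DataFrame(data=dict(review=[
    "I really did not like the taste of it"]))

# run the tokenizer through the Pipeline wrapper instead of standalone
pipeline = Pipeline([
    WordTokenizer(char_array_term_separators=[" ", "n"]) << 'review'])
y = pipeline.fit_transform(df)
print(y.shape)   # one column per surviving token: (1, 9)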
"preprocessing.text", "Type": "Transform" }, + { + "Name": "Transforms.WordTokenizer", + "NewName": "WordTokenizer", + "Module": "preprocessing.text", + "Type": "Transform" + }, { "Name": "Transforms.LightLda", "NewName": "LightLda",