From 9f66e3bb0c5f9b917c1b31fd2aee3737ce28660c Mon Sep 17 00:00:00 2001
From: "pieths.dev@gmail.com" <pieths.dev@gmail.com>
Date: Fri, 30 Aug 2019 13:10:47 -0700
Subject: [PATCH 1/8] Initial test of WordTokenizer.

---
 src/DotNetBridge/NativeDataInterop.cs         | 44 ++++++++++
 src/python/nimbusml.pyproj                    |  4 +
 .../WordTokenizer_df.py                       | 26 ++++++
 .../core/preprocessing/text/wordtokenizer.py  | 89 +++++++++++++++++++
 .../entrypoints/transforms_wordtokenizer.py   | 76 ++++++++++++++++
 .../nimbusml/preprocessing/text/__init__.py   |  4 +-
 .../preprocessing/text/wordtokenizer.py       | 55 ++++++++++++
 src/python/tools/entrypoint_compiler.py       |  2 +-
 src/python/tools/manifest_diff.json           |  6 ++
 9 files changed, 304 insertions(+), 2 deletions(-)
 create mode 100644 src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
 create mode 100644 src/python/nimbusml/internal/core/preprocessing/text/wordtokenizer.py
 create mode 100644 src/python/nimbusml/internal/entrypoints/transforms_wordtokenizer.py
 create mode 100644 src/python/nimbusml/preprocessing/text/wordtokenizer.py

diff --git a/src/DotNetBridge/NativeDataInterop.cs b/src/DotNetBridge/NativeDataInterop.cs
index c9b70526..9f67595f 100644
--- a/src/DotNetBridge/NativeDataInterop.cs
+++ b/src/DotNetBridge/NativeDataInterop.cs
@@ -93,6 +93,47 @@ public ColumnMetadataInfo(bool expand, string[] slotNames, Dictionary<uint, ReadOnlyMemory<char>> keyValues)
+        private static void PrintDataView(IDataView view)
+        {
+            var schema = view.Schema;
+            var colIndices = new List<int>();
+
+            for (int col = 0; col < schema.Count; col++)
+            {
+                if (schema[col].IsHidden)
+                    continue;
+
+                colIndices.Add(col);
+            }
+
+            using (var cursor = view.GetRowCursor(view.Schema.Where(col => colIndices.Contains(col.Index))))
+            {
+                var type = schema[colIndices[0]].Type;
+
+                ValueGetter<VBuffer<ReadOnlyMemory<char>>> _getVec =
+                    RowCursorUtils.GetVecGetterAs<ReadOnlyMemory<char>>((PrimitiveDataViewType)type.GetItemType(), cursor, colIndices[0]);
+                VBuffer<ReadOnlyMemory<char>> _buffer = new VBuffer<ReadOnlyMemory<char>>();
+
+                for (int crow = 0; ; crow++)
+                {
+                    // Advance to the next row.
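+                    // MoveNext returns false once the cursor has passed the last row, ending the loop.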
+                    if (!cursor.MoveNext())
+                        break;
+
+                    string rowOutput = "[";
+
+                    _getVec(ref _buffer);
+                    for (int i = 0; i < _buffer.Length; i++)
+                    {
+                        rowOutput += _buffer.GetValues()[i] + ", ";
+                    }
+                    rowOutput += "]";
+
+                    System.Console.WriteLine(rowOutput);
+                }
+            }
+        }
+
         private static unsafe void SendViewToNative(IChannel ch, EnvironmentBlock* penv, IDataView view,
             Dictionary<string, ColumnMetadataInfo> infos = null)
         {
             Contracts.AssertValue(ch);
@@ -117,6 +158,9 @@ private static unsafe void SendViewToNative(IChannel ch, EnvironmentBlock* penv,
             var expandCols = new HashSet<int>();
             var allNames = new HashSet<string>();
 
+            // Only works with WordTokenizer_df.py
+            PrintDataView(view);
+
             for (int col = 0; col < schema.Count; col++)
             {
                 if (schema[col].IsHidden)
diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
index 5daea049..9929408c 100644
--- a/src/python/nimbusml.pyproj
+++ b/src/python/nimbusml.pyproj
@@ -100,6 +100,7 @@
+    <Compile Include="nimbusml\examples\examples_from_dataframe\WordTokenizer_df.py" />
@@ -299,6 +300,7 @@
+    <Compile Include="nimbusml\internal\core\preprocessing\text\wordtokenizer.py" />
@@ -441,6 +443,7 @@
+    <Compile Include="nimbusml\internal\entrypoints\transforms_wordtokenizer.py" />
@@ -637,6 +640,7 @@
+    <Compile Include="nimbusml\preprocessing\text\wordtokenizer.py" />
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
new file mode 100644
index 00000000..4a7fa2e2
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
@@ -0,0 +1,26 @@
+###############################################################################
+# WordTokenizer
+
+import pandas
+from nimbusml import Pipeline, Role
+from nimbusml.preprocessing.schema import ColumnConcatenator
+from nimbusml.preprocessing.text import WordTokenizer
+
+# create the data
+customer_reviews = pandas.DataFrame(data=dict(review=[
+    "I really did not like the taste of it",
+    "It was surprisingly quite good!",
+    "I will never ever ever go to that place again!!",
+    "The best ever!! It was amazingly good and super fast",
+    "I wish I had gone earlier, it was that great",
+    "somewhat dissapointing. I'd probably wont try again",
+    "Never visit again... rascals!"]))
+
+tokenize = WordTokenizer(char_array_term_separators=[" ", "n"]) << ['review']
+
+pipeline = Pipeline([tokenize])
+
+tokenize.fit(customer_reviews)
+y = tokenize.transform(customer_reviews)
+
+print(y)
diff --git a/src/python/nimbusml/internal/core/preprocessing/text/wordtokenizer.py b/src/python/nimbusml/internal/core/preprocessing/text/wordtokenizer.py
new file mode 100644
index 00000000..66e06176
--- /dev/null
+++ b/src/python/nimbusml/internal/core/preprocessing/text/wordtokenizer.py
@@ -0,0 +1,89 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+WordTokenizer
+"""
+
+__all__ = ["WordTokenizer"]
+
+
+from ....entrypoints.transforms_wordtokenizer import transforms_wordtokenizer
+from ....utils.utils import trace
+from ...base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class WordTokenizer(BasePipelineItem, DefaultSignature):
+    """
+    **Description**
+        The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed.
+
+    :param char_array_term_separators: Array of single character term
+        separator(s). By default uses space character separator.
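+        For example, ``char_array_term_separators=[" ", "n"]`` splits on
+        both the space character and the letter ``n``; empty tokens are
+        dropped (see the WordTokenizer_df.py example).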
+ + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + char_array_term_separators=None, + **params): + BasePipelineItem.__init__( + self, type='transform', **params) + + self.char_array_term_separators = char_array_term_separators + + @property + def _entrypoint(self): + return transforms_wordtokenizer + + @trace + def _get_node(self, **all_args): + + input_columns = self.input + if input_columns is None and 'input' in all_args: + input_columns = all_args['input'] + if 'input' in all_args: + all_args.pop('input') + + output_columns = self.output + if output_columns is None and 'output' in all_args: + output_columns = all_args['output'] + if 'output' in all_args: + all_args.pop('output') + + # validate input + if input_columns is None: + raise ValueError( + "'None' input passed when it cannot be none.") + + if not isinstance(input_columns, list): + raise ValueError( + "input has to be a list of strings, instead got %s" % + type(input_columns)) + + # validate output + if output_columns is None: + output_columns = input_columns + + if not isinstance(output_columns, list): + raise ValueError( + "output has to be a list of strings, instead got %s" % + type(output_columns)) + + algo_args = dict( + column=[ + dict( + Source=i, + Name=o) for i, + o in zip( + input_columns, + output_columns)] if input_columns else None, + char_array_term_separators=self.char_array_term_separators) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/entrypoints/transforms_wordtokenizer.py b/src/python/nimbusml/internal/entrypoints/transforms_wordtokenizer.py new file mode 100644 index 00000000..e7fac07a --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/transforms_wordtokenizer.py @@ -0,0 +1,76 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +Transforms.WordTokenizer +""" + + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def transforms_wordtokenizer( + data, + output_data=None, + model=None, + column=None, + char_array_term_separators=None, + **params): + """ + **Description** + The input to this transform is text, and the output is a vector of + text containing the words (tokens) in the original text. The + separator is space, but can be specified as any other + character (or multiple characters) if needed. + + :param column: New column definition(s) (inputs). + :param data: Input dataset (inputs). + :param char_array_term_separators: Array of single character term + separator(s). By default uses space character separator. + (inputs). + :param output_data: Transformed dataset (outputs). + :param model: Transform model (outputs). 
+    """
+
+    entrypoint_name = 'Transforms.WordTokenizer'
+    inputs = {}
+    outputs = {}
+
+    if column is not None:
+        inputs['Column'] = try_set(
+            obj=column,
+            none_acceptable=True,
+            is_of_type=list,
+            is_column=True)
+    if data is not None:
+        inputs['Data'] = try_set(
+            obj=data,
+            none_acceptable=False,
+            is_of_type=str)
+    if char_array_term_separators is not None:
+        inputs['CharArrayTermSeparators'] = try_set(
+            obj=char_array_term_separators,
+            none_acceptable=True,
+            is_of_type=list)
+    if output_data is not None:
+        outputs['OutputData'] = try_set(
+            obj=output_data,
+            none_acceptable=False,
+            is_of_type=str)
+    if model is not None:
+        outputs['Model'] = try_set(
+            obj=model,
+            none_acceptable=False,
+            is_of_type=str)
+
+    input_variables = {
+        x for x in unlist(inputs.values())
+        if isinstance(x, str) and x.startswith("$")}
+    output_variables = {
+        x for x in unlist(outputs.values())
+        if isinstance(x, str) and x.startswith("$")}
+
+    entrypoint = EntryPoint(
+        name=entrypoint_name, inputs=inputs, outputs=outputs,
+        input_variables=input_variables,
+        output_variables=output_variables)
+    return entrypoint
diff --git a/src/python/nimbusml/preprocessing/text/__init__.py b/src/python/nimbusml/preprocessing/text/__init__.py
index b255f350..686e7c5a 100644
--- a/src/python/nimbusml/preprocessing/text/__init__.py
+++ b/src/python/nimbusml/preprocessing/text/__init__.py
@@ -1,5 +1,7 @@
 from .chartokenizer import CharTokenizer
+from .wordtokenizer import WordTokenizer
 
 __all__ = [
-    'CharTokenizer'
+    'CharTokenizer',
+    'WordTokenizer'
 ]
diff --git a/src/python/nimbusml/preprocessing/text/wordtokenizer.py b/src/python/nimbusml/preprocessing/text/wordtokenizer.py
new file mode 100644
index 00000000..54a0ffe0
--- /dev/null
+++ b/src/python/nimbusml/preprocessing/text/wordtokenizer.py
@@ -0,0 +1,55 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+WordTokenizer
+"""
+
+__all__ = ["WordTokenizer"]
+
+
+from sklearn.base import TransformerMixin
+
+from ...base_transform import BaseTransform
+from ...internal.core.preprocessing.text.wordtokenizer import \
+    WordTokenizer as core
+from ...internal.utils.utils import trace
+
+
+class WordTokenizer(core, BaseTransform, TransformerMixin):
+    """
+    **Description**
+        The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed.
+
+    :param columns: see `Columns </nimbusml/concepts/columns>`_.
+
+    :param char_array_term_separators: Array of single character term
+        separator(s). By default uses space character separator.
+
+    :param params: Additional arguments sent to compute engine.
+
+    """
+
+    @trace
+    def __init__(
+            self,
+            char_array_term_separators=None,
+            columns=None,
+            **params):
+
+        if columns:
+            params['columns'] = columns
+        BaseTransform.__init__(self, **params)
+        core.__init__(
+            self,
+            char_array_term_separators=char_array_term_separators,
+            **params)
+        self._columns = columns
+
+    def get_params(self, deep=False):
+        """
+        Get the parameters for this operator.
+        """
+        return core.get_params(self)
diff --git a/src/python/tools/entrypoint_compiler.py b/src/python/tools/entrypoint_compiler.py
index b2765691..ed829533 100644
--- a/src/python/tools/entrypoint_compiler.py
+++ b/src/python/tools/entrypoint_compiler.py
@@ -1457,7 +1457,7 @@ def parse_arg(argument, inout):
             assert not is_column
             arg_obj = NumericArrayArg(argument, inout)
         elif itemType in ["String", "DataView", "PredictorModel",
-                          "TransformModel", "Node"]:
+                          "TransformModel", "Node", "Char"]:
             arg_obj = StringArrayArg(argument, inout,
                                      is_column=is_column)
         elif isinstance(itemType, dict):
diff --git a/src/python/tools/manifest_diff.json b/src/python/tools/manifest_diff.json
index d8a64d82..89e6cf7f 100644
--- a/src/python/tools/manifest_diff.json
+++ b/src/python/tools/manifest_diff.json
@@ -720,6 +720,12 @@
       "Module": "preprocessing.text",
       "Type": "Transform"
     },
+    {
+      "Name": "Transforms.WordTokenizer",
+      "NewName": "WordTokenizer",
+      "Module": "preprocessing.text",
+      "Type": "Transform"
+    },
     {
       "Name": "Transforms.LightLda",
       "NewName": "LightLda",

From a0e364092117ee878284d3974ba53cb896ff36cd Mon Sep 17 00:00:00 2001
From: "pieths.dev@gmail.com" <pieths.dev@gmail.com>
Date: Thu, 3 Oct 2019 10:12:59 -0700
Subject: [PATCH 2/8] Remove debug code that accidentally made it through the
 merge.

---
 src/DotNetBridge/NativeDataInterop.cs | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/DotNetBridge/NativeDataInterop.cs b/src/DotNetBridge/NativeDataInterop.cs
index d1e43814..b7f1a762 100644
--- a/src/DotNetBridge/NativeDataInterop.cs
+++ b/src/DotNetBridge/NativeDataInterop.cs
@@ -122,9 +122,6 @@ private static unsafe void SendViewToNativeAsDataFrame(IChannel ch, EnvironmentB
             var expandCols = new HashSet<int>(1000);
             var valueCounts = new List<int>(1000);
 
-            // Only works with WordTokenizer_df.py
-            PrintDataView(view);
-
             for (int col = 0; col < schema.Count; col++)
             {
                 if (schema[col].IsHidden)

From 414fb4b391fbb900f6eac71c90d5d25975ddb92a Mon Sep 17 00:00:00 2001
From: "pieths.dev@gmail.com" <pieths.dev@gmail.com>
Date: Thu, 3 Oct 2019 10:30:09 -0700
Subject: [PATCH 3/8] Update the WordTokenizer_df example.

---
 .../examples_from_dataframe/WordTokenizer_df.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
index 4a7fa2e2..8c0e2362 100644
--- a/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
+++ b/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
@@ -16,7 +16,7 @@
     "somewhat dissapointing. I'd probably wont try again",
     "Never visit again... rascals!"]))
 
-tokenize = WordTokenizer(char_array_term_separators=[" ", "n"]) << ['review']
+tokenize = WordTokenizer(char_array_term_separators=[" ", "n"]) << 'review'
 
 pipeline = Pipeline([tokenize])
 
@@ -24,3 +24,11 @@
 y = tokenize.transform(customer_reviews)
 
 print(y)
+#   review.00 review.01 review.02 review.03 review.04 review.05 review.06 review.07 review.08 review.09 review.10 review.11
+# 0         I    really       did        ot      like       the     taste        of        it      None      None      None
+# 1        It       was  surprisi       gly     quite     good!      None      None      None      None      None      None
+# 2         I      will      ever      ever      ever        go        to      that     place      agai        !!      None
+# 3       The      best    ever!!        It       was     amazi       gly      good         a         d     super      fast
+# 4         I      wish         I       had        go         e  earlier,        it       was      that     great      None
+# 5  somewhat  dissapoi        ti        g.       I'd  probably        wo         t       try      agai      None      None
+# 6     Never     visit      agai       ...  rascals!      None      None      None      None      None      None      None
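
A quick way to sanity-check the expected tokens above (a plain-Python sketch
only; the transform itself runs ML.NET's word tokenizer, not this regex):
splitting on the single-character separators " " and "n" and dropping empty
tokens reproduces each row, e.g.

    import re
    # Split on either separator; filter out the empty strings that adjacent
    # separators (e.g. the space before the "n" in " not ") leave behind.
    tokens = [t for t in re.split('[ n]', "It was surprisingly quite good!") if t]
    print(tokens)  # ['It', 'was', 'surprisi', 'gly', 'quite', 'good!']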
From cc99e17e855162c907089e3f9be799fc5cf79d09 Mon Sep 17 00:00:00 2001
From: "pieths.dev@gmail.com" <pieths.dev@gmail.com>
Date: Thu, 3 Oct 2019 10:45:37 -0700
Subject: [PATCH 4/8] Remove unnecessary import from WordTokenizer_df.

---
 .../examples/examples_from_dataframe/WordTokenizer_df.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
index 8c0e2362..31980567 100644
--- a/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
+++ b/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
@@ -2,8 +2,7 @@
 # WordTokenizer
 
 import pandas
-from nimbusml import Pipeline, Role
-from nimbusml.preprocessing.schema import ColumnConcatenator
+from nimbusml import Pipeline
 from nimbusml.preprocessing.text import WordTokenizer
 
 # create the data

From 68c739e0bed7b22c03989136a53058f7ad0540df Mon Sep 17 00:00:00 2001
From: "pieths.dev@gmail.com" <pieths.dev@gmail.com>
Date: Thu, 3 Oct 2019 10:47:28 -0700
Subject: [PATCH 5/8] Add WordTokenizer example.

---
 src/python/nimbusml.pyproj                    |  1 +
 src/python/nimbusml/examples/WordTokenizer.py | 32 +++++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 src/python/nimbusml/examples/WordTokenizer.py

diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
index dbb24f72..1f9cab3a 100644
--- a/src/python/nimbusml.pyproj
+++ b/src/python/nimbusml.pyproj
@@ -92,6 +92,7 @@
+    <Compile Include="nimbusml\examples\WordTokenizer.py" />
diff --git a/src/python/nimbusml/examples/WordTokenizer.py b/src/python/nimbusml/examples/WordTokenizer.py
new file mode 100644
index 00000000..0f4cc76a
--- /dev/null
+++ b/src/python/nimbusml/examples/WordTokenizer.py
@@ -0,0 +1,32 @@
+###############################################################################
+# WordTokenizer
+
+from nimbusml import Pipeline, FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.preprocessing.text import WordTokenizer
+
+# data input (as a FileDataStream)
+path = get_dataset("wiki_detox_train").as_filepath()
+
+data = FileDataStream.read_csv(path, sep='\t')
+print(data.head())
+#    Sentiment                                      SentimentText
+# 0          1  ==RUDE== Dude, you are rude upload that carl p...
+# 1          1  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
+# 2          1  Stop trolling, zapatancas, calling me a liar m...
+# 3          1  ==You're cool==  You seem like a really cool g...
+# 4          1  ::::: Why are you threatening me? I'm not bein...
+
+tokenize = WordTokenizer(char_array_term_separators=[" "]) << {'wt': 'SentimentText'}
+pipeline = Pipeline([tokenize])
+
+tokenize.fit(data)
+y = tokenize.transform(data)
+
+print(y.drop(labels='SentimentText', axis=1).head())
+#    Sentiment    wt.000     wt.001       wt.002   wt.003       wt.004  wt.005  ... wt.366 wt.367 wt.368 wt.369 wt.370 wt.371 wt.372
+# 0          1  ==RUDE==      Dude,          you      are         rude  upload  ...   None   None   None   None   None   None   None
+# 1          1        ==        OK!           ==       IM        GOING      TO  ...   None   None   None   None   None   None   None
+# 2          1      Stop  trolling,  zapatancas,  calling           me       a  ...   None   None   None   None   None   None   None
+# 3          1  ==You're     cool==          You     seem         like       a  ...   None   None   None   None   None   None   None
+# 4          1     :::::        Why          are      you  threatening     me?  ...   None   None   None   None   None   None   None

From 1169fe055824f31710def283924489cfdd38ffd8 Mon Sep 17 00:00:00 2001
From: "pieths.dev@gmail.com" <pieths.dev@gmail.com>
Date: Thu, 3 Oct 2019 11:03:22 -0700
Subject: [PATCH 6/8] Add initial unit test for WordTokenizer.
---
 src/python/nimbusml.pyproj                    |  1 +
 .../preprocessing/text/test_wordtokenizer.py  | 33 +++++++++++++++++++
 2 files changed, 34 insertions(+)
 create mode 100644 src/python/nimbusml/tests/preprocessing/text/test_wordtokenizer.py

diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
index 1f9cab3a..f21bc0c3 100644
--- a/src/python/nimbusml.pyproj
+++ b/src/python/nimbusml.pyproj
@@ -690,6 +690,7 @@
+    <Compile Include="nimbusml\tests\preprocessing\text\test_wordtokenizer.py" />
diff --git a/src/python/nimbusml/tests/preprocessing/text/test_wordtokenizer.py b/src/python/nimbusml/tests/preprocessing/text/test_wordtokenizer.py
new file mode 100644
index 00000000..a8c66016
--- /dev/null
+++ b/src/python/nimbusml/tests/preprocessing/text/test_wordtokenizer.py
@@ -0,0 +1,33 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+import unittest
+
+import pandas
+from nimbusml import Pipeline
+from nimbusml.preprocessing.text import WordTokenizer
+
+
+class TestWordTokenizer(unittest.TestCase):
+
+    def test_wordtokenizer(self):
+        customer_reviews = pandas.DataFrame(data=dict(review=[
+            "I really did not like the taste of it",
+            "It was surprisingly quite good!"]))
+
+        tokenize = WordTokenizer(char_array_term_separators=[" ", "n"]) << 'review'
+        pipeline = Pipeline([tokenize])
+
+        tokenize.fit(customer_reviews)
+        y = tokenize.transform(customer_reviews)
+
+        self.assertEqual(y.shape, (2, 9))
+
+        self.assertEqual(y.loc[0, 'review.3'], 'ot')
+        self.assertEqual(y.loc[1, 'review.3'], 'gly')
+        self.assertEqual(y.loc[1, 'review.6'], None)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 954e8b6a05c0861964baad1e4065b89f8b421594 Mon Sep 17 00:00:00 2001
From: "pieths.dev@gmail.com" <pieths.dev@gmail.com>
Date: Thu, 3 Oct 2019 11:15:22 -0700
Subject: [PATCH 7/8] Excluded WordTokenizer from most tests in
 test_estimator_checks.

---
 src/python/tests/test_estimator_checks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/tests/test_estimator_checks.py b/src/python/tests/test_estimator_checks.py
index eb39246b..e47ce638 100644
--- a/src/python/tests/test_estimator_checks.py
+++ b/src/python/tests/test_estimator_checks.py
@@ -160,7 +160,7 @@
         'PixelExtractor, Loader, Resizer, \
GlobalContrastRowScaler, PcaTransformer, '
         'ColumnConcatenator, Sentiment, CharTokenizer, LightLda, '
-        'NGramFeaturizer, WordEmbedding, LpScaler',
+        'NGramFeaturizer, WordEmbedding, LpScaler, WordTokenizer',
         'check_transformer_data_not_an_array, check_pipeline_consistency, '
         'check_fit2d_1feature, check_estimators_fit_returns_self,\
check_fit2d_1sample, '

From 10a384d3847b19d8baec7f4a2de3b211046f611d Mon Sep 17 00:00:00 2001
From: "pieths.dev@gmail.com" <pieths.dev@gmail.com>
Date: Thu, 3 Oct 2019 14:02:15 -0700
Subject: [PATCH 8/8] Whitespace change to restart ci run. Mac run lost
 communication.

---
 src/python/nimbusml/examples/WordTokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/nimbusml/examples/WordTokenizer.py b/src/python/nimbusml/examples/WordTokenizer.py
index 0f4cc76a..028d5d7e 100644
--- a/src/python/nimbusml/examples/WordTokenizer.py
+++ b/src/python/nimbusml/examples/WordTokenizer.py
@@ -1,5 +1,5 @@
 ###############################################################################
-# WordTokenizer
+# WordTokenizer 
 
 from nimbusml import Pipeline, FileDataStream
 from nimbusml.datasets import get_dataset
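
A closing note on the column wiring these patches add (a minimal sketch, not
part of the patch series): WordTokenizer._get_node in PATCH 1/8 pairs each
input column with an output column as dict(Source=i, Name=o) entries,
defaulting the outputs to the inputs when none are given, before handing the
list to the Transforms.WordTokenizer entrypoint. In isolation, with the
'review' column used throughout the examples:

    input_columns = ['review']
    output_columns = input_columns  # _get_node falls back to the inputs
    column = [dict(Source=i, Name=o)
              for i, o in zip(input_columns, output_columns)]
    print(column)  # [{'Source': 'review', 'Name': 'review'}]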