diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
index eeedbdad..f21bc0c3 100644
--- a/src/python/nimbusml.pyproj
+++ b/src/python/nimbusml.pyproj
@@ -92,6 +92,7 @@
+
@@ -102,6 +103,7 @@
+
@@ -306,6 +308,7 @@
+
@@ -451,6 +454,7 @@
+
@@ -649,6 +653,7 @@
+
@@ -685,6 +690,7 @@
+
diff --git a/src/python/nimbusml/examples/WordTokenizer.py b/src/python/nimbusml/examples/WordTokenizer.py
new file mode 100644
index 00000000..028d5d7e
--- /dev/null
+++ b/src/python/nimbusml/examples/WordTokenizer.py
@@ -0,0 +1,32 @@
+###############################################################################
+# WordTokenizer
+
+from nimbusml import Pipeline, FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.preprocessing.text import WordTokenizer
+
+# data input (as a FileDataStream)
+path = get_dataset("wiki_detox_train").as_filepath()
+
+data = FileDataStream.read_csv(path, sep='\t')
+print(data.head())
+# Sentiment SentimentText
+# 0 1 ==RUDE== Dude, you are rude upload that carl p...
+# 1 1 == OK! == IM GOING TO VANDALIZE WILD ONES WIK...
+# 2 1 Stop trolling, zapatancas, calling me a liar m...
+# 3 1 ==You're cool== You seem like a really cool g...
+# 4 1 ::::: Why are you threatening me? I'm not bein...
+
+tokenize = WordTokenizer(char_array_term_separators=[" "]) << {'wt': 'SentimentText'}
+pipeline = Pipeline([tokenize])
+
+tokenize.fit(data)
+y = tokenize.transform(data)
+
+print(y.drop(labels='SentimentText', axis=1).head())
+# Sentiment wt.000 wt.001 wt.002 wt.003 wt.004 wt.005 ... wt.366 wt.367 wt.368 wt.369 wt.370 wt.371 wt.372
+# 0 1 ==RUDE== Dude, you are rude upload ... None None None None None None None
+# 1 1 == OK! == IM GOING TO ... None None None None None None None
+# 2 1 Stop trolling, zapatancas, calling me a ... None None None None None None None
+# 3 1 ==You're cool== You seem like a ... None None None None None None None
+# 4 1 ::::: Why are you threatening me? ... None None None None None None None
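The example above constructs a `Pipeline` but fits and applies the bare transform; running the same data through the pipeline object gives the same result. A minimal sketch (assuming the `pipeline` and `data` objects from the example):

```python
# Fit and apply via the Pipeline wrapper instead of the bare transform.
pipeline.fit(data)
y_pipe = pipeline.transform(data)
print(y_pipe.drop(labels='SentimentText', axis=1).head())
```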
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
new file mode 100644
index 00000000..31980567
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
@@ -0,0 +1,33 @@
+###############################################################################
+# WordTokenizer
+
+import pandas
+from nimbusml import Pipeline
+from nimbusml.preprocessing.text import WordTokenizer
+
+# create the data
+customer_reviews = pandas.DataFrame(data=dict(review=[
+ "I really did not like the taste of it",
+ "It was surprisingly quite good!",
+ "I will never ever ever go to that place again!!",
+ "The best ever!! It was amazingly good and super fast",
+ "I wish I had gone earlier, it was that great",
+ "somewhat dissapointing. I'd probably wont try again",
+ "Never visit again... rascals!"]))
+
+tokenize = WordTokenizer(char_array_term_separators=[" ", "n"]) << 'review'
+
+pipeline = Pipeline([tokenize])
+
+tokenize.fit(customer_reviews)
+y = tokenize.transform(customer_reviews)
+
+print(y)
+# review.00 review.01 review.02 review.03 review.04 review.05 review.06 review.07 review.08 review.09 review.10 review.11
+# 0 I really did ot like the taste of it None None None
+# 1 It was surprisi gly quite good! None None None None None None
+# 2 I will ever ever ever go to that place agai !! None
+# 3 The best ever!! It was amazi gly good a d super fast
+# 4 I wish I had go e earlier, it was that great None
+# 5 somewhat dissapoi ti g. I'd probably wo t try agai None None
+# 6 Never visit agai ... rascals! None None None None None None None
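To see why every `n` disappears from the output above, here is a plain-Python sketch of the splitting rule that `char_array_term_separators=[" ", "n"]` applies: each listed character acts as an independent single-character separator, and empty tokens are dropped. The `split_on_chars` helper is hypothetical, for illustration only:

```python
import re

def split_on_chars(text, separators):
    # Build a character class from the separator list and drop empty
    # tokens, mirroring the behavior shown in the example output above.
    pattern = "[" + re.escape("".join(separators)) + "]"
    return [tok for tok in re.split(pattern, text) if tok]

print(split_on_chars("It was surprisingly quite good!", [" ", "n"]))
# ['It', 'was', 'surprisi', 'gly', 'quite', 'good!']
```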
diff --git a/src/python/nimbusml/internal/core/preprocessing/text/wordtokenizer.py b/src/python/nimbusml/internal/core/preprocessing/text/wordtokenizer.py
new file mode 100644
index 00000000..66e06176
--- /dev/null
+++ b/src/python/nimbusml/internal/core/preprocessing/text/wordtokenizer.py
@@ -0,0 +1,89 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+WordTokenizer
+"""
+
+__all__ = ["WordTokenizer"]
+
+
+from ....entrypoints.transforms_wordtokenizer import transforms_wordtokenizer
+from ....utils.utils import trace
+from ...base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class WordTokenizer(BasePipelineItem, DefaultSignature):
+ """
+ **Description**
+        The input to this transform is text, and the output is a vector
+        of text containing the words (tokens) in the original text. The
+        separator is space, but can be specified as any other character
+        (or multiple characters) if needed.
+
+ :param char_array_term_separators: Array of single character term
+ separator(s). By default uses space character separator.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ char_array_term_separators=None,
+ **params):
+ BasePipelineItem.__init__(
+ self, type='transform', **params)
+
+ self.char_array_term_separators = char_array_term_separators
+
+ @property
+ def _entrypoint(self):
+ return transforms_wordtokenizer
+
+ @trace
+ def _get_node(self, **all_args):
+
+ input_columns = self.input
+ if input_columns is None and 'input' in all_args:
+ input_columns = all_args['input']
+ if 'input' in all_args:
+ all_args.pop('input')
+
+ output_columns = self.output
+ if output_columns is None and 'output' in all_args:
+ output_columns = all_args['output']
+ if 'output' in all_args:
+ all_args.pop('output')
+
+ # validate input
+ if input_columns is None:
+ raise ValueError(
+ "'None' input passed when it cannot be none.")
+
+ if not isinstance(input_columns, list):
+ raise ValueError(
+ "input has to be a list of strings, instead got %s" %
+ type(input_columns))
+
+ # validate output
+ if output_columns is None:
+ output_columns = input_columns
+
+ if not isinstance(output_columns, list):
+ raise ValueError(
+ "output has to be a list of strings, instead got %s" %
+ type(output_columns))
+
+        algo_args = dict(
+            column=[
+                dict(Source=i, Name=o)
+                for i, o in zip(input_columns, output_columns)
+            ] if input_columns else None,
+            char_array_term_separators=self.char_array_term_separators)
+
+ all_args.update(algo_args)
+ return self._entrypoint(**all_args)
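For reference, the `column` argument assembled in `_get_node` pairs each input column with its output name via `Source`/`Name` dictionaries. A standalone sketch of the mapping produced for the first example's `{'wt': 'SentimentText'}`:

```python
input_columns = ['SentimentText']
output_columns = ['wt']

# Same zip-based pairing as in _get_node above.
column = [dict(Source=i, Name=o)
          for i, o in zip(input_columns, output_columns)]
print(column)  # [{'Source': 'SentimentText', 'Name': 'wt'}]
```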
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_wordtokenizer.py b/src/python/nimbusml/internal/entrypoints/transforms_wordtokenizer.py
new file mode 100644
index 00000000..e7fac07a
--- /dev/null
+++ b/src/python/nimbusml/internal/entrypoints/transforms_wordtokenizer.py
@@ -0,0 +1,76 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+Transforms.WordTokenizer
+"""
+
+
+from ..utils.entrypoints import EntryPoint
+from ..utils.utils import try_set, unlist
+
+
+def transforms_wordtokenizer(
+ data,
+ output_data=None,
+ model=None,
+ column=None,
+ char_array_term_separators=None,
+ **params):
+ """
+ **Description**
+ The input to this transform is text, and the output is a vector of
+ text containing the words (tokens) in the original text. The
+ separator is space, but can be specified as any other
+ character (or multiple characters) if needed.
+
+ :param column: New column definition(s) (inputs).
+ :param data: Input dataset (inputs).
+ :param char_array_term_separators: Array of single character term
+ separator(s). By default uses space character separator.
+ (inputs).
+ :param output_data: Transformed dataset (outputs).
+ :param model: Transform model (outputs).
+ """
+
+ entrypoint_name = 'Transforms.WordTokenizer'
+ inputs = {}
+ outputs = {}
+
+ if column is not None:
+ inputs['Column'] = try_set(
+ obj=column,
+ none_acceptable=True,
+ is_of_type=list,
+ is_column=True)
+ if data is not None:
+ inputs['Data'] = try_set(
+ obj=data,
+ none_acceptable=False,
+ is_of_type=str)
+ if char_array_term_separators is not None:
+ inputs['CharArrayTermSeparators'] = try_set(
+ obj=char_array_term_separators,
+ none_acceptable=True,
+ is_of_type=list)
+ if output_data is not None:
+ outputs['OutputData'] = try_set(
+ obj=output_data,
+ none_acceptable=False,
+ is_of_type=str)
+ if model is not None:
+ outputs['Model'] = try_set(
+ obj=model,
+ none_acceptable=False,
+ is_of_type=str)
+
+ input_variables = {
+ x for x in unlist(inputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+ output_variables = {
+ x for x in unlist(outputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+
+ entrypoint = EntryPoint(
+ name=entrypoint_name, inputs=inputs, outputs=outputs,
+ input_variables=input_variables,
+ output_variables=output_variables)
+ return entrypoint
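The `input_variables`/`output_variables` sets at the end collect graph-variable references, i.e. string values that start with `$`. A self-contained sketch of that filtering with illustrative values (`unlist` here is a simplified stand-in for the nimbusml helper):

```python
def unlist(values):
    # Simplified stand-in for nimbusml's unlist: flatten one list level.
    flat = []
    for v in values:
        if isinstance(v, list):
            flat.extend(v)
        else:
            flat.append(v)
    return flat

inputs = {'Data': '$data', 'CharArrayTermSeparators': [' ', 'n']}
input_variables = {x for x in unlist(inputs.values())
                   if isinstance(x, str) and x.startswith('$')}
print(input_variables)  # {'$data'}
```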
diff --git a/src/python/nimbusml/preprocessing/text/__init__.py b/src/python/nimbusml/preprocessing/text/__init__.py
index b255f350..686e7c5a 100644
--- a/src/python/nimbusml/preprocessing/text/__init__.py
+++ b/src/python/nimbusml/preprocessing/text/__init__.py
@@ -1,5 +1,7 @@
from .chartokenizer import CharTokenizer
+from .wordtokenizer import WordTokenizer
__all__ = [
- 'CharTokenizer'
+ 'CharTokenizer',
+ 'WordTokenizer'
]
diff --git a/src/python/nimbusml/preprocessing/text/wordtokenizer.py b/src/python/nimbusml/preprocessing/text/wordtokenizer.py
new file mode 100644
index 00000000..54a0ffe0
--- /dev/null
+++ b/src/python/nimbusml/preprocessing/text/wordtokenizer.py
@@ -0,0 +1,55 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+WordTokenizer
+"""
+
+__all__ = ["WordTokenizer"]
+
+
+from sklearn.base import TransformerMixin
+
+from ...base_transform import BaseTransform
+from ...internal.core.preprocessing.text.wordtokenizer import \
+ WordTokenizer as core
+from ...internal.utils.utils import trace
+
+
+class WordTokenizer(core, BaseTransform, TransformerMixin):
+ """
+ **Description**
+        The input to this transform is text, and the output is a vector
+        of text containing the words (tokens) in the original text. The
+        separator is space, but can be specified as any other character
+        (or multiple characters) if needed.
+
+    :param columns: see `Columns </nimbusml/concepts/columns>`_.
+
+ :param char_array_term_separators: Array of single character term
+ separator(s). By default uses space character separator.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ char_array_term_separators=None,
+ columns=None,
+ **params):
+
+ if columns:
+ params['columns'] = columns
+ BaseTransform.__init__(self, **params)
+ core.__init__(
+ self,
+ char_array_term_separators=char_array_term_separators,
+ **params)
+ self._columns = columns
+
+ def get_params(self, deep=False):
+ """
+ Get the parameters for this operator.
+ """
+ return core.get_params(self)
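The `columns` argument is an alternative spelling of the `<<` operator used in the examples; both set the same column mapping. A sketch of the two equivalent forms (following the standard nimbusml column-selection convention):

```python
from nimbusml.preprocessing.text import WordTokenizer

# Column mapping passed through the constructor...
t1 = WordTokenizer(char_array_term_separators=[" "],
                   columns={'wt': 'SentimentText'})

# ...or applied afterwards with the << operator, as in the examples.
t2 = WordTokenizer(char_array_term_separators=[" "]) << {'wt': 'SentimentText'}
```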
diff --git a/src/python/nimbusml/tests/preprocessing/text/test_wordtokenizer.py b/src/python/nimbusml/tests/preprocessing/text/test_wordtokenizer.py
new file mode 100644
index 00000000..a8c66016
--- /dev/null
+++ b/src/python/nimbusml/tests/preprocessing/text/test_wordtokenizer.py
@@ -0,0 +1,33 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+import unittest
+
+import pandas
+from nimbusml import Pipeline
+from nimbusml.preprocessing.text import WordTokenizer
+
+
+class TestWordTokenizer(unittest.TestCase):
+
+ def test_wordtokenizer(self):
+ customer_reviews = pandas.DataFrame(data=dict(review=[
+ "I really did not like the taste of it",
+ "It was surprisingly quite good!"]))
+
+ tokenize = WordTokenizer(char_array_term_separators=[" ", "n"]) << 'review'
+ pipeline = Pipeline([tokenize])
+
+ tokenize.fit(customer_reviews)
+ y = tokenize.transform(customer_reviews)
+
+ self.assertEqual(y.shape, (2, 9))
+
+ self.assertEqual(y.loc[0, 'review.3'], 'ot')
+ self.assertEqual(y.loc[1, 'review.3'], 'gly')
+ self.assertEqual(y.loc[1, 'review.6'], None)
+
+
+if __name__ == '__main__':
+ unittest.main()
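The expected values in the assertions can be checked by hand with the same splitting rule (a plain-Python sketch, not part of the test):

```python
import re

# Split on space and 'n', dropping empty tokens.
tokens = [t for t in re.split("[ n]",
                              "I really did not like the taste of it") if t]
print(tokens[3])    # 'ot' -> matches y.loc[0, 'review.3']
print(len(tokens))  # 9    -> matches the (2, 9) shape
```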
diff --git a/src/python/tests/test_estimator_checks.py b/src/python/tests/test_estimator_checks.py
index eb39246b..e47ce638 100644
--- a/src/python/tests/test_estimator_checks.py
+++ b/src/python/tests/test_estimator_checks.py
@@ -160,7 +160,7 @@
'PixelExtractor, Loader, Resizer, \
GlobalContrastRowScaler, PcaTransformer, '
'ColumnConcatenator, Sentiment, CharTokenizer, LightLda, '
- 'NGramFeaturizer, WordEmbedding, LpScaler',
+ 'NGramFeaturizer, WordEmbedding, LpScaler, WordTokenizer',
'check_transformer_data_not_an_array, check_pipeline_consistency, '
'check_fit2d_1feature, check_estimators_fit_returns_self,\
check_fit2d_1sample, '
diff --git a/src/python/tools/entrypoint_compiler.py b/src/python/tools/entrypoint_compiler.py
index b2765691..ed829533 100644
--- a/src/python/tools/entrypoint_compiler.py
+++ b/src/python/tools/entrypoint_compiler.py
@@ -1457,7 +1457,7 @@ def parse_arg(argument, inout):
assert not is_column
arg_obj = NumericArrayArg(argument, inout)
elif itemType in ["String", "DataView", "PredictorModel",
- "TransformModel", "Node"]:
+ "TransformModel", "Node", "Char"]:
arg_obj = StringArrayArg(argument, inout,
is_column=is_column)
elif isinstance(itemType, dict):
diff --git a/src/python/tools/manifest_diff.json b/src/python/tools/manifest_diff.json
index 817dca50..cddfaf25 100644
--- a/src/python/tools/manifest_diff.json
+++ b/src/python/tools/manifest_diff.json
@@ -733,6 +733,12 @@
"Module": "preprocessing.text",
"Type": "Transform"
},
+ {
+ "Name": "Transforms.WordTokenizer",
+ "NewName": "WordTokenizer",
+ "Module": "preprocessing.text",
+ "Type": "Transform"
+ },
{
"Name": "Transforms.LightLda",
"NewName": "LightLda",