From 9f66e3bb0c5f9b917c1b31fd2aee3737ce28660c Mon Sep 17 00:00:00 2001
From: "pieths.dev@gmail.com" <pieths.dev@gmail.com>
Date: Fri, 30 Aug 2019 13:10:47 -0700
Subject: [PATCH 1/8] Initial test of WordTokenizer.

---
 src/DotNetBridge/NativeDataInterop.cs         | 44 ++++++++++
 src/python/nimbusml.pyproj                    |  4 +
 .../WordTokenizer_df.py                       | 26 ++++++
 .../core/preprocessing/text/wordtokenizer.py  | 89 +++++++++++++++++++
 .../entrypoints/transforms_wordtokenizer.py   | 76 ++++++++++++++++
 .../nimbusml/preprocessing/text/__init__.py   |  4 +-
 .../preprocessing/text/wordtokenizer.py       | 55 ++++++++++++
 src/python/tools/entrypoint_compiler.py       |  2 +-
 src/python/tools/manifest_diff.json           |  6 ++
 9 files changed, 304 insertions(+), 2 deletions(-)
 create mode 100644 src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
 create mode 100644 src/python/nimbusml/internal/core/preprocessing/text/wordtokenizer.py
 create mode 100644 src/python/nimbusml/internal/entrypoints/transforms_wordtokenizer.py
 create mode 100644 src/python/nimbusml/preprocessing/text/wordtokenizer.py

diff --git a/src/DotNetBridge/NativeDataInterop.cs b/src/DotNetBridge/NativeDataInterop.cs
index c9b70526..9f67595f 100644
--- a/src/DotNetBridge/NativeDataInterop.cs
+++ b/src/DotNetBridge/NativeDataInterop.cs
@@ -93,6 +93,47 @@ public ColumnMetadataInfo(bool expand, string[] slotNames, Dictionary<uint, ReadOnlyMemory<char>> keyValues)
+        private static void PrintDataView(IDataView view)
+        {
+            var schema = view.Schema;
+            var colIndices = new List<int>();
+
+            for (int col = 0; col < schema.Count; col++)
+            {
+                if (schema[col].IsHidden)
+                    continue;
+
+                colIndices.Add(col);
+            }
+
+            using (var cursor = view.GetRowCursor(view.Schema.Where(col => colIndices.Contains(col.Index))))
+            {
+                var type = schema[colIndices[0]].Type;
+
+                ValueGetter<VBuffer<ReadOnlyMemory<char>>> _getVec =
+                    RowCursorUtils.GetVecGetterAs<ReadOnlyMemory<char>>((PrimitiveDataViewType)type.GetItemType(), cursor, colIndices[0]);
+                VBuffer<ReadOnlyMemory<char>> _buffer = new VBuffer<ReadOnlyMemory<char>>();
+
+                for (int crow = 0; ; crow++)
+                {
+                    // Advance to the next row.
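+                    // MoveNext returns false once the cursor has passed the last row, ending the loop.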
+                    if (!cursor.MoveNext())
+                        break;
+
+                    string rowOutput = "[";
+
+                    _getVec(ref _buffer);
+                    for (int i = 0; i < _buffer.Length; i++)
+                    {
+                        rowOutput += _buffer.GetValues()[i] + ", ";
+                    }
+                    rowOutput += "]";
+
+                    System.Console.WriteLine(rowOutput);
+                }
+            }
+        }
+
         private static unsafe void SendViewToNative(IChannel ch, EnvironmentBlock* penv, IDataView view,
             Dictionary<string, ColumnMetadataInfo> infos = null)
         {
             Contracts.AssertValue(ch);
@@ -117,6 +158,9 @@ private static unsafe void SendViewToNative(IChannel ch, EnvironmentBlock* penv,
             var expandCols = new HashSet<int>();
             var allNames = new HashSet<string>();
 
+            // Only works with WordTokenizer_df.py
+            PrintDataView(view);
+
             for (int col = 0; col < schema.Count; col++)
             {
                 if (schema[col].IsHidden)
diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
index 5daea049..9929408c 100644
--- a/src/python/nimbusml.pyproj
+++ b/src/python/nimbusml.pyproj
@@ -100,6 +100,7 @@
+    <Compile Include="nimbusml\examples\examples_from_dataframe\WordTokenizer_df.py" />
@@ -299,6 +300,7 @@
+    <Compile Include="nimbusml\internal\core\preprocessing\text\wordtokenizer.py" />
@@ -441,6 +443,7 @@
+    <Compile Include="nimbusml\internal\entrypoints\transforms_wordtokenizer.py" />
@@ -637,6 +640,7 @@
+    <Compile Include="nimbusml\preprocessing\text\wordtokenizer.py" />
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
new file mode 100644
index 00000000..4a7fa2e2
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
@@ -0,0 +1,26 @@
+###############################################################################
+# WordTokenizer
+
+import pandas
+from nimbusml import Pipeline, Role
+from nimbusml.preprocessing.schema import ColumnConcatenator
+from nimbusml.preprocessing.text import WordTokenizer
+
+# create the data
+customer_reviews = pandas.DataFrame(data=dict(review=[
+    "I really did not like the taste of it",
+    "It was surprisingly quite good!",
+    "I will never ever ever go to that place again!!",
+    "The best ever!! It was amazingly good and super fast",
+    "I wish I had gone earlier, it was that great",
+    "somewhat dissapointing. I'd probably wont try again",
+    "Never visit again... rascals!"]))
+
+tokenize = WordTokenizer(char_array_term_separators=[" ", "n"]) << ['review']
+
+pipeline = Pipeline([tokenize])
+
+tokenize.fit(customer_reviews)
+y = tokenize.transform(customer_reviews)
+
+print(y)
diff --git a/src/python/nimbusml/internal/core/preprocessing/text/wordtokenizer.py b/src/python/nimbusml/internal/core/preprocessing/text/wordtokenizer.py
new file mode 100644
index 00000000..66e06176
--- /dev/null
+++ b/src/python/nimbusml/internal/core/preprocessing/text/wordtokenizer.py
@@ -0,0 +1,89 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+WordTokenizer
+"""
+
+__all__ = ["WordTokenizer"]
+
+
+from ....entrypoints.transforms_wordtokenizer import transforms_wordtokenizer
+from ....utils.utils import trace
+from ...base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class WordTokenizer(BasePipelineItem, DefaultSignature):
+    """
+    **Description**
+        The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed.
+
+    :param char_array_term_separators: Array of single character term
+        separator(s). By default uses space character separator.
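+        For example, ``char_array_term_separators=[" ", "n"]`` splits on
+        both the space character and the letter ``n``; empty tokens are
+        dropped (see the WordTokenizer_df.py example).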
+ + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + char_array_term_separators=None, + **params): + BasePipelineItem.__init__( + self, type='transform', **params) + + self.char_array_term_separators = char_array_term_separators + + @property + def _entrypoint(self): + return transforms_wordtokenizer + + @trace + def _get_node(self, **all_args): + + input_columns = self.input + if input_columns is None and 'input' in all_args: + input_columns = all_args['input'] + if 'input' in all_args: + all_args.pop('input') + + output_columns = self.output + if output_columns is None and 'output' in all_args: + output_columns = all_args['output'] + if 'output' in all_args: + all_args.pop('output') + + # validate input + if input_columns is None: + raise ValueError( + "'None' input passed when it cannot be none.") + + if not isinstance(input_columns, list): + raise ValueError( + "input has to be a list of strings, instead got %s" % + type(input_columns)) + + # validate output + if output_columns is None: + output_columns = input_columns + + if not isinstance(output_columns, list): + raise ValueError( + "output has to be a list of strings, instead got %s" % + type(output_columns)) + + algo_args = dict( + column=[ + dict( + Source=i, + Name=o) for i, + o in zip( + input_columns, + output_columns)] if input_columns else None, + char_array_term_separators=self.char_array_term_separators) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/entrypoints/transforms_wordtokenizer.py b/src/python/nimbusml/internal/entrypoints/transforms_wordtokenizer.py new file mode 100644 index 00000000..e7fac07a --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/transforms_wordtokenizer.py @@ -0,0 +1,76 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +Transforms.WordTokenizer +""" + + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def transforms_wordtokenizer( + data, + output_data=None, + model=None, + column=None, + char_array_term_separators=None, + **params): + """ + **Description** + The input to this transform is text, and the output is a vector of + text containing the words (tokens) in the original text. The + separator is space, but can be specified as any other + character (or multiple characters) if needed. + + :param column: New column definition(s) (inputs). + :param data: Input dataset (inputs). + :param char_array_term_separators: Array of single character term + separator(s). By default uses space character separator. + (inputs). + :param output_data: Transformed dataset (outputs). + :param model: Transform model (outputs). 
+    """
+
+    entrypoint_name = 'Transforms.WordTokenizer'
+    inputs = {}
+    outputs = {}
+
+    if column is not None:
+        inputs['Column'] = try_set(
+            obj=column,
+            none_acceptable=True,
+            is_of_type=list,
+            is_column=True)
+    if data is not None:
+        inputs['Data'] = try_set(
+            obj=data,
+            none_acceptable=False,
+            is_of_type=str)
+    if char_array_term_separators is not None:
+        inputs['CharArrayTermSeparators'] = try_set(
+            obj=char_array_term_separators,
+            none_acceptable=True,
+            is_of_type=list)
+    if output_data is not None:
+        outputs['OutputData'] = try_set(
+            obj=output_data,
+            none_acceptable=False,
+            is_of_type=str)
+    if model is not None:
+        outputs['Model'] = try_set(
+            obj=model,
+            none_acceptable=False,
+            is_of_type=str)
+
+    input_variables = {
+        x for x in unlist(inputs.values())
+        if isinstance(x, str) and x.startswith("$")}
+    output_variables = {
+        x for x in unlist(outputs.values())
+        if isinstance(x, str) and x.startswith("$")}
+
+    entrypoint = EntryPoint(
+        name=entrypoint_name, inputs=inputs, outputs=outputs,
+        input_variables=input_variables,
+        output_variables=output_variables)
+    return entrypoint
diff --git a/src/python/nimbusml/preprocessing/text/__init__.py b/src/python/nimbusml/preprocessing/text/__init__.py
index b255f350..686e7c5a 100644
--- a/src/python/nimbusml/preprocessing/text/__init__.py
+++ b/src/python/nimbusml/preprocessing/text/__init__.py
@@ -1,5 +1,7 @@
 from .chartokenizer import CharTokenizer
+from .wordtokenizer import WordTokenizer
 
 __all__ = [
-    'CharTokenizer'
+    'CharTokenizer',
+    'WordTokenizer'
 ]
diff --git a/src/python/nimbusml/preprocessing/text/wordtokenizer.py b/src/python/nimbusml/preprocessing/text/wordtokenizer.py
new file mode 100644
index 00000000..54a0ffe0
--- /dev/null
+++ b/src/python/nimbusml/preprocessing/text/wordtokenizer.py
@@ -0,0 +1,55 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+WordTokenizer
+"""
+
+__all__ = ["WordTokenizer"]
+
+
+from sklearn.base import TransformerMixin
+
+from ...base_transform import BaseTransform
+from ...internal.core.preprocessing.text.wordtokenizer import \
+    WordTokenizer as core
+from ...internal.utils.utils import trace
+
+
+class WordTokenizer(core, BaseTransform, TransformerMixin):
+    """
+    **Description**
+        The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed.
+
+    :param columns: see `Columns </nimbusml/concepts/columns>`_.
+
+    :param char_array_term_separators: Array of single character term
+        separator(s). By default uses space character separator.
+
+    :param params: Additional arguments sent to compute engine.
+
+    """
+
+    @trace
+    def __init__(
+            self,
+            char_array_term_separators=None,
+            columns=None,
+            **params):
+
+        if columns:
+            params['columns'] = columns
+        BaseTransform.__init__(self, **params)
+        core.__init__(
+            self,
+            char_array_term_separators=char_array_term_separators,
+            **params)
+        self._columns = columns
+
+    def get_params(self, deep=False):
+        """
+        Get the parameters for this operator.
+        """
+        return core.get_params(self)
diff --git a/src/python/tools/entrypoint_compiler.py b/src/python/tools/entrypoint_compiler.py
index b2765691..ed829533 100644
--- a/src/python/tools/entrypoint_compiler.py
+++ b/src/python/tools/entrypoint_compiler.py
@@ -1457,7 +1457,7 @@ def parse_arg(argument, inout):
             assert not is_column
             arg_obj = NumericArrayArg(argument, inout)
         elif itemType in ["String", "DataView", "PredictorModel",
-                          "TransformModel", "Node"]:
+                          "TransformModel", "Node", "Char"]:
             arg_obj = StringArrayArg(argument, inout,
                                      is_column=is_column)
         elif isinstance(itemType, dict):
diff --git a/src/python/tools/manifest_diff.json b/src/python/tools/manifest_diff.json
index d8a64d82..89e6cf7f 100644
--- a/src/python/tools/manifest_diff.json
+++ b/src/python/tools/manifest_diff.json
@@ -720,6 +720,12 @@
       "Module": "preprocessing.text",
       "Type": "Transform"
     },
+    {
+      "Name": "Transforms.WordTokenizer",
+      "NewName": "WordTokenizer",
+      "Module": "preprocessing.text",
+      "Type": "Transform"
+    },
     {
       "Name": "Transforms.LightLda",
       "NewName": "LightLda",

From a0e364092117ee878284d3974ba53cb896ff36cd Mon Sep 17 00:00:00 2001
From: "pieths.dev@gmail.com" <pieths.dev@gmail.com>
Date: Thu, 3 Oct 2019 10:12:59 -0700
Subject: [PATCH 2/8] Remove debug code that accidentally made it through the
 merge.

---
 src/DotNetBridge/NativeDataInterop.cs | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/DotNetBridge/NativeDataInterop.cs b/src/DotNetBridge/NativeDataInterop.cs
index d1e43814..b7f1a762 100644
--- a/src/DotNetBridge/NativeDataInterop.cs
+++ b/src/DotNetBridge/NativeDataInterop.cs
@@ -122,9 +122,6 @@ private static unsafe void SendViewToNativeAsDataFrame(IChannel ch, EnvironmentB
             var expandCols = new HashSet<int>(1000);
             var valueCounts = new List<int>(1000);
 
-            // Only works with WordTokenizer_df.py
-            PrintDataView(view);
-
             for (int col = 0; col < schema.Count; col++)
             {
                 if (schema[col].IsHidden)

From 414fb4b391fbb900f6eac71c90d5d25975ddb92a Mon Sep 17 00:00:00 2001
From: "pieths.dev@gmail.com" <pieths.dev@gmail.com>
Date: Thu, 3 Oct 2019 10:30:09 -0700
Subject: [PATCH 3/8] Update the WordTokenizer_df example.

---
 .../examples_from_dataframe/WordTokenizer_df.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
index 4a7fa2e2..8c0e2362 100644
--- a/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
+++ b/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
@@ -16,7 +16,7 @@
     "somewhat dissapointing. I'd probably wont try again",
     "Never visit again... rascals!"]))
 
-tokenize = WordTokenizer(char_array_term_separators=[" ", "n"]) << ['review']
+tokenize = WordTokenizer(char_array_term_separators=[" ", "n"]) << 'review'
 
 pipeline = Pipeline([tokenize])
 
@@ -24,3 +24,11 @@
 y = tokenize.transform(customer_reviews)
 
 print(y)
+#   review.00 review.01 review.02 review.03 review.04 review.05 review.06 review.07 review.08 review.09 review.10 review.11
+# 0         I    really       did        ot      like       the     taste        of        it      None      None      None
+# 1        It       was  surprisi       gly     quite     good!      None      None      None      None      None      None
+# 2         I      will      ever      ever      ever        go        to      that     place      agai        !!      None
+# 3       The      best    ever!!        It       was     amazi       gly      good         a         d     super      fast
+# 4         I      wish         I       had        go         e  earlier,        it       was      that     great      None
+# 5  somewhat  dissapoi        ti        g.       I'd  probably        wo         t       try      agai      None      None
+# 6     Never     visit      agai       ...  rascals!      None      None      None      None      None      None      None
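
A quick way to sanity-check the expected tokens above (a plain-Python sketch
only; the transform itself runs ML.NET's word tokenizer, not this regex):
splitting on the single-character separators " " and "n" and dropping empty
tokens reproduces each row, e.g.

    import re
    # Split on either separator; filter out the empty strings that adjacent
    # separators (e.g. the space before the "n" in " not ") leave behind.
    tokens = [t for t in re.split('[ n]', "It was surprisingly quite good!") if t]
    print(tokens)  # ['It', 'was', 'surprisi', 'gly', 'quite', 'good!']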
From cc99e17e855162c907089e3f9be799fc5cf79d09 Mon Sep 17 00:00:00 2001
From: "pieths.dev@gmail.com" <pieths.dev@gmail.com>
Date: Thu, 3 Oct 2019 10:45:37 -0700
Subject: [PATCH 4/8] Remove unnecessary import from WordTokenizer_df.

---
 .../examples/examples_from_dataframe/WordTokenizer_df.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
index 8c0e2362..31980567 100644
--- a/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
+++ b/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py
@@ -2,8 +2,7 @@
 # WordTokenizer
 
 import pandas
-from nimbusml import Pipeline, Role
-from nimbusml.preprocessing.schema import ColumnConcatenator
+from nimbusml import Pipeline
 from nimbusml.preprocessing.text import WordTokenizer
 
 # create the data

From 68c739e0bed7b22c03989136a53058f7ad0540df Mon Sep 17 00:00:00 2001
From: "pieths.dev@gmail.com" <pieths.dev@gmail.com>
Date: Thu, 3 Oct 2019 10:47:28 -0700
Subject: [PATCH 5/8] Add WordTokenizer example.

---
 src/python/nimbusml.pyproj                    |  1 +
 src/python/nimbusml/examples/WordTokenizer.py | 32 +++++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 src/python/nimbusml/examples/WordTokenizer.py

diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
index dbb24f72..1f9cab3a 100644
--- a/src/python/nimbusml.pyproj
+++ b/src/python/nimbusml.pyproj
@@ -92,6 +92,7 @@
+    <Compile Include="nimbusml\examples\WordTokenizer.py" />
diff --git a/src/python/nimbusml/examples/WordTokenizer.py b/src/python/nimbusml/examples/WordTokenizer.py
new file mode 100644
index 00000000..0f4cc76a
--- /dev/null
+++ b/src/python/nimbusml/examples/WordTokenizer.py
@@ -0,0 +1,32 @@
+###############################################################################
+# WordTokenizer
+
+from nimbusml import Pipeline, FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.preprocessing.text import WordTokenizer
+
+# data input (as a FileDataStream)
+path = get_dataset("wiki_detox_train").as_filepath()
+
+data = FileDataStream.read_csv(path, sep='\t')
+print(data.head())
+#    Sentiment                                      SentimentText
+# 0          1  ==RUDE== Dude, you are rude upload that carl p...
+# 1          1  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
+# 2          1  Stop trolling, zapatancas, calling me a liar m...
+# 3          1  ==You're cool==  You seem like a really cool g...
+# 4          1  ::::: Why are you threatening me? I'm not bein...
+
+tokenize = WordTokenizer(char_array_term_separators=[" "]) << {'wt': 'SentimentText'}
+pipeline = Pipeline([tokenize])
+
+tokenize.fit(data)
+y = tokenize.transform(data)
+
+print(y.drop(labels='SentimentText', axis=1).head())
+#    Sentiment    wt.000     wt.001       wt.002   wt.003       wt.004  wt.005  ... wt.366 wt.367 wt.368 wt.369 wt.370 wt.371 wt.372
+# 0          1  ==RUDE==      Dude,          you      are         rude  upload  ...   None   None   None   None   None   None   None
+# 1          1        ==        OK!           ==       IM        GOING      TO  ...   None   None   None   None   None   None   None
+# 2          1      Stop  trolling,  zapatancas,  calling           me       a  ...   None   None   None   None   None   None   None
+# 3          1  ==You're     cool==          You     seem         like       a  ...   None   None   None   None   None   None   None
+# 4          1     :::::        Why          are      you  threatening     me?  ...   None   None   None   None   None   None   None

From 1169fe055824f31710def283924489cfdd38ffd8 Mon Sep 17 00:00:00 2001
From: "pieths.dev@gmail.com" <pieths.dev@gmail.com>
Date: Thu, 3 Oct 2019 11:03:22 -0700
Subject: [PATCH 6/8] Add initial unit test for WordTokenizer.
---
 src/python/nimbusml.pyproj                    |  1 +
 .../preprocessing/text/test_wordtokenizer.py  | 33 +++++++++++++++++++
 2 files changed, 34 insertions(+)
 create mode 100644 src/python/nimbusml/tests/preprocessing/text/test_wordtokenizer.py

diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
index 1f9cab3a..f21bc0c3 100644
--- a/src/python/nimbusml.pyproj
+++ b/src/python/nimbusml.pyproj
@@ -690,6 +690,7 @@
+    <Compile Include="nimbusml\tests\preprocessing\text\test_wordtokenizer.py" />
diff --git a/src/python/nimbusml/tests/preprocessing/text/test_wordtokenizer.py b/src/python/nimbusml/tests/preprocessing/text/test_wordtokenizer.py
new file mode 100644
index 00000000..a8c66016
--- /dev/null
+++ b/src/python/nimbusml/tests/preprocessing/text/test_wordtokenizer.py
@@ -0,0 +1,33 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+import unittest
+
+import pandas
+from nimbusml import Pipeline
+from nimbusml.preprocessing.text import WordTokenizer
+
+
+class TestWordTokenizer(unittest.TestCase):
+
+    def test_wordtokenizer(self):
+        customer_reviews = pandas.DataFrame(data=dict(review=[
+            "I really did not like the taste of it",
+            "It was surprisingly quite good!"]))
+
+        tokenize = WordTokenizer(char_array_term_separators=[" ", "n"]) << 'review'
+        pipeline = Pipeline([tokenize])
+
+        tokenize.fit(customer_reviews)
+        y = tokenize.transform(customer_reviews)
+
+        self.assertEqual(y.shape, (2, 9))
+
+        self.assertEqual(y.loc[0, 'review.3'], 'ot')
+        self.assertEqual(y.loc[1, 'review.3'], 'gly')
+        self.assertEqual(y.loc[1, 'review.6'], None)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 954e8b6a05c0861964baad1e4065b89f8b421594 Mon Sep 17 00:00:00 2001
From: "pieths.dev@gmail.com" <pieths.dev@gmail.com>
Date: Thu, 3 Oct 2019 11:15:22 -0700
Subject: [PATCH 7/8] Excluded WordTokenizer from most tests in
 test_estimator_checks.

---
 src/python/tests/test_estimator_checks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/tests/test_estimator_checks.py b/src/python/tests/test_estimator_checks.py
index eb39246b..e47ce638 100644
--- a/src/python/tests/test_estimator_checks.py
+++ b/src/python/tests/test_estimator_checks.py
@@ -160,7 +160,7 @@
         'PixelExtractor, Loader, Resizer, \
GlobalContrastRowScaler, PcaTransformer, '
         'ColumnConcatenator, Sentiment, CharTokenizer, LightLda, '
-        'NGramFeaturizer, WordEmbedding, LpScaler',
+        'NGramFeaturizer, WordEmbedding, LpScaler, WordTokenizer',
         'check_transformer_data_not_an_array, check_pipeline_consistency, '
         'check_fit2d_1feature, check_estimators_fit_returns_self,\
check_fit2d_1sample, '

From 10a384d3847b19d8baec7f4a2de3b211046f611d Mon Sep 17 00:00:00 2001
From: "pieths.dev@gmail.com" <pieths.dev@gmail.com>
Date: Thu, 3 Oct 2019 14:02:15 -0700
Subject: [PATCH 8/8] Whitespace change to restart ci run. Mac run lost
 communication.

---
 src/python/nimbusml/examples/WordTokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/nimbusml/examples/WordTokenizer.py b/src/python/nimbusml/examples/WordTokenizer.py
index 0f4cc76a..028d5d7e 100644
--- a/src/python/nimbusml/examples/WordTokenizer.py
+++ b/src/python/nimbusml/examples/WordTokenizer.py
@@ -1,5 +1,5 @@
 ###############################################################################
-# WordTokenizer
+# WordTokenizer 
 
 from nimbusml import Pipeline, FileDataStream
 from nimbusml.datasets import get_dataset
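
A closing note on the column wiring these patches add (a minimal sketch, not
part of the patch series): WordTokenizer._get_node in PATCH 1/8 pairs each
input column with an output column as dict(Source=i, Name=o) entries,
defaulting the outputs to the inputs when none are given, before handing the
list to the Transforms.WordTokenizer entrypoint. In isolation, with the
'review' column used throughout the examples:

    input_columns = ['review']
    output_columns = input_columns  # _get_node falls back to the inputs
    column = [dict(Source=i, Name=o)
              for i, o in zip(input_columns, output_columns)]
    print(column)  # [{'Source': 'review', 'Name': 'review'}]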