From 19b0d1b44d4eebe933831f458550808a46c5ea94 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Wed, 27 Mar 2019 15:59:28 -0700 Subject: [PATCH 1/4] Created sample for 'TokenizeIntoCharactersAsKeys' API. --- .../Transforms/Text/TokenizeIntoCharacters.cs | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharacters.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharacters.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharacters.cs new file mode 100644 index 0000000000..432e88157b --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharacters.cs @@ -0,0 +1,61 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class TokenizeIntoCharacters + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Create an empty data sample list. The 'TokenizeIntoCharactersAsKeys' does not require training data as + // the estimator ('TokenizingByCharactersEstimator') created by 'TokenizeIntoCharactersAsKeys' API is not a trainable estimator. + // The empty list is only needed to pass input schema to the pipeline. + var samples = new List(); + + // Convert sample list to an empty IDataView. + var dataview = mlContext.Data.LoadFromEnumerable(samples); + + // A pipeline for converting text into vector of characters. + // The 'TokenizeIntoCharactersAsKeys' produces result as key type. + // 'MapKeyToValue' is needed to map keys back to their original values. 
+ var textPipeline = mlContext.Transforms.Text.TokenizeIntoCharactersAsKeys("CharTokens", "Text", useMarkerCharacters: false) + .Append(mlContext.Transforms.Conversion.MapKeyToValue("CharTokens")); + + // Fit to data. + var textTransformer = textPipeline.Fit(dataview); + + // Create the prediction engine to get the character vector from the input text/string. + var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); + + // Call the prediction API to convert the text into characters. + var data = new TextData() { Text = "ML.NET's TokenizeIntoCharactersAsKeys API splits text/string into characters." }; + var prediction = predictionEngine.Predict(data); + + // Print the length of the character vector. + Console.WriteLine($"Number of tokens: {prediction.CharTokens.Length}"); + + // Print the character vector. + Console.WriteLine($"\nCharacter Tokens: {string.Join(",", prediction.CharTokens)}"); + + // Expected output: + // Number of tokens: 112 + // Character Tokens: M,L,.,N,E,T,',s,,T,o,k,e,n,i,z,e,I,n,t,o,C,h,a,r,a,c,t,e,r,s,A,s,K,e,y,s,,A,P,I,, + // s,p,l,i,t,s,,t,e,x,t,/,s,t,r,i,n,g,,i,n,t,o,,c,h,a,r,a,c,t,e,r,s,. + } + + public class TextData + { + public string Text { get; set; } + } + + public class TransformedTextData : TextData + { + public string[] CharTokens { get; set; } + } + } +} From f4166a7fcbcdfed00fbb9f32d6d8b424210a87d9 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Wed, 27 Mar 2019 16:02:15 -0700 Subject: [PATCH 2/4] Updated the catalog. --- src/Microsoft.ML.Transforms/Text/TextCatalog.cs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index e2baf1578e..1b6bb0b084 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -57,6 +57,13 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text /// Name of the column to transform. 
If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source. /// Whether to prepend a marker character, , to the beginning, /// and append another marker character, , to the end of the output vector of characters. + /// + /// + /// + /// + /// public static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string inputColumnName = null, From 428d833c51750ac206f4bf79782f62f27ccfc734 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Thu, 28 Mar 2019 13:42:30 -0700 Subject: [PATCH 3/4] Addressed reviewers' comments. --- .../Dynamic/Transforms/Text/TokenizeIntoCharacters.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharacters.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharacters.cs index 432e88157b..6a3c29dbc9 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharacters.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharacters.cs @@ -46,6 +46,8 @@ public static void Example() // Number of tokens: 112 // Character Tokens: M,L,.,N,E,T,',s,,T,o,k,e,n,i,z,e,I,n,t,o,C,h,a,r,a,c,t,e,r,s,A,s,K,e,y,s,,A,P,I,, // s,p,l,i,t,s,,t,e,x,t,/,s,t,r,i,n,g,,i,n,t,o,,c,h,a,r,a,c,t,e,r,s,. + // + // A unicode control character ('\u2400') is used instead of spaces in the output above. } public class TextData From 04f92d54138ff9e57ba07dbf2de8ca2279fce8e9 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Thu, 28 Mar 2019 15:03:07 -0700 Subject: [PATCH 4/4] Addressed reviewers' comments. 
--- ...toCharacters.cs => TokenizeIntoCharactersAsKeys.cs} | 10 +++++----- src/Microsoft.ML.Transforms/Text/TextCatalog.cs | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) rename docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/{TokenizeIntoCharacters.cs => TokenizeIntoCharactersAsKeys.cs} (90%) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharacters.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharactersAsKeys.cs similarity index 90% rename from docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharacters.cs rename to docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharactersAsKeys.cs index 6a3c29dbc9..9c443b459a 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharacters.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharactersAsKeys.cs @@ -4,7 +4,7 @@ namespace Microsoft.ML.Samples.Dynamic { - public static class TokenizeIntoCharacters + public static class TokenizeIntoCharactersAsKeys { public static void Example() { @@ -15,10 +15,10 @@ public static void Example() // Create an empty data sample list. The 'TokenizeIntoCharactersAsKeys' does not require training data as // the estimator ('TokenizingByCharactersEstimator') created by 'TokenizeIntoCharactersAsKeys' API is not a trainable estimator. // The empty list is only needed to pass input schema to the pipeline. - var samples = new List(); + var emptySamples = new List(); // Convert sample list to an empty IDataView. - var dataview = mlContext.Data.LoadFromEnumerable(samples); + var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples); // A pipeline for converting text into vector of characters. // The 'TokenizeIntoCharactersAsKeys' produces result as key type. @@ -27,7 +27,7 @@ public static void Example() .Append(mlContext.Transforms.Conversion.MapKeyToValue("CharTokens")); // Fit to data. 
- var textTransformer = textPipeline.Fit(dataview); + var textTransformer = textPipeline.Fit(emptyDataView); // Create the prediction engine to get the character vector from the input text/string. var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); @@ -43,7 +43,7 @@ public static void Example() Console.WriteLine($"\nCharacter Tokens: {string.Join(",", prediction.CharTokens)}"); // Expected output: - // Number of tokens: 112 + // Number of tokens: 77 // Character Tokens: M,L,.,N,E,T,',s,,T,o,k,e,n,i,z,e,I,n,t,o,C,h,a,r,a,c,t,e,r,s,A,s,K,e,y,s,,A,P,I,, // s,p,l,i,t,s,,t,e,x,t,/,s,t,r,i,n,g,,i,n,t,o,,c,h,a,r,a,c,t,e,r,s,. // diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 1b6bb0b084..e348e5faa3 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -60,7 +60,7 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text /// /// /// /// ///