diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs deleted file mode 100644 index 134a3ca3a0..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs +++ /dev/null @@ -1,82 +0,0 @@ -using System; -using System.Collections.Generic; -using Microsoft.ML.Data; - -namespace Microsoft.ML.Samples.Dynamic -{ - public static class StopWordRemoverTransform - { - public static void Example() - { - // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, - // as well as the source of randomness. - var ml = new MLContext(); - - // Get a small dataset as an IEnumerable and convert to IDataView. - var data = SamplesUtils.DatasetUtils.GetSentimentData(); - var trainData = ml.Data.LoadFromEnumerable(data); - - // Preview of the data. - // - // Sentiment SentimentText - // true Best game I've ever played. - // false ==RUDE== Dude, 2. - // true Until the next game, this is the best Xbox game! - - // Let's take SentimentText column and break it into vector of words. - string originalTextColumnName = "Words"; - var words = ml.Transforms.Text.TokenizeIntoWords("SentimentText", originalTextColumnName); - - // Default pipeline will apply default stop word remover which is based on predifined set of words for certain languages. - var defaultPipeline = words.Append(ml.Transforms.Text.RemoveDefaultStopWords(originalTextColumnName, "DefaultRemover")); - - // Another pipeline, that removes words specified by user. We do case insensitive comparison for the stop words. - var customizedPipeline = words.Append(ml.Transforms.Text.RemoveStopWords(originalTextColumnName, "RemovedWords", - new[] { "XBOX" })); - - // The transformed data for both pipelines. 
- var transformedDataDefault = defaultPipeline.Fit(trainData).Transform(trainData); - var transformedDataCustomized = customizedPipeline.Fit(trainData).Transform(trainData); - - // Small helper to print the text inside the columns, in the console. - Action>>> printHelper = (columnName, column) => - { - Console.WriteLine($"{columnName} column obtained post-transformation."); - foreach (var featureRow in column) - { - foreach (var value in featureRow.GetValues()) - Console.Write($"{value}|"); - Console.WriteLine(""); - } - - Console.WriteLine("==================================================="); - }; - - // Preview the result of breaking string into array of words. - var originalText = transformedDataDefault.GetColumn>>(transformedDataDefault.Schema[originalTextColumnName]); - printHelper(originalTextColumnName, originalText); - // Best|game|I've|ever|played.| - // == RUDE ==| Dude,| 2 | - // Until | the | next | game,| this |is| the | best | Xbox | game!| - - // Preview the result of cleaning with default stop word remover. - var defaultRemoverData = transformedDataDefault.GetColumn>>(transformedDataDefault.Schema["DefaultRemover"]); - printHelper("DefaultRemover", defaultRemoverData); - // Best|game|I've|played.| - // == RUDE ==| Dude,| 2 | - // game,| best | Xbox | game!| - // As you can see "Until, the, next, this, is" was removed. - - - // Preview the result of cleaning with default customized stop word remover. - var customizeRemoverData = transformedDataCustomized.GetColumn>>(transformedDataCustomized.Schema["RemovedWords"]); - printHelper("RemovedWords", customizeRemoverData); - - // Best|game|I've|ever|played.| - // == RUDE ==| Dude,| 2 | - // Until | the | next | game,| this |is| the | best | game!| - //As you can see Xbox was removed. 
- - } - } -} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs index 4ac4ab18da..c3ee04dbbe 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs @@ -13,7 +13,7 @@ public static void Example() // as well as the source of randomness. var mlContext = new MLContext(); - // Create an empty data sample list. The 'ApplyWordEmbedding' does not require training data as + // Create an empty list as the dataset. The 'ApplyWordEmbedding' does not require training data as // the estimator ('WordEmbeddingEstimator') created by 'ApplyWordEmbedding' API is not a trainable estimator. // The empty list is only needed to pass input schema to the pipeline. var emptySamples = new List(); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs index 0a58a2da07..c1a62e21f5 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs @@ -12,7 +12,7 @@ public static void Example() // as well as the source of randomness. var mlContext = new MLContext(); - // Create an empty data sample list. The 'ApplyWordEmbedding' does not require training data as + // Create an empty list as the dataset. The 'ApplyWordEmbedding' does not require training data as // the estimator ('WordEmbeddingEstimator') created by 'ApplyWordEmbedding' API is not a trainable estimator. // The empty list is only needed to pass input schema to the pipeline. 
var emptySamples = new List(); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/NormalizeText.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/NormalizeText.cs index 920ea4353c..3fa83cf3ca 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/NormalizeText.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/NormalizeText.cs @@ -12,7 +12,7 @@ public static void Example() // as well as the source of randomness. var mlContext = new MLContext(); - // Create an empty data sample list. The 'NormalizeText' API does not require training data as + // Create an empty list as the dataset. The 'NormalizeText' API does not require training data as // the estimator ('TextNormalizingEstimator') created by 'NormalizeText' API is not a trainable estimator. // The empty list is only needed to pass input schema to the pipeline. var emptySamples = new List(); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs new file mode 100644 index 0000000000..ddd5a56750 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs @@ -0,0 +1,60 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML.Transforms.Text; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class RemoveDefaultStopWords + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Create an empty list as the dataset. The 'RemoveDefaultStopWords' does not require training data as + // the estimator ('StopWordsRemovingEstimator') created by 'RemoveDefaultStopWords' API is not a trainable estimator. + // The empty list is only needed to pass input schema to the pipeline. 
+ var emptySamples = new List<TextData>(); + + // Convert sample list to an empty IDataView. + var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples); + + // A pipeline for removing stop words from input text/string. + // The pipeline first tokenizes text into words then removes stop words. + // The 'RemoveDefaultStopWords' API ignores casing of the text/string e.g. 'tHe' and 'the' are considered the same stop word. + var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text") + .Append(mlContext.Transforms.Text.RemoveDefaultStopWords("WordsWithoutStopWords", "Words", language: StopWordsRemovingEstimator.Language.English)); + + // Fit to data. + var textTransformer = textPipeline.Fit(emptyDataView); + + // Create the prediction engine to remove the stop words from the input text/string. + var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer); + + // Call the prediction API to remove stop words. + var data = new TextData() { Text = "ML.NET's RemoveDefaultStopWords API removes stop words from tHe text/string. It requires the text/string to be tokenized beforehand." }; + var prediction = predictionEngine.Predict(data); + + // Print the length of the word vector after the stop words are removed. + Console.WriteLine($"Number of words: {prediction.WordsWithoutStopWords.Length}"); + + // Print the word vector without stop words. + Console.WriteLine($"\nWords without stop words: {string.Join(",", prediction.WordsWithoutStopWords)}"); + + // Expected output: + // Number of words: 11 + // Words without stop words: ML.NET's,RemoveDefaultStopWords,API,removes,stop,words,text/string.,requires,text/string,tokenized,beforehand. 
+ } + + public class TextData + { + public string Text { get; set; } + } + + public class TransformedTextData : TextData + { + public string[] WordsWithoutStopWords { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs new file mode 100644 index 0000000000..a412920496 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs @@ -0,0 +1,60 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML.Transforms.Text; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class RemoveStopWords + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Create an empty list as the dataset. The 'RemoveStopWords' does not require training data as + // the estimator ('CustomStopWordsRemovingEstimator') created by 'RemoveStopWords' API is not a trainable estimator. + // The empty list is only needed to pass input schema to the pipeline. + var emptySamples = new List<TextData>(); + + // Convert sample list to an empty IDataView. + var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples); + + // A pipeline for removing stop words from input text/string. + // The pipeline first tokenizes text into words then removes stop words. + // The 'RemoveStopWords' API ignores casing of the text/string e.g. 'tHe' and 'the' are considered the same stop word. + var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text") + .Append(mlContext.Transforms.Text.RemoveStopWords("WordsWithoutStopWords", "Words", stopwords: new[] { "a", "the", "from", "by" })); + + // Fit to data. 
+ var textTransformer = textPipeline.Fit(emptyDataView); + + // Create the prediction engine to remove the stop words from the input text/string. + var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer); + + // Call the prediction API to remove stop words. + var data = new TextData() { Text = "ML.NET's RemoveStopWords API removes stop words from tHe text/string using a list of stop words provided by the user." }; + var prediction = predictionEngine.Predict(data); + + // Print the length of the word vector after the stop words are removed. + Console.WriteLine($"Number of words: {prediction.WordsWithoutStopWords.Length}"); + + // Print the word vector without stop words. + Console.WriteLine($"\nWords without stop words: {string.Join(",", prediction.WordsWithoutStopWords)}"); + + // Expected output: + // Number of words: 14 + // Words without stop words: ML.NET's,RemoveStopWords,API,removes,stop,words,text/string,using,list,of,stop,words,provided,user. + } + + public class TextData + { + public string Text { get; set; } + } + + public class TransformedTextData : TextData + { + public string[] WordsWithoutStopWords { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharactersAsKeys.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharactersAsKeys.cs index 9c443b459a..922269d222 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharactersAsKeys.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharactersAsKeys.cs @@ -12,7 +12,7 @@ public static void Example() // as well as the source of randomness. var mlContext = new MLContext(); - // Create an empty data sample list. The 'TokenizeIntoCharactersAsKeys' does not require training data as + // Create an empty list as the dataset. 
The 'TokenizeIntoCharactersAsKeys' does not require training data as // the estimator ('TokenizingByCharactersEstimator') created by 'TokenizeIntoCharactersAsKeys' API is not a trainable estimator. // The empty list is only needed to pass input schema to the pipeline. var emptySamples = new List(); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoWords.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoWords.cs new file mode 100644 index 0000000000..1f98bd5a21 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoWords.cs @@ -0,0 +1,59 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class TokenizeIntoWords + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Create an empty list as the dataset. The 'TokenizeIntoWords' does not require training data as + // the estimator ('WordTokenizingEstimator') created by 'TokenizeIntoWords' API is not a trainable estimator. + // The empty list is only needed to pass input schema to the pipeline. + var emptySamples = new List<TextData>(); + + // Convert sample list to an empty IDataView. + var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples); + + // A pipeline for converting text into vector of words. + // The following call to 'TokenizeIntoWords' tokenizes text/string into words using space as a separator. + // Space is also a default value for the 'separators' argument if it is not specified. + var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text", separators: new[] { ' ' }); + + // Fit to data. + var textTransformer = textPipeline.Fit(emptyDataView); + + // Create the prediction engine to get the word vector from the input text/string. 
+ var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer); + + // Call the prediction API to convert the text into words. + var data = new TextData() { Text = "ML.NET's TokenizeIntoWords API splits text/string into words using the list of characters provided as separators." }; + var prediction = predictionEngine.Predict(data); + + // Print the length of the word vector. + Console.WriteLine($"Number of words: {prediction.Words.Length}"); + + // Print the word vector. + Console.WriteLine($"\nWords: {string.Join(",", prediction.Words)}"); + + // Expected output: + // Number of words: 15 + // Words: ML.NET's,TokenizeIntoWords,API,splits,text/string,into,words,using,the,list,of,characters,provided,as,separators. + } + + public class TextData + { + public string Text { get; set; } + } + + public class TransformedTextData : TextData + { + public string[] Words { get; set; } + } + } +} diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 86dafb8807..db412be77c 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -186,6 +186,13 @@ internal static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param> /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param> /// <param name="separators">The separators to use (uses space character by default).</param> 
+ /// <example> + /// <format type="text/markdown"> + /// <![CDATA[ + /// [!code-csharp[TokenizeIntoWords](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoWords.cs)] + /// ]]> + /// </format> + /// </example> public static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string inputColumnName = null, @@ -254,8 +261,9 @@ internal static NgramExtractingEstimator ProduceNgrams(this TransformsCatalog.Te /// <example> /// <format type="text/markdown"> /// <![CDATA[ + /// [!code-csharp[RemoveDefaultStopWords](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs)] + /// ]]> + /// </format> /// </example> public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsCatalog.TextTransforms catalog, string outputColumnName, @@ -274,8 +282,9 @@ public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsC /// <example> /// <format type="text/markdown"> /// <![CDATA[ + /// [!code-csharp[RemoveStopWords](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs)] + /// ]]> + /// </format> /// </example> public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCatalog.TextTransforms catalog, string outputColumnName,