Skip to content
2 changes: 1 addition & 1 deletion docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public static void Example()
KeepPunctuations = false,
KeepNumbers = false,
OutputTokens = true,
TextLanguage = TextFeaturizingEstimator.Language.English, // supports English, French, German, Dutch, Italian, Spanish, Japanese
Language = TextFeaturizingEstimator.Language.English, // supports English, French, German, Dutch, Italian, Spanish, Japanese
}, "SentimentText");

// The transformed data for both pipelines.
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ internal static class TextAnalytics
Desc = TextFeaturizingEstimator.Summary,
UserName = TextFeaturizingEstimator.UserName,
ShortName = TextFeaturizingEstimator.LoaderSignature)]
public static CommonOutputs.TransformOutput TextTransform(IHostEnvironment env, TextFeaturizingEstimator.Arguments input)
public static CommonOutputs.TransformOutput TextTransform(IHostEnvironment env, TextFeaturizingEstimator.Options input)
{
var h = EntryPointUtils.CheckArgsAndCreateHost(env, "FeaturizeTextEstimator", input);
var xf = TextFeaturizingEstimator.Create(h, input, input.Data);
Expand Down
156 changes: 77 additions & 79 deletions src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
using Microsoft.ML.Runtime;
using Microsoft.ML.Transforms.Text;

[assembly: LoadableClass(TextFeaturizingEstimator.Summary, typeof(IDataTransform), typeof(TextFeaturizingEstimator), typeof(TextFeaturizingEstimator.Arguments), typeof(SignatureDataTransform),
[assembly: LoadableClass(TextFeaturizingEstimator.Summary, typeof(IDataTransform), typeof(TextFeaturizingEstimator), typeof(TextFeaturizingEstimator.Options), typeof(SignatureDataTransform),
TextFeaturizingEstimator.UserName, "TextTransform", TextFeaturizingEstimator.LoaderSignature)]

[assembly: LoadableClass(TextFeaturizingEstimator.Summary, typeof(ITransformer), typeof(TextFeaturizingEstimator), null, typeof(SignatureLoadModel),
Expand Down Expand Up @@ -86,12 +86,12 @@ internal bool TryUnparse(StringBuilder sb)
}

/// <summary>
/// This class exposes <see cref="NgramExtractorTransform"/>/<see cref="NgramHashExtractingTransformer"/> arguments.
/// Advanced options for the <see cref="TextFeaturizingEstimator"/>.
/// </summary>
internal sealed class Arguments : TransformInputBase
public sealed class Options : TransformInputBase
{
[Argument(ArgumentType.Required, HelpText = "New column definition (optional form: name:srcs).", Name = "Column", ShortName = "col", SortOrder = 1)]
public Column Columns;
internal Column Columns;

[Argument(ArgumentType.AtMostOnce, HelpText = "Dataset language or 'AutoDetect' to detect language per row.", ShortName = "lang", SortOrder = 3)]
public Language Language = DefaultLanguage;
Copy link
Contributor

@Ivanidzo4ka Ivanidzo4ka Mar 11, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Language [](start = 28, length = 8)

Not related to your PR, but the whole point of this option is to set or auto-detect the language. We left auto-detection outside of ML.NET, so it looks completely unnecessary here; but, as I said in the other comment, this is out of scope for this PR. #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, let's use issue #838 to track this.


In reply to: 264433503 [](ancestors = 264433503)

Expand All @@ -115,67 +115,80 @@ internal sealed class Arguments : TransformInputBase
public bool OutputTokens;

[Argument(ArgumentType.Multiple, HelpText = "A dictionary of whitelisted terms.", ShortName = "dict", NullName = "<None>", SortOrder = 10, Hide = true)]
public TermLoaderArguments Dictionary;
internal TermLoaderArguments Dictionary;

[TGUI(Label = "Word Gram Extractor")]
[Argument(ArgumentType.Multiple, HelpText = "Ngram feature extractor to use for words (WordBag/WordHashBag).", ShortName = "wordExtractor", NullName = "<None>", SortOrder = 11)]
public INgramExtractorFactoryFactory WordFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments();

[TGUI(Label = "Char Gram Extractor")]
[Argument(ArgumentType.Multiple, HelpText = "Ngram feature extractor to use for characters (WordBag/WordHashBag).", ShortName = "charExtractor", NullName = "<None>", SortOrder = 12)]
public INgramExtractorFactoryFactory CharFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments() { NgramLength = 3, AllLengths = false };

[Argument(ArgumentType.AtMostOnce, HelpText = "Normalize vectors (rows) individually by rescaling them to unit norm.", ShortName = "norm", SortOrder = 13)]
public NormFunction VectorNormalizer = NormFunction.L2;
}
[Argument(ArgumentType.Multiple, Name = "WordFeatureExtractor", HelpText = "Ngram feature extractor to use for words (WordBag/WordHashBag).", ShortName = "wordExtractor", NullName = "<None>", SortOrder = 11)]
internal INgramExtractorFactoryFactory WordFeatureExtractorFactory;

/// <summary>
/// Advanced options for the <see cref="TextFeaturizingEstimator"/>.
/// </summary>
public sealed class Options
{
#pragma warning disable MSML_NoInstanceInitializers // No initializers on instance fields or properties
/// <summary>
/// Dataset language.
/// </summary>
public Language TextLanguage { get; set; } = DefaultLanguage;
/// <summary>
/// Casing used for the text.
/// </summary>
public CaseMode TextCase { get; set; } = CaseMode.Lower;
/// <summary>
/// Whether to keep diacritical marks or remove them.
/// </summary>
public bool KeepDiacritics { get; set; } = false;
/// <summary>
/// Whether to keep punctuation marks or remove them.
/// </summary>
public bool KeepPunctuations { get; set; } = true;
/// <summary>
/// Whether to keep numbers or remove them.
/// </summary>
public bool KeepNumbers { get; set; } = true;
/// <summary>
/// Whether to output the transformed text tokens as an additional column.
/// The underlying state of <see cref="WordFeatureExtractorFactory"/> and <see cref="WordFeatureExtractor"/>.
/// </summary>
public bool OutputTokens { get; set; } = false;
/// <summary>
/// Vector Normalizer to use.
/// </summary>
public NormFunction VectorNormalizer { get; set; } = NormFunction.L2;
private WordBagEstimator.Options _wordFeatureExtractor;

/// <summary>
/// Whether to use stop remover or not.
/// Ngram feature extractor to use for words (WordBag/WordHashBag).
/// </summary>
public bool UseStopRemover { get; set; } = false;
public WordBagEstimator.Options WordFeatureExtractor
{
    get => _wordFeatureExtractor;
    set
    {
        // Keep the public options object exactly as supplied (null included).
        _wordFeatureExtractor = value;

        // A null value clears the internal factory as well.
        if (value == null)
        {
            WordFeatureExtractorFactory = null;
            return;
        }

        // Copy the public options into a fresh set of internal extractor arguments.
        WordFeatureExtractorFactory = new NgramExtractorTransform.NgramExtractorArguments
        {
            NgramLength = value.NgramLength,
            SkipLength = value.SkipLength,
            AllLengths = value.AllLengths,
            MaxNumTerms = value.MaximumNgramsCount,
            Weighting = value.Weighting,
        };
    }
}

Copy link
Member

@wschin wschin Mar 12, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Public fields need documentation. In addition, to internalize IFactory, please follow the pattern Tom and I established (see the PR description of #2851). #Resolved

[TGUI(Label = "Char Gram Extractor")]
[Argument(ArgumentType.Multiple, Name = "CharFeatureExtractor", HelpText = "Ngram feature extractor to use for characters (WordBag/WordHashBag).", ShortName = "charExtractor", NullName = "<None>", SortOrder = 12)]
internal INgramExtractorFactoryFactory CharFeatureExtractorFactory;

/// <summary>
/// Whether to use char extractor or not.
/// The underlying state of <see cref="CharFeatureExtractorFactory"/> and <see cref="CharFeatureExtractor"/>
/// </summary>
public bool UseCharExtractor { get; set; } = true;
private WordBagEstimator.Options _charFeatureExtractor;

/// <summary>
/// Whether to use word extractor or not.
/// Ngram feature extractor to use for characters (WordBag/WordHashBag).
/// </summary>
public bool UseWordExtractor { get; set; } = true;
#pragma warning restore MSML_NoInstanceInitializers // No initializers on instance fields or properties
public WordBagEstimator.Options CharFeatureExtractor
{
    get => _charFeatureExtractor;
    set
    {
        // Keep the public options object exactly as supplied (null included).
        _charFeatureExtractor = value;

        // A null value clears the internal factory as well.
        if (value == null)
        {
            CharFeatureExtractorFactory = null;
            return;
        }

        // Copy the public options into a fresh set of internal extractor arguments.
        CharFeatureExtractorFactory = new NgramExtractorTransform.NgramExtractorArguments
        {
            NgramLength = value.NgramLength,
            SkipLength = value.SkipLength,
            AllLengths = value.AllLengths,
            MaxNumTerms = value.MaximumNgramsCount,
            Weighting = value.Weighting,
        };
    }
}

Copy link
Member

@wschin wschin Mar 12, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Public fields need documentation. In addition, to internalize IFactory, please follow the pattern Tom and I established (see the PR description of #2851). #Resolved

[Argument(ArgumentType.AtMostOnce, HelpText = "Normalize vectors (rows) individually by rescaling them to unit norm.", ShortName = "norm", SortOrder = 13)]
public NormFunction VectorNormalizer = NormFunction.L2;

public Options()
{
WordFeatureExtractor = new WordBagEstimator.Options();
Copy link
Member

@wschin wschin Mar 12, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WordFeatureExtractor [](start = 16, length = 20)

This will overwrite the initial value of WordFeatureExtractorFactory. How about removing the `new NgramExtractorTransform.NgramExtractorArguments()` initializer from `internal INgramExtractorFactoryFactory WordFeatureExtractorFactory = new NgramExtractorTransform.NgramExtractorArguments();`? A similar comment applies to CharFeatureExtractor.
#Resolved

CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, AllLengths = false };
Copy link
Member

@ganik ganik Mar 11, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't these fields be initialized to null by default, and then fall back to WordFeatureExtractorFactory / CharFeatureExtractorFactory whenever you try to use them and they are not defined? That way the code flow from entry points / maml would be the same as with the API. #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WordFeatureExtractor and CharFeatureExtractor can both be null. When one of them is null, it means the corresponding transform is not applied. So there is no way to tell whether null was set by the user or is just the default value.

Right now, WordFeatureExtractorFactory / CharFeatureExtractorFactory is preferred if the maml is used.


In reply to: 264465314 [](ancestors = 264465314)

}
}

internal readonly string OutputColumn;
Expand Down Expand Up @@ -274,13 +287,13 @@ public bool NeedInitialSourceColumnConcatTransform
public TransformApplierParams(TextFeaturizingEstimator parent)
{
var host = parent._host;
host.Check(Enum.IsDefined(typeof(Language), parent.OptionalSettings.TextLanguage));
host.Check(Enum.IsDefined(typeof(Language), parent.OptionalSettings.Language));
host.Check(Enum.IsDefined(typeof(CaseMode), parent.OptionalSettings.TextCase));
WordExtractorFactory = parent._wordFeatureExtractor?.CreateComponent(host, parent._dictionary);
CharExtractorFactory = parent._charFeatureExtractor?.CreateComponent(host, parent._dictionary);
VectorNormalizer = parent.OptionalSettings.VectorNormalizer;
Language = parent.OptionalSettings.TextLanguage;
UsePredefinedStopWordRemover = parent.OptionalSettings.UseStopRemover;
Language = parent.OptionalSettings.Language;
UsePredefinedStopWordRemover = parent.OptionalSettings.UsePredefinedStopWordRemover;
TextCase = parent.OptionalSettings.TextCase;
KeepDiacritics = parent.OptionalSettings.KeepDiacritics;
KeepPunctuations = parent.OptionalSettings.KeepPunctuations;
Expand Down Expand Up @@ -323,10 +336,9 @@ internal TextFeaturizingEstimator(IHostEnvironment env, string name, IEnumerable
OptionalSettings = options;

_dictionary = null;
if (OptionalSettings.UseWordExtractor)
_wordFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments();
if (OptionalSettings.UseCharExtractor)
_charFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments() { NgramLength = 3, AllLengths = false };
_wordFeatureExtractor = OptionalSettings.WordFeatureExtractorFactory;
_charFeatureExtractor = OptionalSettings.CharFeatureExtractorFactory;

}

/// <summary>
Expand Down Expand Up @@ -548,26 +560,12 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema)
}

// Factory method for SignatureDataTransform.
internal static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView data)
internal static IDataTransform Create(IHostEnvironment env, Options args, IDataView data)
{
var settings = new Options
{
TextLanguage = args.Language,
TextCase = args.TextCase,
KeepDiacritics = args.KeepDiacritics,
KeepPunctuations = args.KeepPunctuations,
KeepNumbers = args.KeepNumbers,
OutputTokens = args.OutputTokens,
VectorNormalizer = args.VectorNormalizer,
UseStopRemover = args.UsePredefinedStopWordRemover,
UseWordExtractor = args.WordFeatureExtractor != null,
UseCharExtractor = args.CharFeatureExtractor != null,
};

var estimator = new TextFeaturizingEstimator(env, args.Columns.Name, args.Columns.Source ?? new[] { args.Columns.Name }, settings);
var estimator = new TextFeaturizingEstimator(env, args.Columns.Name, args.Columns.Source ?? new[] { args.Columns.Name }, args);
estimator._dictionary = args.Dictionary;
estimator._wordFeatureExtractor = args.WordFeatureExtractor;
estimator._charFeatureExtractor = args.CharFeatureExtractor;
estimator._wordFeatureExtractor = args.WordFeatureExtractorFactory;
estimator._charFeatureExtractor = args.CharFeatureExtractorFactory;
return estimator.Fit(data).Transform(data) as IDataTransform;
}

Expand Down
1 change: 0 additions & 1 deletion src/Microsoft.ML.Transforms/Text/TextNormalizing.cs
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,6 @@ internal static class Defaults
public const bool KeepDiacritics = false;
public const bool KeepPunctuations = true;
public const bool KeepNumbers = true;

}

internal static bool IsColumnTypeValid(DataViewType type) => (type.GetItemType() is TextDataViewType);
Expand Down
41 changes: 41 additions & 0 deletions src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,47 @@ public sealed class WordBagEstimator : IEstimator<ITransformer>
private readonly int _maxNumTerms;
private readonly NgramExtractingEstimator.WeightingCriteria _weighting;

/// <summary>
/// Settings that control how ngrams are extracted.
/// </summary>
public class Options
{
    /// <summary>
    /// Upper bound on the length of an ngram.
    /// </summary>
    public int NgramLength;

    /// <summary>
    /// Upper bound on how many tokens may be skipped while building an ngram.
    /// </summary>
    public int SkipLength;

    /// <summary>
    /// When set, every ngram length up to <see cref="NgramLength"/> is stored;
    /// otherwise only ngrams of exactly <see cref="NgramLength"/> are stored.
    /// </summary>
    public bool AllLengths;

    /// <summary>
    /// Maximum number of grams kept in the dictionary for each ngram level,
    /// from 1 (at position 0) up to ngramLength (at position ngramLength-1).
    /// </summary>
    public int[] MaximumNgramsCount;

    /// <summary>
    /// Weighting criteria applied to the ngrams.
    /// </summary>
    public NgramExtractingEstimator.WeightingCriteria Weighting;

    /// <summary>
    /// Creates the options with their defaults: an ngram length of 1, and the remaining
    /// values taken from <see cref="NgramExtractingEstimator.Defaults"/>.
    /// </summary>
    public Options()
    {
        // Unigrams by default.
        NgramLength = 1;

        // Everything else comes from the shared ngram-extractor defaults.
        Weighting = NgramExtractingEstimator.Defaults.Weighting;
        MaximumNgramsCount = new int[] { NgramExtractingEstimator.Defaults.MaxNumTerms };
        AllLengths = NgramExtractingEstimator.Defaults.AllLengths;
        SkipLength = NgramExtractingEstimator.Defaults.SkipLength;
    }
}

/// <summary>
/// Produces a bag of counts of ngrams (sequences of consecutive words) in <paramref name="inputColumnName"/>
/// and outputs bag of word vector as <paramref name="outputColumnName"/>
Expand Down
2 changes: 1 addition & 1 deletion test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ Transforms.Scorer Turn the predictor model into a transform model Microsoft.ML.E
Transforms.Segregator Un-groups vector columns into sequences of rows, inverse of Group transform Microsoft.ML.Transforms.GroupingOperations Ungroup Microsoft.ML.Transforms.UngroupTransform+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
Transforms.SentimentAnalyzer Uses a pretrained sentiment model to score input strings Microsoft.ML.Transforms.Text.TextAnalytics AnalyzeSentiment Microsoft.ML.Transforms.Text.SentimentAnalyzingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
Transforms.TensorFlowScorer Transforms the data using the TensorFlow model. Microsoft.ML.Transforms.TensorFlowTransformer TensorFlowScorer Microsoft.ML.Transforms.TensorFlowEstimator+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
Transforms.TextFeaturizer A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are normalized counts of (word and/or character) ngrams in a given tokenized text. Microsoft.ML.Transforms.Text.TextAnalytics TextTransform Microsoft.ML.Transforms.Text.TextFeaturizingEstimator+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
Transforms.TextFeaturizer A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are normalized counts of (word and/or character) ngrams in a given tokenized text. Microsoft.ML.Transforms.Text.TextAnalytics TextTransform Microsoft.ML.Transforms.Text.TextFeaturizingEstimator+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
Transforms.TextToKeyConverter Converts input values (words, numbers, etc.) to index in a dictionary. Microsoft.ML.Transforms.Categorical TextToKey Microsoft.ML.Transforms.ValueToKeyMappingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
Transforms.TrainTestDatasetSplitter Split the dataset into train and test sets Microsoft.ML.EntryPoints.TrainTestSplit Split Microsoft.ML.EntryPoints.TrainTestSplit+Input Microsoft.ML.EntryPoints.TrainTestSplit+Output
Transforms.TreeLeafFeaturizer Trains a tree ensemble, or loads it from a file, then maps a numeric feature vector to three outputs: 1. A vector containing the individual tree outputs of the tree ensemble. 2. A vector indicating the leaves that the feature vector falls on in the tree ensemble. 3. A vector indicating the paths that the feature vector falls on in the tree ensemble. If a both a model file and a trainer are specified - will use the model file. If neither are specified, will train a default FastTree model. This can handle key labels by training a regression model towards their optionally permuted indices. Microsoft.ML.Data.TreeFeaturize Featurizer Microsoft.ML.Data.TreeEnsembleFeaturizerTransform+ArgumentsForEntryPoint Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,10 @@ public void TrainSentiment()
{
OutputTokens = true,
KeepPunctuations = false,
UseStopRemover = true,
UsePredefinedStopWordRemover = true,
VectorNormalizer = TextFeaturizingEstimator.NormFunction.None,
UseCharExtractor = false,
UseWordExtractor = false,
CharFeatureExtractor = null,
WordFeatureExtractor = null,
}, "SentimentText").Fit(loader).Transform(loader);

var trans = mlContext.Transforms.Text.ApplyWordEmbedding("Features", "WordEmbeddings_TransformedText",
Expand Down
Loading