-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Exposed ngram extraction options in TextFeaturizer #2911
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d42d923
4f9bab8
88d6604
d3e2841
2e66dce
f2ecc95
7c5b324
e03104b
d569492
6d5e7ac
e14ac38
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,7 +16,7 @@ | |
| using Microsoft.ML.Runtime; | ||
| using Microsoft.ML.Transforms.Text; | ||
|
|
||
| [assembly: LoadableClass(TextFeaturizingEstimator.Summary, typeof(IDataTransform), typeof(TextFeaturizingEstimator), typeof(TextFeaturizingEstimator.Arguments), typeof(SignatureDataTransform), | ||
| [assembly: LoadableClass(TextFeaturizingEstimator.Summary, typeof(IDataTransform), typeof(TextFeaturizingEstimator), typeof(TextFeaturizingEstimator.Options), typeof(SignatureDataTransform), | ||
| TextFeaturizingEstimator.UserName, "TextTransform", TextFeaturizingEstimator.LoaderSignature)] | ||
|
|
||
| [assembly: LoadableClass(TextFeaturizingEstimator.Summary, typeof(ITransformer), typeof(TextFeaturizingEstimator), null, typeof(SignatureLoadModel), | ||
|
|
@@ -86,12 +86,12 @@ internal bool TryUnparse(StringBuilder sb) | |
| } | ||
|
|
||
| /// <summary> | ||
| /// This class exposes <see cref="NgramExtractorTransform"/>/<see cref="NgramHashExtractingTransformer"/> arguments. | ||
| /// Advanced options for the <see cref="TextFeaturizingEstimator"/>. | ||
| /// </summary> | ||
| internal sealed class Arguments : TransformInputBase | ||
| public sealed class Options : TransformInputBase | ||
| { | ||
| [Argument(ArgumentType.Required, HelpText = "New column definition (optional form: name:srcs).", Name = "Column", ShortName = "col", SortOrder = 1)] | ||
| public Column Columns; | ||
| internal Column Columns; | ||
|
|
||
| [Argument(ArgumentType.AtMostOnce, HelpText = "Dataset language or 'AutoDetect' to detect language per row.", ShortName = "lang", SortOrder = 3)] | ||
| public Language Language = DefaultLanguage; | ||
|
|
@@ -115,67 +115,80 @@ internal sealed class Arguments : TransformInputBase | |
| public bool OutputTokens; | ||
|
|
||
| [Argument(ArgumentType.Multiple, HelpText = "A dictionary of whitelisted terms.", ShortName = "dict", NullName = "<None>", SortOrder = 10, Hide = true)] | ||
| public TermLoaderArguments Dictionary; | ||
| internal TermLoaderArguments Dictionary; | ||
|
|
||
| [TGUI(Label = "Word Gram Extractor")] | ||
| [Argument(ArgumentType.Multiple, HelpText = "Ngram feature extractor to use for words (WordBag/WordHashBag).", ShortName = "wordExtractor", NullName = "<None>", SortOrder = 11)] | ||
| public INgramExtractorFactoryFactory WordFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments(); | ||
|
|
||
| [TGUI(Label = "Char Gram Extractor")] | ||
| [Argument(ArgumentType.Multiple, HelpText = "Ngram feature extractor to use for characters (WordBag/WordHashBag).", ShortName = "charExtractor", NullName = "<None>", SortOrder = 12)] | ||
| public INgramExtractorFactoryFactory CharFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments() { NgramLength = 3, AllLengths = false }; | ||
|
|
||
| [Argument(ArgumentType.AtMostOnce, HelpText = "Normalize vectors (rows) individually by rescaling them to unit norm.", ShortName = "norm", SortOrder = 13)] | ||
| public NormFunction VectorNormalizer = NormFunction.L2; | ||
| } | ||
| [Argument(ArgumentType.Multiple, Name = "WordFeatureExtractor", HelpText = "Ngram feature extractor to use for words (WordBag/WordHashBag).", ShortName = "wordExtractor", NullName = "<None>", SortOrder = 11)] | ||
| internal INgramExtractorFactoryFactory WordFeatureExtractorFactory; | ||
|
|
||
| /// <summary> | ||
| /// Advanced options for the <see cref="TextFeaturizingEstimator"/>. | ||
| /// </summary> | ||
| public sealed class Options | ||
| { | ||
| #pragma warning disable MSML_NoInstanceInitializers // No initializers on instance fields or properties | ||
| /// <summary> | ||
| /// Dataset language. | ||
| /// </summary> | ||
| public Language TextLanguage { get; set; } = DefaultLanguage; | ||
| /// <summary> | ||
| /// Casing used for the text. | ||
| /// </summary> | ||
| public CaseMode TextCase { get; set; } = CaseMode.Lower; | ||
| /// <summary> | ||
| /// Whether to keep diacritical marks or remove them. | ||
| /// </summary> | ||
| public bool KeepDiacritics { get; set; } = false; | ||
| /// <summary> | ||
| /// Whether to keep punctuation marks or remove them. | ||
| /// </summary> | ||
| public bool KeepPunctuations { get; set; } = true; | ||
| /// <summary> | ||
| /// Whether to keep numbers or remove them. | ||
| /// </summary> | ||
| public bool KeepNumbers { get; set; } = true; | ||
| /// <summary> | ||
| /// Whether to output the transformed text tokens as an additional column. | ||
| /// The underlying state of <see cref="WordFeatureExtractorFactory"/> and <see cref="WordFeatureExtractor"/>. | ||
| /// </summary> | ||
| public bool OutputTokens { get; set; } = false; | ||
| /// <summary> | ||
| /// Vector Normalizer to use. | ||
| /// </summary> | ||
| public NormFunction VectorNormalizer { get; set; } = NormFunction.L2; | ||
| private WordBagEstimator.Options _wordFeatureExtractor; | ||
|
|
||
| /// <summary> | ||
| /// Whether to use stop remover or not. | ||
| /// Ngram feature extractor to use for words (WordBag/WordHashBag). | ||
| /// </summary> | ||
| public bool UseStopRemover { get; set; } = false; | ||
| public WordBagEstimator.Options WordFeatureExtractor | ||
| { | ||
| get { return _wordFeatureExtractor; } | ||
| set | ||
| { | ||
| _wordFeatureExtractor = value; | ||
| NgramExtractorTransform.NgramExtractorArguments extractor = null; | ||
| if (_wordFeatureExtractor != null) | ||
| { | ||
| extractor = new NgramExtractorTransform.NgramExtractorArguments(); | ||
| extractor.NgramLength = _wordFeatureExtractor.NgramLength; | ||
| extractor.SkipLength = _wordFeatureExtractor.SkipLength; | ||
| extractor.AllLengths = _wordFeatureExtractor.AllLengths; | ||
| extractor.MaxNumTerms = _wordFeatureExtractor.MaximumNgramsCount; | ||
| extractor.Weighting = _wordFeatureExtractor.Weighting; | ||
| } | ||
| WordFeatureExtractorFactory = extractor; | ||
| } | ||
| } | ||
|
|
||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Public field needs document. In addition, to internalize |
||
| [TGUI(Label = "Char Gram Extractor")] | ||
| [Argument(ArgumentType.Multiple, Name = "CharFeatureExtractor", HelpText = "Ngram feature extractor to use for characters (WordBag/WordHashBag).", ShortName = "charExtractor", NullName = "<None>", SortOrder = 12)] | ||
| internal INgramExtractorFactoryFactory CharFeatureExtractorFactory; | ||
|
|
||
| /// <summary> | ||
| /// Whether to use char extractor or not. | ||
| /// The underlying state of <see cref="CharFeatureExtractorFactory"/> and <see cref="CharFeatureExtractor"/> | ||
| /// </summary> | ||
| public bool UseCharExtractor { get; set; } = true; | ||
| private WordBagEstimator.Options _charFeatureExtractor; | ||
|
|
||
| /// <summary> | ||
| /// Whether to use word extractor or not. | ||
| /// Ngram feature extractor to use for characters (WordBag/WordHashBag). | ||
| /// </summary> | ||
| public bool UseWordExtractor { get; set; } = true; | ||
| #pragma warning restore MSML_NoInstanceInitializers // No initializers on instance fields or properties | ||
| public WordBagEstimator.Options CharFeatureExtractor | ||
| { | ||
| get { return _charFeatureExtractor; } | ||
| set | ||
| { | ||
| _charFeatureExtractor = value; | ||
| NgramExtractorTransform.NgramExtractorArguments extractor = null; | ||
| if (_charFeatureExtractor != null) | ||
| { | ||
| extractor = new NgramExtractorTransform.NgramExtractorArguments(); | ||
| extractor.NgramLength = _charFeatureExtractor.NgramLength; | ||
| extractor.SkipLength = _charFeatureExtractor.SkipLength; | ||
| extractor.AllLengths = _charFeatureExtractor.AllLengths; | ||
| extractor.MaxNumTerms = _charFeatureExtractor.MaximumNgramsCount; | ||
| extractor.Weighting = _charFeatureExtractor.Weighting; | ||
| } | ||
| CharFeatureExtractorFactory = extractor; | ||
| } | ||
| } | ||
|
|
||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Public field needs document. In addition, to internalize IFactory please follow the pattern Tom and I have built in this PR --- PR description of #2851 #Resolved |
||
| [Argument(ArgumentType.AtMostOnce, HelpText = "Normalize vectors (rows) individually by rescaling them to unit norm.", ShortName = "norm", SortOrder = 13)] | ||
| public NormFunction VectorNormalizer = NormFunction.L2; | ||
|
|
||
| public Options() | ||
| { | ||
| WordFeatureExtractor = new WordBagEstimator.Options(); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This will kill the initial value of |
||
| CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, AllLengths = false }; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't these fields initialized to null by default? And then fallback to WordFeatureExtractorFactory / CharFeatureExtractorFactory whenever you try to use them and they are not defined. This way code flow from entrypoint / maml will be same as with API. #Resolved
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Right now, WordFeatureExtractorFactory / CharFeatureExtractorFactory is preferred if the maml is used. In reply to: 264465314 [](ancestors = 264465314) |
||
| } | ||
| } | ||
|
|
||
| internal readonly string OutputColumn; | ||
|
|
@@ -274,13 +287,13 @@ public bool NeedInitialSourceColumnConcatTransform | |
| public TransformApplierParams(TextFeaturizingEstimator parent) | ||
| { | ||
| var host = parent._host; | ||
| host.Check(Enum.IsDefined(typeof(Language), parent.OptionalSettings.TextLanguage)); | ||
| host.Check(Enum.IsDefined(typeof(Language), parent.OptionalSettings.Language)); | ||
| host.Check(Enum.IsDefined(typeof(CaseMode), parent.OptionalSettings.TextCase)); | ||
| WordExtractorFactory = parent._wordFeatureExtractor?.CreateComponent(host, parent._dictionary); | ||
| CharExtractorFactory = parent._charFeatureExtractor?.CreateComponent(host, parent._dictionary); | ||
| VectorNormalizer = parent.OptionalSettings.VectorNormalizer; | ||
| Language = parent.OptionalSettings.TextLanguage; | ||
| UsePredefinedStopWordRemover = parent.OptionalSettings.UseStopRemover; | ||
| Language = parent.OptionalSettings.Language; | ||
| UsePredefinedStopWordRemover = parent.OptionalSettings.UsePredefinedStopWordRemover; | ||
| TextCase = parent.OptionalSettings.TextCase; | ||
| KeepDiacritics = parent.OptionalSettings.KeepDiacritics; | ||
| KeepPunctuations = parent.OptionalSettings.KeepPunctuations; | ||
|
|
@@ -323,10 +336,9 @@ internal TextFeaturizingEstimator(IHostEnvironment env, string name, IEnumerable | |
| OptionalSettings = options; | ||
|
|
||
| _dictionary = null; | ||
| if (OptionalSettings.UseWordExtractor) | ||
| _wordFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments(); | ||
| if (OptionalSettings.UseCharExtractor) | ||
| _charFeatureExtractor = new NgramExtractorTransform.NgramExtractorArguments() { NgramLength = 3, AllLengths = false }; | ||
| _wordFeatureExtractor = OptionalSettings.WordFeatureExtractorFactory; | ||
| _charFeatureExtractor = OptionalSettings.CharFeatureExtractorFactory; | ||
|
|
||
| } | ||
|
|
||
| /// <summary> | ||
|
|
@@ -548,26 +560,12 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) | |
| } | ||
|
|
||
| // Factory method for SignatureDataTransform. | ||
| internal static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView data) | ||
| internal static IDataTransform Create(IHostEnvironment env, Options args, IDataView data) | ||
| { | ||
| var settings = new Options | ||
| { | ||
| TextLanguage = args.Language, | ||
| TextCase = args.TextCase, | ||
| KeepDiacritics = args.KeepDiacritics, | ||
| KeepPunctuations = args.KeepPunctuations, | ||
| KeepNumbers = args.KeepNumbers, | ||
| OutputTokens = args.OutputTokens, | ||
| VectorNormalizer = args.VectorNormalizer, | ||
| UseStopRemover = args.UsePredefinedStopWordRemover, | ||
| UseWordExtractor = args.WordFeatureExtractor != null, | ||
| UseCharExtractor = args.CharFeatureExtractor != null, | ||
| }; | ||
|
|
||
| var estimator = new TextFeaturizingEstimator(env, args.Columns.Name, args.Columns.Source ?? new[] { args.Columns.Name }, settings); | ||
| var estimator = new TextFeaturizingEstimator(env, args.Columns.Name, args.Columns.Source ?? new[] { args.Columns.Name }, args); | ||
| estimator._dictionary = args.Dictionary; | ||
| estimator._wordFeatureExtractor = args.WordFeatureExtractor; | ||
| estimator._charFeatureExtractor = args.CharFeatureExtractor; | ||
| estimator._wordFeatureExtractor = args.WordFeatureExtractorFactory; | ||
| estimator._charFeatureExtractor = args.CharFeatureExtractorFactory; | ||
| return estimator.Fit(data).Transform(data) as IDataTransform; | ||
| } | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not related to your PR, but whole point of this thing is to set or autodetect language, we left autodetect outside of ML.NET so it's looks completely unnecessary here, but as I said in other comment, this is out of scope of this PR. #Resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, let use this issue for tracking this #838.
In reply to: 264433503 [](ancestors = 264433503)