@@ -55,8 +55,9 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
5555 /// <param name="catalog">The text-related transform's catalog.</param>
5656 /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
5757 /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
58- /// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
59- public static TokenizingByCharactersEstimator TokenizeCharacters ( this TransformsCatalog . TextTransforms catalog ,
58+ /// <param name="useMarkerCharacters">Whether to prepend a marker character, <c>0x02</c>, to the beginning,
59+ /// and append another marker character, <c>0x03</c>, to the end of the output vector of characters.</param>
60+ public static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys ( this TransformsCatalog . TextTransforms catalog ,
6061 string outputColumnName ,
6162 string inputColumnName = null ,
6263 bool useMarkerCharacters = CharTokenizingDefaults . UseMarkerCharacters )
@@ -67,10 +68,11 @@ public static TokenizingByCharactersEstimator TokenizeCharacters(this Transforms
6768 /// Tokenize incoming text in input columns and output the tokens as output columns.
6869 /// </summary>
6970 /// <param name="catalog">The text-related transform's catalog.</param>
70- /// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
71+ /// <param name="useMarkerCharacters">Whether to prepend a marker character, <c>0x02</c>, to the beginning,
72+ /// and append another marker character, <c>0x03</c>, to the end of the output vector of characters.</param>
7173 /// <param name="columns">Pairs of columns to run the tokenization on.</param>
7274
73- public static TokenizingByCharactersEstimator TokenizeCharacters ( this TransformsCatalog . TextTransforms catalog ,
75+ public static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys ( this TransformsCatalog . TextTransforms catalog ,
7476 bool useMarkerCharacters = CharTokenizingDefaults . UseMarkerCharacters ,
7577 params ColumnOptions [ ] columns )
7678 => new TokenizingByCharactersEstimator ( Contracts . CheckRef ( catalog , nameof ( catalog ) ) . GetEnvironment ( ) , useMarkerCharacters , ColumnOptions . ConvertToValueTuples ( columns ) ) ;
@@ -157,29 +159,18 @@ public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.T
157159 /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
158160 /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
159161 /// <param name="separators">The separators to use (uses space character by default).</param>
160- public static WordTokenizingEstimator TokenizeWords ( this TransformsCatalog . TextTransforms catalog ,
162+ public static WordTokenizingEstimator TokenizeIntoWords ( this TransformsCatalog . TextTransforms catalog ,
161163 string outputColumnName ,
162164 string inputColumnName = null ,
163165 char [ ] separators = null )
164166 => new WordTokenizingEstimator ( Contracts . CheckRef ( catalog , nameof ( catalog ) ) . GetEnvironment ( ) , outputColumnName , inputColumnName , separators ) ;
165167
166- /// <summary>
167- /// Tokenizes incoming text in input columns and outputs the tokens using <paramref name="separators"/> as separators.
168- /// </summary>
169- /// <param name="catalog">The text-related transform's catalog.</param>
170- /// <param name="columns">Pairs of columns to run the tokenization on.</param>
171- /// <param name="separators">The separators to use (uses space character by default).</param>
172- public static WordTokenizingEstimator TokenizeWords ( this TransformsCatalog . TextTransforms catalog ,
173- ( string outputColumnName , string inputColumnName ) [ ] columns ,
174- char [ ] separators = null )
175- => new WordTokenizingEstimator ( Contracts . CheckRef ( catalog , nameof ( catalog ) ) . GetEnvironment ( ) , columns , separators ) ;
176-
177168 /// <summary>
178169 /// Tokenizes incoming text in input columns, using per-column configurations, and outputs the tokens.
179170 /// </summary>
180171 /// <param name="catalog">The text-related transform's catalog.</param>
181172 /// <param name="columns">Pairs of columns to run the tokenization on.</param>
182- public static WordTokenizingEstimator TokenizeWords ( this TransformsCatalog . TextTransforms catalog ,
173+ public static WordTokenizingEstimator TokenizeIntoWords ( this TransformsCatalog . TextTransforms catalog ,
183174 params WordTokenizingEstimator . ColumnOptions [ ] columns )
184175 => new WordTokenizingEstimator ( Contracts . CheckRef ( catalog , nameof ( catalog ) ) . GetEnvironment ( ) , columns ) ;
185176
@@ -243,24 +234,6 @@ public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsC
243234 StopWordsRemovingEstimator . Language language = StopWordsRemovingEstimator . Language . English )
244235 => new StopWordsRemovingEstimator ( Contracts . CheckRef ( catalog , nameof ( catalog ) ) . GetEnvironment ( ) , outputColumnName , inputColumnName , language ) ;
245236
246- /// <summary>
247- /// Removes stop words from incoming token streams in input columns
248- /// and outputs the token streams without stop words as output columns.
249- /// </summary>
250- /// <param name="catalog">The text-related transform's catalog.</param>
251- /// <param name="columns">Pairs of columns to remove stop words on.</param>
252- /// <param name="language">Language of the input text columns <paramref name="columns"/>.</param>
253- /// <example>
254- /// <format type="text/markdown">
255- /// <![CDATA[
257- /// ]]></format>
258- /// </example>
259- public static StopWordsRemovingEstimator RemoveDefaultStopWords ( this TransformsCatalog . TextTransforms catalog ,
260- ( string outputColumnName , string inputColumnName ) [ ] columns ,
261- StopWordsRemovingEstimator . Language language = StopWordsRemovingEstimator . Language . English )
262- => new StopWordsRemovingEstimator ( Contracts . CheckRef ( catalog , nameof ( catalog ) ) . GetEnvironment ( ) , columns , language ) ;
263-
264237 /// <summary>
265238 /// Removes stop words from incoming token streams in <paramref name="inputColumnName"/>
266239 /// and outputs the token streams without stopwords as <paramref name="outputColumnName"/>.
@@ -281,24 +254,6 @@ public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCa
281254 params string [ ] stopwords )
282255 => new CustomStopWordsRemovingEstimator ( Contracts . CheckRef ( catalog , nameof ( catalog ) ) . GetEnvironment ( ) , outputColumnName , inputColumnName , stopwords ) ;
283256
284- /// <summary>
285- /// Removes stop words from incoming token streams in input columns
286- /// and outputs the token streams without stop words as output columns.
287- /// </summary>
288- /// <param name="catalog">The text-related transform's catalog.</param>
289- /// <param name="columns">Pairs of columns to remove stop words on.</param>
290- /// <param name="stopwords">Array of words to remove.</param>
291- /// <example>
292- /// <format type="text/markdown">
293- /// <![CDATA[
295- /// ]]></format>
296- /// </example>
297- public static CustomStopWordsRemovingEstimator RemoveStopWords ( this TransformsCatalog . TextTransforms catalog ,
298- ( string outputColumnName , string inputColumnName ) [ ] columns ,
299- params string [ ] stopwords )
300- => new CustomStopWordsRemovingEstimator ( Contracts . CheckRef ( catalog , nameof ( catalog ) ) . GetEnvironment ( ) , columns , stopwords ) ;
301-
302257 /// <summary>
303258 /// Produces a bag of counts of ngrams (sequences of consecutive words) in <paramref name="inputColumnName"/>
304259 /// and outputs bag of word vector as <paramref name="outputColumnName"/>
0 commit comments