1+ // Licensed to the .NET Foundation under one or more agreements.
2+ // The .NET Foundation licenses this file to you under the MIT license.
3+ // See the LICENSE file in the project root for more information.
4+
5+ using System ;
6+ using Microsoft . ML . Functional . Tests . Datasets ;
7+ using Microsoft . ML . RunTests ;
8+ using Microsoft . ML . TestFramework ;
9+ using Microsoft . ML . Trainers ;
10+ using Microsoft . ML . Transforms ;
11+ using Microsoft . ML . Transforms . Text ;
12+ using Xunit ;
13+ using Xunit . Abstractions ;
14+
15+ namespace Microsoft . ML . Functional . Tests
16+ {
17+ public class DataTransformation : BaseTestClass
18+ {
/// <summary>
/// Creates the test class, handing the supplied output helper to the base test class.
/// </summary>
/// <param name="output">Test output helper forwarded unchanged to the base constructor.</param>
public DataTransformation(ITestOutputHelper output)
    : base(output)
{
}
22+
/// <summary>
/// Extensibility: Add a new column that is a function of other columns.
/// </summary>
/// <remarks>
/// A custom mapping copies every Iris column and adds <c>Float1</c>, the cosine of the angle
/// between the petal (width, length) vector and the sepal (width, length) vector. The test then
/// recomputes that value from each transformed row and checks that it matches.
/// </remarks>
[Fact]
public void ExtensibilityAddAColumnAsAFunctionOfMultipleColumns()
{
    // Concurrency must be 1 to assure that the mapping is done sequentially.
    var mlContext = new MLContext(seed: 1, conc: 1);

    // Load the Iris dataset.
    var data = mlContext.Data.LoadFromTextFile<Iris>(
        GetDataPath(TestDatasets.iris.trainFilename),
        hasHeader: TestDatasets.iris.fileHasHeader,
        separatorChar: TestDatasets.iris.fileSeparator);

    // Subsample it down to the first 10 rows.
    int numSamples = 10;
    data = mlContext.Data.TakeRows(data, numSamples);

    // Stand-alone function computing the cosine similarity between the petal and the sepal
    // (width, length) vectors: dot product divided by the product of the magnitudes.
    float AngiospermCosine(float petalWidth, float petalLength, float sepalWidth, float sepalLength)
    {
        var petalMagnitude = Math.Sqrt(petalWidth * petalWidth + petalLength * petalLength);
        var sepalMagnitude = Math.Sqrt(sepalWidth * sepalWidth + sepalLength * sepalLength);
        return (float)((petalWidth * sepalWidth + petalLength * sepalLength) / (petalMagnitude * sepalMagnitude));
    }

    // Create a function that copies the input columns and generates the extra column.
    Action<Iris, IrisWithOneExtraColumn> addCosineColumn = (input, output) =>
    {
        output.Label = input.Label;
        // Arguments are passed in the order the function declares them (width before length).
        output.Float1 = AngiospermCosine(input.PetalWidth, input.PetalLength, input.SepalWidth, input.SepalLength);
        output.PetalLength = input.PetalLength;
        output.PetalWidth = input.PetalWidth;
        output.SepalLength = input.SepalLength;
        output.SepalWidth = input.SepalWidth;
    };

    // Create a pipeline to execute the custom function.
    var pipeline = mlContext.Transforms.CustomMapping(addCosineColumn, null);

    // Transform the data.
    var transformedData = pipeline.Fit(data).Transform(data);

    // Verify that the column has the correct data.
    // Each row is consumed immediately, so reusing the row object is safe here.
    var transformedRows = mlContext.Data.CreateEnumerable<IrisWithOneExtraColumn>(transformedData, reuseRowObject: true);
    var rowsVerified = 0;
    foreach (var row in transformedRows)
    {
        var expectedCosine = AngiospermCosine(row.PetalWidth, row.PetalLength, row.SepalWidth, row.SepalLength);
        Assert.Equal(expectedCosine, row.Float1);
        rowsVerified++;
    }

    // Guard against a vacuous pass: the loop above proves nothing if no rows were produced.
    Assert.True(rowsVerified > 0, "Expected at least one transformed row to verify.");
}
75+
/// <summary>
/// Extensibility: Add multiple new columns.
/// </summary>
/// <remarks>
/// A custom mapping copies every Iris column and adds <c>Float1</c> and <c>Float2</c>, two
/// repeatable pseudo-random numbers seeded from the row's own values. The test then recomputes
/// both numbers from each transformed row and checks that they match.
/// </remarks>
[Fact]
public void ExtensibilityAddingTwoColumns()
{
    // Concurrency must be 1 to assure that the mapping is done sequentially.
    var mlContext = new MLContext(seed: 1, conc: 1);

    // Load the Iris dataset.
    var data = mlContext.Data.LoadFromTextFile<Iris>(
        GetDataPath(TestDatasets.iris.trainFilename),
        hasHeader: TestDatasets.iris.fileHasHeader,
        separatorChar: TestDatasets.iris.fileSeparator);

    // Subsample it down to the first 10 rows.
    int numSamples = 10;
    data = mlContext.Data.TakeRows(data, numSamples);

    // Create a function that copies the input columns and generates the two extra columns.
    // The leading 1 / 2 offsets give the two columns different seeds for the same row.
    Action<Iris, IrisWithTwoExtraColumns> addRandomColumns = (input, output) =>
    {
        output.Label = input.Label;
        output.Float1 = GetRandomNumber(1 + input.Label + input.PetalLength + input.PetalWidth + input.SepalLength + input.SepalWidth);
        output.Float2 = GetRandomNumber(2 + input.Label + input.PetalLength + input.PetalWidth + input.SepalLength + input.SepalWidth);
        output.PetalLength = input.PetalLength;
        output.PetalWidth = input.PetalWidth;
        output.SepalLength = input.SepalLength;
        output.SepalWidth = input.SepalWidth;
    };

    // Create a pipeline to execute the custom function.
    var pipeline = mlContext.Transforms.CustomMapping(addRandomColumns, null);

    // Transform the data.
    var transformedData = pipeline.Fit(data).Transform(data);

    // Verify that both columns have the correct data.
    // Each row is consumed immediately, so reusing the row object is safe here.
    var transformedRows = mlContext.Data.CreateEnumerable<IrisWithTwoExtraColumns>(transformedData, reuseRowObject: true);
    var rowsVerified = 0;
    foreach (var row in transformedRows)
    {
        var expectedFloat1 = GetRandomNumber(1 + row.Label + row.PetalLength + row.PetalWidth + row.SepalLength + row.SepalWidth);
        var expectedFloat2 = GetRandomNumber(2 + row.Label + row.PetalLength + row.PetalWidth + row.SepalLength + row.SepalWidth);
        Assert.Equal(expectedFloat1, row.Float1);
        Assert.Equal(expectedFloat2, row.Float2);
        rowsVerified++;
    }

    // Guard against a vacuous pass: the loop above proves nothing if no rows were produced.
    Assert.True(rowsVerified > 0, "Expected at least one transformed row to verify.");
}
123+
/// <summary>
/// Extensibility: Featurize text using custom word-grams, char-grams, and normalization.
/// </summary>
/// <remarks>
/// Featurizes the <c>SentimentText</c> column with both the char and word extractors enabled and
/// L1 vector normalization, trains a binary classifier on the result, and checks that the
/// evaluation metrics pass <c>Common.AssertMetrics</c>.
/// </remarks>
[Fact]
public void ExtensibilityModifyTextFeaturization()
{
    // Single-threaded context, as in the other tests of this class; the trainer below is
    // likewise limited to one thread.
    var mlContext = new MLContext(seed: 1, conc: 1);

    // Load the sentiment dataset.
    var data = mlContext.Data.LoadFromTextFile<TweetSentiment>(
        GetDataPath(TestDatasets.Sentiment.trainFilename),
        hasHeader: TestDatasets.Sentiment.fileHasHeader,
        separatorChar: TestDatasets.Sentiment.fileSeparator);

    // Text featurization options under test.
    // TODO #2802: Update FeaturizeText to allow specifications of word-grams and char-grams.
    var featurizationOptions = new TextFeaturizingEstimator.Options
    {
        UseCharExtractor = true,
        UseWordExtractor = true,
        VectorNormalizer = TextFeaturizingEstimator.TextNormKind.L1
    };

    // Create a training pipeline.
    var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", new string[] { "SentimentText" }, featurizationOptions)
        .AppendCacheCheckpoint(mlContext)
        .Append(mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscent(
            new SdcaBinaryTrainer.Options { NumThreads = 1 }));

    // Train the model.
    var model = pipeline.Fit(data);

    // Evaluate the model on the training data.
    var scoredData = model.Transform(data);
    var metrics = mlContext.BinaryClassification.Evaluate(scoredData);

    // Check that the metrics returned are valid.
    Common.AssertMetrics(metrics);
}
160+
/// <summary>
/// Extensibility: Apply a normalizer to columns in the dataset.
/// </summary>
/// <remarks>
/// Concatenates the Iris feature columns into <c>Features</c>, applies MinMax normalization, and
/// checks that every slot of every transformed row lies within [-1, 1].
/// </remarks>
[Fact]
public void ExtensibilityNormalizeColumns()
{
    // Single-threaded context, as in the other tests of this class.
    var mlContext = new MLContext(seed: 1, conc: 1);

    // Load the Iris dataset.
    var data = mlContext.Data.LoadFromTextFile<Iris>(
        GetDataPath(TestDatasets.iris.trainFilename),
        hasHeader: TestDatasets.iris.fileHasHeader,
        separatorChar: TestDatasets.iris.fileSeparator);

    // Compose the transformation.
    var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features)
        .Append(mlContext.Transforms.Normalize("Features", mode: NormalizingEstimator.NormalizerMode.MinMax));

    // Transform the data.
    var transformedData = pipeline.Fit(data).Transform(data);

    // Validate that the data was normalized to between -1 and 1.
    // Each row is consumed immediately, so reusing the row object is safe here.
    var normalizedRows = mlContext.Data.CreateEnumerable<FeatureColumn>(transformedData, reuseRowObject: true);
    var rowsVerified = 0;
    foreach (var row in normalizedRows)
    {
        // Verify per-slot normalization.
        for (int i = 0; i < row.Features.Length; i++)
        {
            Assert.InRange(row.Features[i], -1f, 1f);
        }

        rowsVerified++;
    }

    // Guard against a vacuous pass: the loop above proves nothing if no rows were produced.
    Assert.True(rowsVerified > 0, "Expected at least one normalized row to verify.");
}
190+
/// <summary>
/// Produces a repeatable pseudo-random number derived from <paramref name="number"/>.
/// </summary>
/// <param name="number">
/// Value the seed is derived from: it is multiplied by 10 and truncated to an integer.
/// </param>
/// <returns>
/// The first <see cref="Random.NextDouble"/> sample of a generator created with that seed,
/// narrowed to <see cref="float"/>.
/// </returns>
private static float GetRandomNumber(float number)
{
    // A freshly seeded generator per call is deliberate: the same input must always yield the
    // same output so callers can recompute and compare the value later.
    var seed = (int)(10 * number);
    return (float)new Random(seed).NextDouble();
}
197+ }
198+ }
0 commit comments