Skip to content

Commit 868a72f

Browse files
committed
Fixed the TextTransform bug where chargrams were being computed differently for different settings.
1 parent ef169b2 commit 868a72f

File tree

4 files changed

+97
-76
lines changed

4 files changed

+97
-76
lines changed

src/Microsoft.ML.Transforms/Text/CharTokenizeTransform.cs

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -405,34 +405,55 @@ private ValueGetter<VBuffer<ushort>> MakeGetterVec(IRow input, int iinfo)
405405
getSrc(ref src);
406406

407407
int len = 0;
408+
408409
for (int i = 0; i < src.Count; i++)
409410
{
410411
if (src.Values[i].HasChars)
411412
{
412413
len += src.Values[i].Length;
413-
if (_useMarkerChars)
414-
len += TextMarkersCount;
414+
415+
if (i > 0)
416+
len += 1; // add space character that will be added
415417
}
416418
}
417419

420+
if (_useMarkerChars)
421+
len += TextMarkersCount;
422+
418423
var values = dst.Values;
419424
if (len > 0)
420425
{
421426
if (Utils.Size(values) < len)
422427
values = new ushort[len];
423428

424429
int index = 0;
430+
431+
// VBuffer<DvText> can be a result of either concatenating text columns together
432+
// or application of word tokenizer before char tokenizer.
433+
//
434+
// Considering VBuffer<DvText> as a single text stream.
435+
// Therefore, prepend and append start and end markers only once i.e. at the start and at end of vector.
436+
// Insert spaces after every piece of text in the vector.
437+
if (_useMarkerChars)
438+
values[index++] = TextStartMarker;
439+
425440
for (int i = 0; i < src.Count; i++)
426441
{
427442
if (!src.Values[i].HasChars)
428443
continue;
429-
if (_useMarkerChars)
430-
values[index++] = TextStartMarker;
444+
445+
if (i > 0)
446+
values[index++] = ' ';
447+
431448
for (int ich = 0; ich < src.Values[i].Length; ich++)
449+
{
432450
values[index++] = src.Values[i][ich];
433-
if (_useMarkerChars)
434-
values[index++] = TextEndMarker;
451+
}
435452
}
453+
454+
if (_useMarkerChars)
455+
values[index++] = TextEndMarker;
456+
436457
Contracts.Assert(index == len);
437458
}
438459

test/Microsoft.ML.Predictor.Tests/TestAutoInference.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,7 @@ public void EntryPointPipelineSweepRoles()
352352
var trainAuc = bestPipeline.PerformanceSummary.TrainingMetricValue;
353353
var testAuc = bestPipeline.PerformanceSummary.MetricValue;
354354
Assert.True((0.94 < trainAuc) && (trainAuc < 0.95));
355-
Assert.True((0.83 < testAuc) && (testAuc < 0.84));
355+
Assert.True((0.815 < testAuc) && (testAuc < 0.825));
356356

357357
var results = runner.GetOutput<IDataView>("ResultsOut");
358358
Assert.NotNull(results);

test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs

Lines changed: 68 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -75,88 +75,88 @@ public void CrossValidateSentimentModelTest()
7575

7676
//Avergae of all folds.
7777
var metrics = cv.BinaryClassificationMetrics[0];
78-
Assert.Equal(0.57023626091422708, metrics.Accuracy, 4);
79-
Assert.Equal(0.54960689910161487, metrics.Auc, 1);
80-
Assert.Equal(0.67048277219704255, metrics.Auprc, 2);
78+
Assert.Equal(0.603235747303544, metrics.Accuracy, 4);
79+
Assert.Equal(0.58811318075483943, metrics.Auc, 4);
80+
Assert.Equal(0.70302385499183984, metrics.Auprc, 4);
8181
Assert.Equal(0, metrics.Entropy, 3);
82-
Assert.Equal(0.68942642723130532, metrics.F1Score, 4);
83-
Assert.Equal(0.97695909611968434, metrics.LogLoss, 3);
84-
Assert.Equal(-3.050726259114541, metrics.LogLossReduction, 3);
85-
Assert.Equal(0.37553879310344829, metrics.NegativePrecision, 3);
86-
Assert.Equal(0.25683962264150945, metrics.NegativeRecall, 3);
87-
Assert.Equal(0.63428539173628362, metrics.PositivePrecision, 3);
88-
Assert.Equal(0.75795196364816619, metrics.PositiveRecall);
82+
Assert.Equal(0.71751777634130576, metrics.F1Score, 4);
83+
Assert.Equal(0.95263103280238037, metrics.LogLoss, 4);
84+
Assert.Equal(-0.39971801589876232, metrics.LogLossReduction, 4);
85+
Assert.Equal(0.43965517241379309, metrics.NegativePrecision, 4);
86+
Assert.Equal(0.26627358490566039, metrics.NegativeRecall, 4);
87+
Assert.Equal(0.64937737441958632, metrics.PositivePrecision, 4);
88+
Assert.Equal(0.8027426160337553, metrics.PositiveRecall);
8989
Assert.Null(metrics.ConfusionMatrix);
9090

9191
//Std. Deviation.
9292
metrics = cv.BinaryClassificationMetrics[1];
93-
Assert.Equal(0.039933230611196011, metrics.Accuracy, 4);
94-
Assert.Equal(0.021066177821462407, metrics.Auc, 1);
95-
Assert.Equal(0.045842033921572725, metrics.Auprc, 2);
93+
Assert.Equal(0.057781201848998764, metrics.Accuracy, 4);
94+
Assert.Equal(0.04249579360413544, metrics.Auc, 4);
95+
Assert.Equal(0.086083866074815427, metrics.Auprc, 4);
9696
Assert.Equal(0, metrics.Entropy, 3);
97-
Assert.Equal(0.030085767890644915, metrics.F1Score, 4);
98-
Assert.Equal(0.032906777175141941, metrics.LogLoss, 3);
99-
Assert.Equal(0.86311349745170118, metrics.LogLossReduction, 3);
100-
Assert.Equal(0.030711206896551647, metrics.NegativePrecision, 3);
101-
Assert.Equal(0.068160377358490579, metrics.NegativeRecall, 3);
102-
Assert.Equal(0.051761119891622735, metrics.PositivePrecision, 3);
103-
Assert.Equal(0.0015417072379052127, metrics.PositiveRecall);
97+
Assert.Equal(0.04718810601163604, metrics.F1Score, 4);
98+
Assert.Equal(0.063839715206238851, metrics.LogLoss, 4);
99+
Assert.Equal(4.1937544629633878, metrics.LogLossReduction, 4);
100+
Assert.Equal(0.060344827586206781, metrics.NegativePrecision, 4);
101+
Assert.Equal(0.058726415094339748, metrics.NegativeRecall, 4);
102+
Assert.Equal(0.057144364710848418, metrics.PositivePrecision, 4);
103+
Assert.Equal(0.030590717299577637, metrics.PositiveRecall);
104104
Assert.Null(metrics.ConfusionMatrix);
105105

106106
//Fold 1.
107107
metrics = cv.BinaryClassificationMetrics[2];
108-
Assert.Equal(0.53030303030303028, metrics.Accuracy, 4);
109-
Assert.Equal(0.52854072128015284, metrics.Auc, 1);
110-
Assert.Equal(0.62464073827546951, metrics.Auprc, 2);
108+
Assert.Equal(0.54545454545454541, metrics.Accuracy, 4);
109+
Assert.Equal(0.54561738715070451, metrics.Auc, 4);
110+
Assert.Equal(0.61693998891702417, metrics.Auprc, 4);
111111
Assert.Equal(0, metrics.Entropy, 3);
112-
Assert.Equal(0.65934065934065933, metrics.F1Score, 4);
113-
Assert.Equal(1.0098658732948276, metrics.LogLoss, 3);
114-
Assert.Equal(-3.9138397565662424, metrics.LogLossReduction, 3);
115-
Assert.Equal(0.34482758620689657, metrics.NegativePrecision, 3);
116-
Assert.Equal(0.18867924528301888, metrics.NegativeRecall, 3);
117-
Assert.Equal(0.58252427184466016, metrics.PositivePrecision, 3);
118-
Assert.Equal(0.759493670886076, metrics.PositiveRecall);
112+
Assert.Equal(0.67032967032967028, metrics.F1Score, 4);
113+
Assert.Equal(1.0164707480086188, metrics.LogLoss, 4);
114+
Assert.Equal(-4.59347247886215, metrics.LogLossReduction, 4);
115+
Assert.Equal(0.37931034482758619, metrics.NegativePrecision, 4);
116+
Assert.Equal(0.20754716981132076, metrics.NegativeRecall, 4);
117+
Assert.Equal(0.59223300970873782, metrics.PositivePrecision, 4);
118+
Assert.Equal(0.77215189873417722, metrics.PositiveRecall);
119119

120120
var matrix = metrics.ConfusionMatrix;
121121
Assert.Equal(2, matrix.Order);
122122
Assert.Equal(2, matrix.ClassNames.Count);
123123
Assert.Equal("positive", matrix.ClassNames[0]);
124124
Assert.Equal("negative", matrix.ClassNames[1]);
125125

126-
Assert.Equal(60, matrix[0, 0]);
127-
Assert.Equal(60, matrix["positive", "positive"]);
128-
Assert.Equal(19, matrix[0, 1]);
129-
Assert.Equal(19, matrix["positive", "negative"]);
126+
Assert.Equal(61, matrix[0, 0]);
127+
Assert.Equal(61, matrix["positive", "positive"]);
128+
Assert.Equal(18, matrix[0, 1]);
129+
Assert.Equal(18, matrix["positive", "negative"]);
130130

131-
Assert.Equal(43, matrix[1, 0]);
132-
Assert.Equal(43, matrix["negative", "positive"]);
133-
Assert.Equal(10, matrix[1, 1]);
134-
Assert.Equal(10, matrix["negative", "negative"]);
131+
Assert.Equal(42, matrix[1, 0]);
132+
Assert.Equal(42, matrix["negative", "positive"]);
133+
Assert.Equal(11, matrix[1, 1]);
134+
Assert.Equal(11, matrix["negative", "negative"]);
135135

136136
//Fold 2.
137137
metrics = cv.BinaryClassificationMetrics[3];
138-
Assert.Equal(0.61016949152542377, metrics.Accuracy, 4);
139-
Assert.Equal(0.57067307692307689, metrics.Auc, 1);
140-
Assert.Equal(0.71632480611861549, metrics.Auprc, 2);
138+
Assert.Equal(0.66101694915254239, metrics.Accuracy, 4);
139+
Assert.Equal(0.63060897435897434, metrics.Auc, 4);
140+
Assert.Equal(0.7891077210666555, metrics.Auprc, 4);
141141
Assert.Equal(0, metrics.Entropy, 3);
142-
Assert.Equal(0.71951219512195119, metrics.F1Score, 4);
143-
Assert.Equal(0.94405231894454111, metrics.LogLoss, 3);
144-
Assert.Equal(-2.1876127616628396, metrics.LogLossReduction, 3);
145-
Assert.Equal(0.40625, metrics.NegativePrecision, 3);
142+
Assert.Equal(0.76470588235294124, metrics.F1Score, 4);
143+
Assert.Equal(0.88879131759614194, metrics.LogLoss, 4);
144+
Assert.Equal(3.7940364470646255, metrics.LogLossReduction, 4);
145+
Assert.Equal(0.5, metrics.NegativePrecision, 3);
146146
Assert.Equal(0.325, metrics.NegativeRecall, 3);
147-
Assert.Equal(0.686046511627907, metrics.PositivePrecision, 3);
148-
Assert.Equal(0.75641025641025639, metrics.PositiveRecall);
147+
Assert.Equal(0.70652173913043481, metrics.PositivePrecision, 4);
148+
Assert.Equal(0.83333333333333337, metrics.PositiveRecall);
149149

150150
matrix = metrics.ConfusionMatrix;
151151
Assert.Equal(2, matrix.Order);
152152
Assert.Equal(2, matrix.ClassNames.Count);
153153
Assert.Equal("positive", matrix.ClassNames[0]);
154154
Assert.Equal("negative", matrix.ClassNames[1]);
155155

156-
Assert.Equal(59, matrix[0, 0]);
157-
Assert.Equal(59, matrix["positive", "positive"]);
158-
Assert.Equal(19, matrix[0, 1]);
159-
Assert.Equal(19, matrix["positive", "negative"]);
156+
Assert.Equal(65, matrix[0, 0]);
157+
Assert.Equal(65, matrix["positive", "positive"]);
158+
Assert.Equal(13, matrix[0, 1]);
159+
Assert.Equal(13, matrix["positive", "negative"]);
160160

161161
Assert.Equal(27, matrix[1, 0]);
162162
Assert.Equal(27, matrix["negative", "positive"]);
@@ -180,11 +180,11 @@ private void ValidateBinaryMetricsLightGBM(BinaryClassificationMetrics metrics)
180180

181181
Assert.Equal(.6111, metrics.Accuracy, 4);
182182
Assert.Equal(.8, metrics.Auc, 1);
183-
Assert.Equal(.85, metrics.Auprc, 2);
183+
Assert.Equal(0.88, metrics.Auprc, 2);
184184
Assert.Equal(1, metrics.Entropy, 3);
185185
Assert.Equal(.72, metrics.F1Score, 4);
186-
Assert.Equal(.952, metrics.LogLoss, 3);
187-
Assert.Equal(4.777, metrics.LogLossReduction, 3);
186+
Assert.Equal(0.96456100297125325, metrics.LogLoss, 4);
187+
Assert.Equal(3.5438997028746755, metrics.LogLossReduction, 4);
188188
Assert.Equal(1, metrics.NegativePrecision, 3);
189189
Assert.Equal(.222, metrics.NegativeRecall, 3);
190190
Assert.Equal(.562, metrics.PositivePrecision, 3);
@@ -211,16 +211,16 @@ private void ValidateBinaryMetricsLightGBM(BinaryClassificationMetrics metrics)
211211
private void ValidateBinaryMetrics(BinaryClassificationMetrics metrics)
212212
{
213213

214-
Assert.Equal(.5556, metrics.Accuracy, 4);
215-
Assert.Equal(.8, metrics.Auc, 1);
216-
Assert.Equal(.87, metrics.Auprc, 2);
214+
Assert.Equal(0.6111, metrics.Accuracy, 4);
215+
Assert.Equal(0.6667, metrics.Auc, 4);
216+
Assert.Equal(0.8621, metrics.Auprc, 4);
217217
Assert.Equal(1, metrics.Entropy, 3);
218-
Assert.Equal(.6923, metrics.F1Score, 4);
219-
Assert.Equal(.969, metrics.LogLoss, 3);
220-
Assert.Equal(3.083, metrics.LogLossReduction, 3);
221-
Assert.Equal(1, metrics.NegativePrecision, 3);
222-
Assert.Equal(.111, metrics.NegativeRecall, 3);
223-
Assert.Equal(.529, metrics.PositivePrecision, 3);
218+
Assert.Equal(0.72, metrics.F1Score, 2);
219+
Assert.Equal(0.9689, metrics.LogLoss, 4);
220+
Assert.Equal(3.1122, metrics.LogLossReduction, 4);
221+
Assert.Equal(1, metrics.NegativePrecision, 1);
222+
Assert.Equal(0.2222, metrics.NegativeRecall, 4);
223+
Assert.Equal(0.5625, metrics.PositivePrecision, 4);
224224
Assert.Equal(1, metrics.PositiveRecall);
225225

226226
var matrix = metrics.ConfusionMatrix;
@@ -234,10 +234,10 @@ private void ValidateBinaryMetrics(BinaryClassificationMetrics metrics)
234234
Assert.Equal(0, matrix[0, 1]);
235235
Assert.Equal(0, matrix["positive", "negative"]);
236236

237-
Assert.Equal(8, matrix[1, 0]);
238-
Assert.Equal(8, matrix["negative", "positive"]);
239-
Assert.Equal(1, matrix[1, 1]);
240-
Assert.Equal(1, matrix["negative", "negative"]);
237+
Assert.Equal(7, matrix[1, 0]);
238+
Assert.Equal(7, matrix["negative", "positive"]);
239+
Assert.Equal(2, matrix[1, 1]);
240+
Assert.Equal(2, matrix["negative", "negative"]);
241241
}
242242

243243
private LearningPipeline PreparePipeline()
@@ -344,7 +344,7 @@ private void ValidateExamples(PredictionModel<SentimentData, SentimentPrediction
344344
var predictions = model.Predict(sentiments);
345345
Assert.Equal(2, predictions.Count());
346346

347-
Assert.True(predictions.ElementAt(0).Sentiment.IsFalse);
347+
Assert.True(predictions.ElementAt(0).Sentiment.IsTrue);
348348
Assert.True(predictions.ElementAt(1).Sentiment.IsTrue);
349349

350350
}

test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/SentimentPredictionTests.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ public void TrainAndPredictSentimentModelWithDirectionInstantiationTest()
9292
var sentiments = GetTestData();
9393
var predictions = model.Predict(sentiments, false);
9494
Assert.Equal(2, predictions.Count());
95-
Assert.True(predictions.ElementAt(0).Sentiment.IsFalse);
95+
Assert.True(predictions.ElementAt(0).Sentiment.IsTrue);
9696
Assert.True(predictions.ElementAt(1).Sentiment.IsTrue);
9797

9898
// Get feature importance based on feature gain during training

0 commit comments

Comments
 (0)