Skip to content

Commit 9d2b198

Browse files
authored
Updated handling of missing values with LightGBM, and added ability to use (0) as missing value (#4695)
* Update LightGbmTrainerBase.cs
* Update LightGbmTrainerBase.cs
* Added UseZeroAsMissingValue as a modifiable LightGBM flag
* Update core_manifest.json
* Updated baseline files for LightGBMClassificationTest()
* Updated baseline files for GossLightGBMTest()
* Updated baseline files for DartLightGBMTest()
* Revert "Updated baseline files for DartLightGBMTest()" This reverts commit 4397725.
* Revert "Updated baseline files for GossLightGBMTest()" This reverts commit 06754f5.
* Revert "Updated baseline files for LightGBMClassificationTest()" This reverts commit c72ed38.
* Update TestPredictors.cs
* Revert "Revert "Updated baseline files for LightGBMClassificationTest()"" This reverts commit 9c426e1.
* Revert "Revert "Updated baseline files for GossLightGBMTest()"" This reverts commit 046f5b0.
* Revert "Revert "Updated baseline files for DartLightGBMTest()"" This reverts commit 3f082e0.
* Updated test datasets and LightGbm flag shortnames
* Added test to confirm run-time behavior of LightGBM doesn't change
* Update core_manifest.json
* Added correct baseline for LightGBMPreviousModelBaselineTest()
* Added previously trained model at accessible location for all builds
* Update used library for NetFx builds
* Updated LightGBMPreviousModelBaselineTest
* Update LightGBMPreviousModelBaselineTest
1 parent 1e22d1d commit 9d2b198

24 files changed

+4459
-3591
lines changed

src/Microsoft.ML.AutoML/TrainerExtensions/SweepableParams.cs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -105,6 +105,7 @@ public static IEnumerable<SweepableParam> BuildLightGbmParams()
105105
new SweepableDiscreteParam("MinimumExampleCountPerLeaf", new object[] { 1, 10, 20, 50 }),
106106
new SweepableDiscreteParam("UseCategoricalSplit", new object[] { true, false }),
107107
new SweepableDiscreteParam("HandleMissingValue", new object[] { true, false }),
108+
new SweepableDiscreteParam("UseZeroAsMissingValue", new object[] { true, false }),
108109
new SweepableDiscreteParam("MinimumExampleCountPerGroup", new object[] { 10, 50, 100, 200 }),
109110
new SweepableDiscreteParam("MaximumCategoricalSplitPointCount", new object[] { 8, 16, 32, 64 }),
110111
new SweepableDiscreteParam("CategoricalSmoothing", new object[] { 1, 10, 20 }),

src/Microsoft.ML.LightGbm/LightGbmTrainerBase.cs

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,8 @@ public class OptionsBase : TrainerInputBaseWithGroupId
5555
{nameof(MaximumCategoricalSplitPointCount), "max_cat_threshold" },
5656
{nameof(CategoricalSmoothing), "cat_smooth" },
5757
{nameof(L2CategoricalRegularization), "cat_l2" },
58-
{nameof(HandleMissingValue), "use_missing" }
58+
{nameof(HandleMissingValue), "use_missing" },
59+
{nameof(UseZeroAsMissingValue), "zero_as_missing" }
5960
};
6061

6162
private protected string GetOptionName(string name)
@@ -174,10 +175,17 @@ private protected OptionsBase() { }
174175
/// <summary>
175176
/// Whether to enable special handling of missing value or not.
176177
/// </summary>
177-
[Argument(ArgumentType.AtMostOnce, HelpText = "Enable special handling of missing value or not.")]
178+
[Argument(ArgumentType.AtMostOnce, HelpText = "Enable special handling of missing value or not.", ShortName = "hmv")]
178179
[TlcModule.SweepableDiscreteParam("UseMissing", new object[] { true, false })]
179180
public bool HandleMissingValue = true;
180181

182+
/// <summary>
183+
/// Whether to enable the usage of zero (0) as missing value.
184+
/// </summary>
185+
[Argument(ArgumentType.AtMostOnce, HelpText = "Enable usage of zero (0) as missing value.", ShortName = "uzam")]
186+
[TlcModule.SweepableDiscreteParam("UseZeroAsMissing", new object[] { true, false })]
187+
public bool UseZeroAsMissingValue = false;
188+
181189
/// <summary>
182190
/// The minimum number of data points per categorical group.
183191
/// </summary>
@@ -259,6 +267,7 @@ internal virtual Dictionary<string, object> ToDictionary(IHost host)
259267

260268
res[GetOptionName(nameof(MaximumBinCountPerFeature))] = MaximumBinCountPerFeature;
261269
res[GetOptionName(nameof(HandleMissingValue))] = HandleMissingValue;
270+
res[GetOptionName(nameof(UseZeroAsMissingValue))] = UseZeroAsMissingValue;
262271
res[GetOptionName(nameof(MinimumExampleCountPerGroup))] = MinimumExampleCountPerGroup;
263272
res[GetOptionName(nameof(MaximumCategoricalSplitPointCount))] = MaximumCategoricalSplitPointCount;
264273
res[GetOptionName(nameof(CategoricalSmoothing))] = CategoricalSmoothing;
@@ -436,7 +445,7 @@ private protected virtual void GetDefaultParameters(IChannel ch, int numRow, boo
436445

437446
private FloatLabelCursor.Factory CreateCursorFactory(RoleMappedData data)
438447
{
439-
var loadFlags = CursOpt.AllLabels | CursOpt.Features;
448+
var loadFlags = CursOpt.AllLabels | CursOpt.AllFeatures;
440449
if (PredictionKind == PredictionKind.Ranking)
441450
loadFlags |= CursOpt.Group;
442451

test/BaselineOutput/Common/EntryPoints/core_manifest.json

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11636,6 +11636,9 @@
1163611636
"Name": "HandleMissingValue",
1163711637
"Type": "Bool",
1163811638
"Desc": "Enable special handling of missing value or not.",
11639+
"Aliases": [
11640+
"hmv"
11641+
],
1163911642
"Required": false,
1164011643
"SortOrder": 150.0,
1164111644
"IsNullable": false,
@@ -11648,6 +11651,25 @@
1164811651
]
1164911652
}
1165011653
},
11654+
{
11655+
"Name": "UseZeroAsMissingValue",
11656+
"Type": "Bool",
11657+
"Desc": "Enable usage of zero (0) as missing value.",
11658+
"Aliases": [
11659+
"uzam"
11660+
],
11661+
"Required": false,
11662+
"SortOrder": 150.0,
11663+
"IsNullable": false,
11664+
"Default": false,
11665+
"SweepRange": {
11666+
"RangeType": "Discrete",
11667+
"Values": [
11668+
true,
11669+
false
11670+
]
11671+
}
11672+
},
1165111673
{
1165211674
"Name": "MinimumExampleCountPerGroup",
1165311675
"Type": "Int",
@@ -12133,6 +12155,9 @@
1213312155
"Name": "HandleMissingValue",
1213412156
"Type": "Bool",
1213512157
"Desc": "Enable special handling of missing value or not.",
12158+
"Aliases": [
12159+
"hmv"
12160+
],
1213612161
"Required": false,
1213712162
"SortOrder": 150.0,
1213812163
"IsNullable": false,
@@ -12145,6 +12170,25 @@
1214512170
]
1214612171
}
1214712172
},
12173+
{
12174+
"Name": "UseZeroAsMissingValue",
12175+
"Type": "Bool",
12176+
"Desc": "Enable usage of zero (0) as missing value.",
12177+
"Aliases": [
12178+
"uzam"
12179+
],
12180+
"Required": false,
12181+
"SortOrder": 150.0,
12182+
"IsNullable": false,
12183+
"Default": false,
12184+
"SweepRange": {
12185+
"RangeType": "Discrete",
12186+
"Values": [
12187+
true,
12188+
false
12189+
]
12190+
}
12191+
},
1214812192
{
1214912193
"Name": "MinimumExampleCountPerGroup",
1215012194
"Type": "Int",
@@ -12630,6 +12674,9 @@
1263012674
"Name": "HandleMissingValue",
1263112675
"Type": "Bool",
1263212676
"Desc": "Enable special handling of missing value or not.",
12677+
"Aliases": [
12678+
"hmv"
12679+
],
1263312680
"Required": false,
1263412681
"SortOrder": 150.0,
1263512682
"IsNullable": false,
@@ -12642,6 +12689,25 @@
1264212689
]
1264312690
}
1264412691
},
12692+
{
12693+
"Name": "UseZeroAsMissingValue",
12694+
"Type": "Bool",
12695+
"Desc": "Enable usage of zero (0) as missing value.",
12696+
"Aliases": [
12697+
"uzam"
12698+
],
12699+
"Required": false,
12700+
"SortOrder": 150.0,
12701+
"IsNullable": false,
12702+
"Default": false,
12703+
"SweepRange": {
12704+
"RangeType": "Discrete",
12705+
"Values": [
12706+
true,
12707+
false
12708+
]
12709+
}
12710+
},
1264512711
{
1264612712
"Name": "MinimumExampleCountPerGroup",
1264712713
"Type": "Int",
@@ -13088,6 +13154,9 @@
1308813154
"Name": "HandleMissingValue",
1308913155
"Type": "Bool",
1309013156
"Desc": "Enable special handling of missing value or not.",
13157+
"Aliases": [
13158+
"hmv"
13159+
],
1309113160
"Required": false,
1309213161
"SortOrder": 150.0,
1309313162
"IsNullable": false,
@@ -13100,6 +13169,25 @@
1310013169
]
1310113170
}
1310213171
},
13172+
{
13173+
"Name": "UseZeroAsMissingValue",
13174+
"Type": "Bool",
13175+
"Desc": "Enable usage of zero (0) as missing value.",
13176+
"Aliases": [
13177+
"uzam"
13178+
],
13179+
"Required": false,
13180+
"SortOrder": 150.0,
13181+
"IsNullable": false,
13182+
"Default": false,
13183+
"SweepRange": {
13184+
"RangeType": "Discrete",
13185+
"Values": [
13186+
true,
13187+
false
13188+
]
13189+
}
13190+
},
1310313191
{
1310413192
"Name": "MinimumExampleCountPerGroup",
1310513193
"Type": "Int",

test/BaselineOutput/Common/LightGBM/LightGBMDart-CV-breast-cancer.dart-out.txt

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -12,42 +12,42 @@ Confusion table
1212
||======================
1313
PREDICTED || positive | negative | Recall
1414
TRUTH ||======================
15-
positive || 128 | 6 | 0.9552
16-
negative || 10 | 218 | 0.9561
15+
positive || 124 | 10 | 0.9254
16+
negative || 9 | 219 | 0.9605
1717
||======================
18-
Precision || 0.9275 | 0.9732 |
19-
OVERALL 0/1 ACCURACY: 0.955801
20-
LOG LOSS/instance: 0.301908
18+
Precision || 0.9323 | 0.9563 |
19+
OVERALL 0/1 ACCURACY: 0.947514
20+
LOG LOSS/instance: 0.303740
2121
Test-set entropy (prior Log-Loss/instance): 0.950799
22-
LOG-LOSS REDUCTION (RIG): 0.682470
23-
AUC: 0.982473
22+
LOG-LOSS REDUCTION (RIG): 0.680543
23+
AUC: 0.984944
2424
TEST POSITIVE RATIO: 0.3175 (107.0/(107.0+230.0))
2525
Confusion table
2626
||======================
2727
PREDICTED || positive | negative | Recall
2828
TRUTH ||======================
29-
positive || 100 | 7 | 0.9346
30-
negative || 8 | 222 | 0.9652
29+
positive || 97 | 10 | 0.9065
30+
negative || 10 | 220 | 0.9565
3131
||======================
32-
Precision || 0.9259 | 0.9694 |
33-
OVERALL 0/1 ACCURACY: 0.955490
34-
LOG LOSS/instance: 0.290926
32+
Precision || 0.9065 | 0.9565 |
33+
OVERALL 0/1 ACCURACY: 0.940653
34+
LOG LOSS/instance: 0.297583
3535
Test-set entropy (prior Log-Loss/instance): 0.901650
36-
LOG-LOSS REDUCTION (RIG): 0.677340
37-
AUC: 0.992076
36+
LOG-LOSS REDUCTION (RIG): 0.669957
37+
AUC: 0.991833
3838

3939
OVERALL RESULTS
4040
---------------------------------------
41-
AUC: 0.987274 (0.0048)
42-
Accuracy: 0.955645 (0.0002)
43-
Positive precision: 0.926731 (0.0008)
44-
Positive recall: 0.944902 (0.0103)
45-
Negative precision: 0.971323 (0.0019)
46-
Negative recall: 0.960679 (0.0045)
47-
Log-loss: 0.296417 (0.0055)
48-
Log-loss reduction: 0.679905 (0.0026)
49-
F1 Score: 0.935705 (0.0055)
50-
AUPRC: 0.969894 (0.0121)
41+
AUC: 0.988388 (0.0034)
42+
Accuracy: 0.944083 (0.0034)
43+
Positive precision: 0.919436 (0.0129)
44+
Positive recall: 0.915958 (0.0094)
45+
Negative precision: 0.956427 (0.0001)
46+
Negative recall: 0.958524 (0.0020)
47+
Log-loss: 0.300661 (0.0031)
48+
Log-loss reduction: 0.675250 (0.0053)
49+
F1 Score: 0.917691 (0.0111)
50+
AUPRC: 0.972137 (0.0107)
5151

5252
---------------------------------------
5353
Physical memory usage(MB): %Number%
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
LightGBM
22
AUC Accuracy Positive precision Positive recall Negative precision Negative recall Log-loss Log-loss reduction F1 Score AUPRC /iter /lr /nl /mil /booster /nt Learner Name Train Dataset Test Dataset Results File Run Time Physical Memory Virtual Memory Command Line Settings
3-
0.987274 0.955645 0.926731 0.944902 0.971323 0.960679 0.296417 0.679905 0.935705 0.969894 10 0.2 20 10 dart 1 LightGBM %Data% %Output% 99 0 0 maml.exe CV tr=LightGBM{nt=1 iter=10 booster=dart lr=0.2 mil=10 nl=20} threads=- cache=- dout=%Output% loader=Text{sparse- col=Attr:TX:6 col=Label:0 col=Features:1-5,6,7-9} data=%Data% seed=1 /iter:10;/lr:0.2;/nl:20;/mil:10;/booster:dart;/nt:1
3+
0.988388 0.944083 0.919436 0.915958 0.956427 0.958524 0.300661 0.67525 0.917691 0.972137 10 0.2 20 10 dart 1 LightGBM %Data% %Output% 99 0 0 maml.exe CV tr=LightGBM{nt=1 iter=10 booster=dart lr=0.2 mil=10 nl=20} threads=- cache=- dout=%Output% loader=Text{sparse- col=Attr:TX:6 col=Label:0 col=Features:1-5,6,7-9} data=%Data% seed=1 /iter:10;/lr:0.2;/nl:20;/mil:10;/booster:dart;/nt:1
44

0 commit comments

Comments
 (0)