
Commit f961e37 (parent: ac998f7)

add more available xgboost learning params (#759)
3 files changed: +73 additions, -25 deletions

scripts/image_build.sh

Lines changed: 1 addition & 1 deletion
@@ -115,7 +115,7 @@ echo 'get_ipython().magic(u"%autoreload 2")' >> $IPYTHON_STARTUP/00-first.py
 curl https://raw.githubusercontent.com/sql-machine-learning/sqlflow/develop/example/jupyter/example.ipynb --output /workspace/example.ipynb
 
 # 9. install xgboost-launcher
-pip install xgboost-launcher==0.0.3
+pip install xgboost-launcher==0.0.4
 
 # 10. install Hadoop to use as the client when writing CSV to hive tables
 HADOOP_URL=https://archive.apache.org/dist/hadoop/common/stable/hadoop-${HADOOP_VERSION}.tar.gz

sql/codegen_xgboost.go

Lines changed: 38 additions & 14 deletions
@@ -47,19 +47,32 @@ type xgLearningFields struct {
 }
 
 type xgBoosterFields struct {
-    Objective        string  `json:"objective,omitempty"`
-    Booster          string  `json:"booster,omitempty"`
-    NumClass         uint    `json:"num_class,omitempty"`
-    MaxDepth         uint    `json:"max_depth,omitempty"`
-    Eta              float32 `json:"eta,omitempty"`
+    Objective        string  `json:"objective,omitempty"`
+    EvalMetric       string  `json:"eval_metric,omitempty"`
+    Booster          string  `json:"booster,omitempty"`
+    Seed             uint    `json:"seed,omitempty"`
+    NumClass         uint    `json:"num_class,omitempty"`
+    Eta              float32 `json:"eta,omitempty"`
+    Gamma            float32 `json:"gamma,omitempty"`
+    MaxDepth         uint    `json:"max_depth,omitempty"`
+    MinChildWeight   uint    `json:"min_child_weight,omitempty"`
+    Subsample        float32 `json:"subsample,omitempty"`
+    ColSampleByTree  float32 `json:"colsample_bytree,omitempty"`
+    ColSampleByLevel float32 `json:"colsample_bylevel,omitempty"`
+    ColSampleByNode  float32 `json:"colsample_bynode,omitempty"`
+    // `lambda` is a reserved keyword in Python, so we use the alias reg_lambda.
+    Lambda           float32 `json:"reg_lambda,omitempty"`
+    // We use the alias `reg_alpha` to keep it aligned with `reg_lambda`.
+    Alpha            float32 `json:"reg_alpha,omitempty"`
     TreeMethod       string  `json:"tree_method,omitempty"`
-    EvalMetric       string  `json:"eval_metric,omitempty"`
-    Subsample        float32 `json:"subsample,omitempty"`
-    ColSampleByTree  float32 `json:"colsample_bytree,omitempty"`
-    ColSampleByLevel float32 `json:"colsample_bylevel,omitempty"`
+    SketchEps        float32 `json:"sketch_eps,omitempty"`
+    ScalePosWeight   float32 `json:"scale_pos_weight,omitempty"`
+    GrowPolicy       string  `json:"grow_policy,omitempty"`
+    MaxLeaves        uint    `json:"max_leaves,omitempty"`
     MaxBin           uint    `json:"max_bin,omitempty"`
-    ConvergenceCriteria string `json:"convergence_criteria,omitempty"`
-    Verbosity           uint   `json:"verbosity,omitempty"`
+    NumParallelTree     uint   `json:"num_parallel_tree,omitempty"`
+    ConvergenceCriteria string `json:"convergence_criteria,omitempty"` // auto_train config
+    Verbosity           uint   `json:"verbosity,omitempty"`            // auto_train config
 }
 
 type xgColumnFields struct {
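
The struct above uses encoding/json tags to emit XGBoost's native parameter names, and `omitempty` so that zero-valued fields are simply dropped from the generated JSON. The sketch below is illustrative only and is not code from this repo: a trimmed-down struct with the same tag pattern, showing why `Lambda` and `Alpha` marshal as `reg_lambda` and `reg_alpha`, and why unset parameters disappear from the output.

package main

import (
    "encoding/json"
    "fmt"
)

// boosterParams is a hypothetical stand-in for xgBoosterFields;
// only a few fields are shown, with the same tag pattern.
type boosterParams struct {
    Objective string  `json:"objective,omitempty"`
    Eta       float32 `json:"eta,omitempty"`
    // `lambda` is a reserved keyword in Python, hence the reg_lambda alias.
    Lambda float32 `json:"reg_lambda,omitempty"`
    Alpha  float32 `json:"reg_alpha,omitempty"`
}

func main() {
    p := boosterParams{Objective: "binary:logistic", Eta: 0.03, Lambda: 0.001}
    b, _ := json.Marshal(p)
    // Alpha was never set, so omitempty drops reg_alpha from the output:
    // {"objective":"binary:logistic","eta":0.03,"reg_lambda":0.001}
    fmt.Println(string(b))
}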
@@ -248,16 +261,27 @@ func sListPartial(key string, ptrFn func(*xgboostFiller) *[]string) func(*map[st
 var xgbTrainAttrSetterMap = map[string]func(*map[string][]string, *xgboostFiller) error{
     // booster params
     "train.objective": strPartial("train.objective", func(r *xgboostFiller) *string { return &(r.Objective) }),
+    "train.eval_metric": strPartial("train.eval_metric", func(r *xgboostFiller) *string { return &(r.EvalMetric) }),
     "train.booster": strPartial("train.booster", func(r *xgboostFiller) *string { return &(r.Booster) }),
-    "train.max_depth": uIntPartial("train.max_depth", func(r *xgboostFiller) *uint { return &(r.MaxDepth) }),
+    "train.seed": uIntPartial("train.seed", func(r *xgboostFiller) *uint { return &(r.Seed) }),
     "train.num_class": uIntPartial("train.num_class", func(r *xgboostFiller) *uint { return &(r.NumClass) }),
     "train.eta": fp32Partial("train.eta", func(r *xgboostFiller) *float32 { return &(r.Eta) }),
-    "train.tree_method": strPartial("train.tree_method", func(r *xgboostFiller) *string { return &(r.TreeMethod) }),
-    "train.eval_metric": strPartial("train.eval_metric", func(r *xgboostFiller) *string { return &(r.EvalMetric) }),
+    "train.gamma": fp32Partial("train.gamma", func(r *xgboostFiller) *float32 { return &(r.Gamma) }),
+    "train.max_depth": uIntPartial("train.max_depth", func(r *xgboostFiller) *uint { return &(r.MaxDepth) }),
+    "train.min_child_weight": uIntPartial("train.min_child_weight", func(r *xgboostFiller) *uint { return &(r.MinChildWeight) }),
     "train.subsample": fp32Partial("train.subsample", func(r *xgboostFiller) *float32 { return &(r.Subsample) }),
     "train.colsample_bytree": fp32Partial("train.colsample_bytree", func(r *xgboostFiller) *float32 { return &(r.ColSampleByTree) }),
     "train.colsample_bylevel": fp32Partial("train.colsample_bylevel", func(r *xgboostFiller) *float32 { return &(r.ColSampleByLevel) }),
+    "train.colsample_bynode": fp32Partial("train.colsample_bynode", func(r *xgboostFiller) *float32 { return &(r.ColSampleByNode) }),
+    "train.lambda": fp32Partial("train.lambda", func(r *xgboostFiller) *float32 { return &(r.Lambda) }),
+    "train.alpha": fp32Partial("train.alpha", func(r *xgboostFiller) *float32 { return &(r.Alpha) }),
+    "train.tree_method": strPartial("train.tree_method", func(r *xgboostFiller) *string { return &(r.TreeMethod) }),
+    "train.sketch_eps": fp32Partial("train.sketch_eps", func(r *xgboostFiller) *float32 { return &(r.SketchEps) }),
+    "train.scale_pos_weight": fp32Partial("train.scale_pos_weight", func(r *xgboostFiller) *float32 { return &(r.ScalePosWeight) }),
+    "train.grow_policy": strPartial("train.grow_policy", func(r *xgboostFiller) *string { return &(r.GrowPolicy) }),
+    "train.max_leaves": uIntPartial("train.max_leaves", func(r *xgboostFiller) *uint { return &(r.MaxLeaves) }),
     "train.max_bin": uIntPartial("train.max_bin", func(r *xgboostFiller) *uint { return &(r.MaxBin) }),
+    "train.num_parallel_tree": uIntPartial("train.num_parallel_tree", func(r *xgboostFiller) *uint { return &(r.NumParallelTree) }),
     "train.convergence_criteria": strPartial("train.convergence_criteria", func(r *xgboostFiller) *string { return &(r.ConvergenceCriteria) }),
     "train.verbosity": uIntPartial("train.verbosity", func(r *xgboostFiller) *uint { return &(r.Verbosity) }),
     // xgboost train controllers
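
Each new `train.*` key routes a WITH attribute from the extended SQL into a typed field of the filler through a small closure, with strPartial/uIntPartial/fp32Partial doing the parsing and error handling (as the tests below exercise, for example when an attribute carries more than one value). The sketch below mimics that dispatch pattern with simplified, hypothetical helpers and types; it shows the general mechanism only, not the repo's actual signatures.

package main

import (
    "fmt"
    "strconv"
)

// filler is a hypothetical stand-in for xgboostFiller; only two fields are shown.
type filler struct {
    Gamma     float32
    MaxLeaves uint
}

// fp32Setter and uintSetter are simplified versions of the fp32Partial/uIntPartial
// idea: each parses one attribute value and writes it into the field the closure points at.
func fp32Setter(ptr func(*filler) *float32) func(string, *filler) error {
    return func(val string, f *filler) error {
        v, err := strconv.ParseFloat(val, 32)
        if err != nil {
            return err
        }
        *ptr(f) = float32(v)
        return nil
    }
}

func uintSetter(ptr func(*filler) *uint) func(string, *filler) error {
    return func(val string, f *filler) error {
        v, err := strconv.ParseUint(val, 10, 64)
        if err != nil {
            return err
        }
        *ptr(f) = uint(v)
        return nil
    }
}

func main() {
    // A miniature attribute-to-setter map in the spirit of xgbTrainAttrSetterMap.
    setters := map[string]func(string, *filler) error{
        "train.gamma":      fp32Setter(func(f *filler) *float32 { return &f.Gamma }),
        "train.max_leaves": uintSetter(func(f *filler) *uint { return &f.MaxLeaves }),
    }

    attrs := map[string]string{"train.gamma": "0.01", "train.max_leaves": "64"}
    f := &filler{}
    for k, v := range attrs {
        if set, ok := setters[k]; ok {
            if err := set(v, f); err != nil {
                fmt.Println("bad attribute", k, err)
            }
        }
    }
    fmt.Printf("%+v\n", f) // &{Gamma:0.01 MaxLeaves:64}
}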

sql/codegen_xgboost_test.go

Lines changed: 34 additions & 10 deletions
@@ -72,15 +72,15 @@ func TestPartials(t *testing.T) {
     e = part(&tmpMap, filler)
     a.Error(e)
     // Error: len(val) > 1
-    tmpMap["obj"] = []string{"binary:logistic", "reg:linear"}
+    tmpMap["obj"] = []string{"binary:logistic", "reg:squarederror"}
     e = part(&tmpMap, filler)
     a.Error(e)
-    // change objective to "reg:linear"
-    tmpMap["obj"] = []string{"reg:linear"}
+    // change objective to "reg:squarederror"
+    tmpMap["obj"] = []string{"reg:squarederror"}
     filler.Objective = ""
     e = part(&tmpMap, filler)
     a.NoError(e)
-    a.Equal(filler.Objective, "reg:linear")
+    a.Equal(filler.Objective, "reg:squarederror")
 
     // test uIntPartial
     part = uIntPartial("num_class", func(r *xgboostFiller) *uint { return &(r.NumClass) })
@@ -145,17 +145,29 @@ SELECT a, b, c, d, e FROM table_xx
 TRAIN xgboost.Estimator
 WITH
 train.objective = "binary:logistic",
-train.booster = gblinear,
+train.eval_metric = auc,
+train.booster = gbtree,
+train.seed = 1000,
 train.num_class = 2,
-train.max_depth = 5,
 train.eta = 0.03,
-train.tree_method = hist,
+train.gamma = 0.01,
+train.max_depth = 5,
+train.min_child_weight = 10,
 train.subsample = 0.8,
 train.colsample_bytree = 0.5,
 train.colsample_bylevel = 0.6,
+train.colsample_bynode = 0.4,
+train.lambda = 0.001,
+train.alpha = 0.01,
+train.tree_method = hist,
+train.sketch_eps = 0.03,
+train.scale_pos_weight = 1,
+train.grow_policy = lossguide,
+train.max_leaves = 64,
 train.max_bin = 128,
 train.verbosity = 3,
 train.num_round = 30,
+train.convergence_criteria = "10:200:0.8",
 train.auto_train = true
 COLUMN a, b, c, d
 LABEL e INTO table_123;
@@ -169,16 +181,28 @@ LABEL e INTO table_123;
     params, _ := mapData["params"]
     paramMap, _ := params.(map[string]interface{})
     assertEq(paramMap, "objective", "binary:logistic")
-    assertEq(paramMap, "booster", "gblinear")
+    assertEq(paramMap, "eval_metric", "auc")
+    assertEq(paramMap, "booster", "gbtree")
+    assertEq(paramMap, "seed", 1000)
     assertEq(paramMap, "num_class", 2)
-    assertEq(paramMap, "max_depth", 5)
     assertEq(paramMap, "eta", 0.03)
-    assertEq(paramMap, "tree_method", "hist")
+    assertEq(paramMap, "gamma", 0.01)
+    assertEq(paramMap, "max_depth", 5)
+    assertEq(paramMap, "min_child_weight", 10)
     assertEq(paramMap, "subsample", 0.8)
     assertEq(paramMap, "colsample_bytree", 0.5)
     assertEq(paramMap, "colsample_bylevel", 0.6)
+    assertEq(paramMap, "colsample_bynode", 0.4)
+    assertEq(paramMap, "reg_lambda", 0.001)
+    assertEq(paramMap, "reg_alpha", 0.01)
+    assertEq(paramMap, "tree_method", "hist")
+    assertEq(paramMap, "sketch_eps", 0.03)
+    assertEq(paramMap, "scale_pos_weight", 1)
+    assertEq(paramMap, "grow_policy", "lossguide")
+    assertEq(paramMap, "max_leaves", 64)
     assertEq(paramMap, "max_bin", 128)
     assertEq(paramMap, "verbosity", 3)
+    assertEq(paramMap, "convergence_criteria", "10:200:0.8")
     assertEq(mapData, "num_boost_round", 30)
     assertEq(mapData, "auto_train", true)
 