From d1724c282bb03fc757571dfbdd6116ff472b1114 Mon Sep 17 00:00:00 2001 From: sperlingxx Date: Mon, 2 Sep 2019 16:20:26 +0800 Subject: [PATCH] add more available xgboost learning params --- scripts/image_build.sh | 2 +- sql/codegen_xgboost.go | 52 +++++++++++++++++++++++++++---------- sql/codegen_xgboost_test.go | 44 ++++++++++++++++++++++++------- 3 files changed, 73 insertions(+), 25 deletions(-) diff --git a/scripts/image_build.sh b/scripts/image_build.sh index f1830621fc..88f829d6db 100644 --- a/scripts/image_build.sh +++ b/scripts/image_build.sh @@ -115,7 +115,7 @@ echo 'get_ipython().magic(u"%autoreload 2")' >> $IPYTHON_STARTUP/00-first.py curl https://raw.githubusercontent.com/sql-machine-learning/sqlflow/develop/example/jupyter/example.ipynb --output /workspace/example.ipynb # 9. install xgboost-launcher -pip install xgboost-launcher==0.0.3 +pip install xgboost-launcher==0.0.4 # 10. install Hadoop to use as the client when writing CSV to hive tables HADOOP_URL=https://archive.apache.org/dist/hadoop/common/stable/hadoop-${HADOOP_VERSION}.tar.gz diff --git a/sql/codegen_xgboost.go b/sql/codegen_xgboost.go index 72a6e7a794..48fe2033e3 100644 --- a/sql/codegen_xgboost.go +++ b/sql/codegen_xgboost.go @@ -47,19 +47,32 @@ type xgLearningFields struct { } type xgBoosterFields struct { - Objective string `json:"objective,omitempty"` - Booster string `json:"booster,omitempty"` - NumClass uint `json:"num_class,omitempty"` - MaxDepth uint `json:"max_depth,omitempty"` - Eta float32 `json:"eta,omitempty"` + Objective string `json:"objective,omitempty"` + EvalMetric string `json:"eval_metric,omitempty"` + Booster string `json:"booster,omitempty"` + Seed uint `json:"seed,omitempty"` + NumClass uint `json:"num_class,omitempty"` + Eta float32 `json:"eta,omitempty"` + Gamma float32 `json:"gamma,omitempty"` + MaxDepth uint `json:"max_depth,omitempty"` + MinChildWeight uint `json:"min_child_weight,omitempty"` + Subsample float32 `json:"subsample,omitempty"` + ColSampleByTree 
float32 `json:"colsample_bytree,omitempty"` + ColSampleByLevel float32 `json:"colsample_bylevel,omitempty"` + ColSampleByNode float32 `json:"colsample_bynode,omitempty"` + // `Lambda` is reserved in python, so we use alias reg_lambda. + Lambda float32 `json:"reg_lambda,omitempty"` + // We use alias `reg_alpha` to keep aligned with `reg_lambda`. + Alpha float32 `json:"reg_alpha,omitempty"` TreeMethod string `json:"tree_method,omitempty"` - EvalMetric string `json:"eval_metric,omitempty"` - Subsample float32 `json:"subsample,omitempty"` - ColSampleByTree float32 `json:"colsample_bytree,omitempty"` - ColSampleByLevel float32 `json:"colsample_bylevel,omitempty"` + SketchEps float32 `json:"sketch_eps,omitempty"` + ScalePosWeight float32 `json:"scale_pos_weight,omitempty"` + GrowPolicy string `json:"grow_policy,omitempty"` + MaxLeaves uint `json:"max_leaves,omitempty"` MaxBin uint `json:"max_bin,omitempty"` - ConvergenceCriteria string `json:"convergence_criteria,omitempty"` - Verbosity uint `json:"verbosity,omitempty"` + NumParallelTree uint `json:"num_parallel_tree,omitempty"` + ConvergenceCriteria string `json:"convergence_criteria,omitempty"` // auto_train config + Verbosity uint `json:"verbosity,omitempty"` // auto_train config } type xgColumnFields struct { @@ -248,16 +261,27 @@ func sListPartial(key string, ptrFn func(*xgboostFiller) *[]string) func(*map[st var xgbTrainAttrSetterMap = map[string]func(*map[string][]string, *xgboostFiller) error{ // booster params "train.objective": strPartial("train.objective", func(r *xgboostFiller) *string { return &(r.Objective) }), + "train.eval_metric": strPartial("train.eval_metric", func(r *xgboostFiller) *string { return &(r.EvalMetric) }), "train.booster": strPartial("train.booster", func(r *xgboostFiller) *string { return &(r.Booster) }), - "train.max_depth": uIntPartial("train.max_depth", func(r *xgboostFiller) *uint { return &(r.MaxDepth) }), + "train.seed": uIntPartial("train.seed", func(r *xgboostFiller) *uint { return 
&(r.Seed) }), "train.num_class": uIntPartial("train.num_class", func(r *xgboostFiller) *uint { return &(r.NumClass) }), "train.eta": fp32Partial("train.eta", func(r *xgboostFiller) *float32 { return &(r.Eta) }), - "train.tree_method": strPartial("train.tree_method", func(r *xgboostFiller) *string { return &(r.TreeMethod) }), - "train.eval_metric": strPartial("train.eval_metric", func(r *xgboostFiller) *string { return &(r.EvalMetric) }), + "train.gamma": fp32Partial("train.gamma", func(r *xgboostFiller) *float32 { return &(r.Gamma) }), + "train.max_depth": uIntPartial("train.max_depth", func(r *xgboostFiller) *uint { return &(r.MaxDepth) }), + "train.min_child_weight": uIntPartial("train.min_child_weight", func(r *xgboostFiller) *uint { return &(r.MinChildWeight) }), "train.subsample": fp32Partial("train.subsample", func(r *xgboostFiller) *float32 { return &(r.Subsample) }), "train.colsample_bytree": fp32Partial("train.colsample_bytree", func(r *xgboostFiller) *float32 { return &(r.ColSampleByTree) }), "train.colsample_bylevel": fp32Partial("train.colsample_bylevel", func(r *xgboostFiller) *float32 { return &(r.ColSampleByLevel) }), + "train.colsample_bynode": fp32Partial("train.colsample_bynode", func(r *xgboostFiller) *float32 { return &(r.ColSampleByNode) }), + "train.lambda": fp32Partial("train.lambda", func(r *xgboostFiller) *float32 { return &(r.Lambda) }), + "train.alpha": fp32Partial("train.alpha", func(r *xgboostFiller) *float32 { return &(r.Alpha) }), + "train.tree_method": strPartial("train.tree_method", func(r *xgboostFiller) *string { return &(r.TreeMethod) }), + "train.sketch_eps": fp32Partial("train.sketch_eps", func(r *xgboostFiller) *float32 { return &(r.SketchEps) }), + "train.scale_pos_weight": fp32Partial("train.scale_pos_weight", func(r *xgboostFiller) *float32 { return &(r.ScalePosWeight) }), + "train.grow_policy": strPartial("train.grow_policy", func(r *xgboostFiller) *string { return &(r.GrowPolicy) }), + "train.max_leaves": 
uIntPartial("train.max_leaves", func(r *xgboostFiller) *uint { return &(r.MaxLeaves) }), "train.max_bin": uIntPartial("train.max_bin", func(r *xgboostFiller) *uint { return &(r.MaxBin) }), + "train.num_parallel_tree": uIntPartial("train.num_parallel_tree", func(r *xgboostFiller) *uint { return &(r.NumParallelTree) }), "train.convergence_criteria": strPartial("train.convergence_criteria", func(r *xgboostFiller) *string { return &(r.ConvergenceCriteria) }), "train.verbosity": uIntPartial("train.verbosity", func(r *xgboostFiller) *uint { return &(r.Verbosity) }), // xgboost train controllers diff --git a/sql/codegen_xgboost_test.go b/sql/codegen_xgboost_test.go index bacec49ee9..40c25a6983 100644 --- a/sql/codegen_xgboost_test.go +++ b/sql/codegen_xgboost_test.go @@ -67,15 +67,15 @@ func TestPartials(t *testing.T) { e = part(&tmpMap, filler) a.Error(e) // Error: len(val) > 1 - tmpMap["obj"] = []string{"binary:logistic", "reg:linear"} + tmpMap["obj"] = []string{"binary:logistic", "reg:squarederror"} e = part(&tmpMap, filler) a.Error(e) - // change objective to "reg:linear" - tmpMap["obj"] = []string{"reg:linear"} + // change objective to "reg:squarederror" + tmpMap["obj"] = []string{"reg:squarederror"} filler.Objective = "" e = part(&tmpMap, filler) a.NoError(e) - a.Equal(filler.Objective, "reg:linear") + a.Equal(filler.Objective, "reg:squarederror") // test uIntPartial part = uIntPartial("num_class", func(r *xgboostFiller) *uint { return &(r.NumClass) }) @@ -140,17 +140,29 @@ SELECT a, b, c, d, e FROM table_xx TRAIN xgboost.Estimator WITH train.objective = "binary:logistic", - train.booster = gblinear, + train.eval_metric = auc, + train.booster = gbtree, + train.seed = 1000, train.num_class = 2, - train.max_depth = 5, train.eta = 0.03, - train.tree_method = hist, + train.gamma = 0.01, + train.max_depth = 5, + train.min_child_weight = 10, train.subsample = 0.8, train.colsample_bytree = 0.5, train.colsample_bylevel = 0.6, + train.colsample_bynode = 0.4, + train.lambda = 
0.001, + train.alpha = 0.01, + train.tree_method = hist, + train.sketch_eps = 0.03, + train.scale_pos_weight = 1, + train.grow_policy = lossguide, + train.max_leaves = 64, train.max_bin = 128, train.verbosity = 3, train.num_round = 30, + train.convergence_criteria = "10:200:0.8", train.auto_train = true COLUMN a, b, c, d LABEL e INTO table_123; @@ -164,16 +176,28 @@ LABEL e INTO table_123; params, _ := mapData["params"] paramMap, _ := params.(map[string]interface{}) assertEq(paramMap, "objective", "binary:logistic") - assertEq(paramMap, "booster", "gblinear") + assertEq(paramMap, "eval_metric", "auc") + assertEq(paramMap, "booster", "gbtree") + assertEq(paramMap, "seed", 1000) assertEq(paramMap, "num_class", 2) - assertEq(paramMap, "max_depth", 5) assertEq(paramMap, "eta", 0.03) - assertEq(paramMap, "tree_method", "hist") + assertEq(paramMap, "gamma", 0.01) + assertEq(paramMap, "max_depth", 5) + assertEq(paramMap, "min_child_weight", 10) assertEq(paramMap, "subsample", 0.8) assertEq(paramMap, "colsample_bytree", 0.5) assertEq(paramMap, "colsample_bylevel", 0.6) + assertEq(paramMap, "colsample_bynode", 0.4) + assertEq(paramMap, "reg_lambda", 0.001) + assertEq(paramMap, "reg_alpha", 0.01) + assertEq(paramMap, "tree_method", "hist") + assertEq(paramMap, "sketch_eps", 0.03) + assertEq(paramMap, "scale_pos_weight", 1) + assertEq(paramMap, "grow_policy", "lossguide") + assertEq(paramMap, "max_leaves", 64) assertEq(paramMap, "max_bin", 128) assertEq(paramMap, "verbosity", 3) + assertEq(paramMap, "convergence_criteria", "10:200:0.8") assertEq(mapData, "num_boost_round", 30) assertEq(mapData, "auto_train", true)