Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/image_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ echo 'get_ipython().magic(u"%autoreload 2")' >> $IPYTHON_STARTUP/00-first.py
curl https://raw.githubusercontent.com/sql-machine-learning/sqlflow/develop/example/jupyter/example.ipynb --output /workspace/example.ipynb

# 9. install xgboost-launcher
pip install xgboost-launcher==0.0.3
pip install xgboost-launcher==0.0.4

# 10. install Hadoop to use as the client when writing CSV to hive tables
HADOOP_URL=https://archive.apache.org/dist/hadoop/common/stable/hadoop-${HADOOP_VERSION}.tar.gz
Expand Down
52 changes: 38 additions & 14 deletions sql/codegen_xgboost.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,19 +47,32 @@ type xgLearningFields struct {
}

type xgBoosterFields struct {
Objective string `json:"objective,omitempty"`
Booster string `json:"booster,omitempty"`
NumClass uint `json:"num_class,omitempty"`
MaxDepth uint `json:"max_depth,omitempty"`
Eta float32 `json:"eta,omitempty"`
Objective string `json:"objective,omitempty"`
EvalMetric string `json:"eval_metric,omitempty"`
Booster string `json:"booster,omitempty"`
Seed uint `json:"seed,omitempty"`
NumClass uint `json:"num_class,omitempty"`
Eta float32 `json:"eta,omitempty"`
Gamma float32 `json:"gamma,omitempty"`
MaxDepth uint `json:"max_depth,omitempty"`
MinChildWeight uint `json:"min_child_weight,omitempty"`
Subsample float32 `json:"subsample,omitempty"`
ColSampleByTree float32 `json:"colsample_bytree,omitempty"`
ColSampleByLevel float32 `json:"colsample_bylevel,omitempty"`
ColSampleByNode float32 `json:"colsample_bynode,omitempty"`
// `Lambda` is reserved in Python, so we use the alias reg_lambda.
Lambda float32 `json:"reg_lambda,omitempty"`
// We use the alias `reg_alpha` to stay aligned with `reg_lambda`.
Alpha float32 `json:"reg_alpha,omitempty"`
TreeMethod string `json:"tree_method,omitempty"`
EvalMetric string `json:"eval_metric,omitempty"`
Subsample float32 `json:"subsample,omitempty"`
ColSampleByTree float32 `json:"colsample_bytree,omitempty"`
ColSampleByLevel float32 `json:"colsample_bylevel,omitempty"`
SketchEps float32 `json:"sketch_eps,omitempty"`
ScalePosWeight float32 `json:"scale_pos_weight,omitempty"`
GrowPolicy string `json:"grow_policy,omitempty"`
MaxLeaves uint `json:"max_leaves,omitempty"`
MaxBin uint `json:"max_bin,omitempty"`
ConvergenceCriteria string `json:"convergence_criteria,omitempty"`
Verbosity uint `json:"verbosity,omitempty"`
NumParallelTree uint `json:"num_parallel_tree,omitempty"`
ConvergenceCriteria string `json:"convergence_criteria,omitempty"` // auto_train config
Verbosity uint `json:"verbosity,omitempty"` // auto_train config
}

type xgColumnFields struct {
Expand Down Expand Up @@ -248,16 +261,27 @@ func sListPartial(key string, ptrFn func(*xgboostFiller) *[]string) func(*map[st
var xgbTrainAttrSetterMap = map[string]func(*map[string][]string, *xgboostFiller) error{
// booster params
"train.objective": strPartial("train.objective", func(r *xgboostFiller) *string { return &(r.Objective) }),
"train.eval_metric": strPartial("train.eval_metric", func(r *xgboostFiller) *string { return &(r.EvalMetric) }),
"train.booster": strPartial("train.booster", func(r *xgboostFiller) *string { return &(r.Booster) }),
"train.max_depth": uIntPartial("train.max_depth", func(r *xgboostFiller) *uint { return &(r.MaxDepth) }),
"train.seed": uIntPartial("train.seed", func(r *xgboostFiller) *uint { return &(r.Seed) }),
"train.num_class": uIntPartial("train.num_class", func(r *xgboostFiller) *uint { return &(r.NumClass) }),
"train.eta": fp32Partial("train.eta", func(r *xgboostFiller) *float32 { return &(r.Eta) }),
"train.tree_method": strPartial("train.tree_method", func(r *xgboostFiller) *string { return &(r.TreeMethod) }),
"train.eval_metric": strPartial("train.eval_metric", func(r *xgboostFiller) *string { return &(r.EvalMetric) }),
"train.gamma": fp32Partial("train.gamma", func(r *xgboostFiller) *float32 { return &(r.Gamma) }),
"train.max_depth": uIntPartial("train.max_depth", func(r *xgboostFiller) *uint { return &(r.MaxDepth) }),
"train.min_child_weight": uIntPartial("train.min_child_weight", func(r *xgboostFiller) *uint { return &(r.MinChildWeight) }),
"train.subsample": fp32Partial("train.subsample", func(r *xgboostFiller) *float32 { return &(r.Subsample) }),
"train.colsample_bytree": fp32Partial("train.colsample_bytree", func(r *xgboostFiller) *float32 { return &(r.ColSampleByTree) }),
"train.colsample_bylevel": fp32Partial("train.colsample_bylevel", func(r *xgboostFiller) *float32 { return &(r.ColSampleByLevel) }),
"train.colsample_bynode": fp32Partial("train.colsample_bynode", func(r *xgboostFiller) *float32 { return &(r.ColSampleByNode) }),
"train.lambda": fp32Partial("train.lambda", func(r *xgboostFiller) *float32 { return &(r.Lambda) }),
"train.alpha": fp32Partial("train.alpha", func(r *xgboostFiller) *float32 { return &(r.Alpha) }),
"train.tree_method": strPartial("train.tree_method", func(r *xgboostFiller) *string { return &(r.TreeMethod) }),
"train.sketch_eps": fp32Partial("train.sketch_eps", func(r *xgboostFiller) *float32 { return &(r.SketchEps) }),
"train.scale_pos_weight": fp32Partial("train.scale_pos_weight", func(r *xgboostFiller) *float32 { return &(r.ScalePosWeight) }),
"train.grow_policy": strPartial("train.grow_policy", func(r *xgboostFiller) *string { return &(r.GrowPolicy) }),
"train.max_leaves": uIntPartial("train.max_leaves", func(r *xgboostFiller) *uint { return &(r.MaxLeaves) }),
"train.max_bin": uIntPartial("train.max_bin", func(r *xgboostFiller) *uint { return &(r.MaxBin) }),
"train.num_parallel_tree": uIntPartial("train.num_parallel_tree", func(r *xgboostFiller) *uint { return &(r.NumParallelTree) }),
"train.convergence_criteria": strPartial("train.convergence_criteria", func(r *xgboostFiller) *string { return &(r.ConvergenceCriteria) }),
"train.verbosity": uIntPartial("train.verbosity", func(r *xgboostFiller) *uint { return &(r.Verbosity) }),
// xgboost train controllers
Expand Down
44 changes: 34 additions & 10 deletions sql/codegen_xgboost_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,15 +67,15 @@ func TestPartials(t *testing.T) {
e = part(&tmpMap, filler)
a.Error(e)
// Error: len(val) > 1
tmpMap["obj"] = []string{"binary:logistic", "reg:linear"}
tmpMap["obj"] = []string{"binary:logistic", "reg:squarederror"}
e = part(&tmpMap, filler)
a.Error(e)
// change objective to "reg:linear"
tmpMap["obj"] = []string{"reg:linear"}
// change objective to "reg:squarederror"
tmpMap["obj"] = []string{"reg:squarederror"}
filler.Objective = ""
e = part(&tmpMap, filler)
a.NoError(e)
a.Equal(filler.Objective, "reg:linear")
a.Equal(filler.Objective, "reg:squarederror")

// test uIntPartial
part = uIntPartial("num_class", func(r *xgboostFiller) *uint { return &(r.NumClass) })
Expand Down Expand Up @@ -140,17 +140,29 @@ SELECT a, b, c, d, e FROM table_xx
TRAIN xgboost.Estimator
WITH
train.objective = "binary:logistic",
train.booster = gblinear,
train.eval_metric = auc,
train.booster = gbtree,
train.seed = 1000,
train.num_class = 2,
train.max_depth = 5,
train.eta = 0.03,
train.tree_method = hist,
train.gamma = 0.01,
train.max_depth = 5,
train.min_child_weight = 10,
train.subsample = 0.8,
train.colsample_bytree = 0.5,
train.colsample_bylevel = 0.6,
train.colsample_bynode = 0.4,
train.lambda = 0.001,
train.alpha = 0.01,
train.tree_method = hist,
train.sketch_eps = 0.03,
train.scale_pos_weight = 1,
train.grow_policy = lossguide,
train.max_leaves = 64,
train.max_bin = 128,
train.verbosity = 3,
train.num_round = 30,
train.convergence_criteria = "10:200:0.8",
train.auto_train = true
COLUMN a, b, c, d
LABEL e INTO table_123;
Expand All @@ -164,16 +176,28 @@ LABEL e INTO table_123;
params, _ := mapData["params"]
paramMap, _ := params.(map[string]interface{})
assertEq(paramMap, "objective", "binary:logistic")
assertEq(paramMap, "booster", "gblinear")
assertEq(paramMap, "eval_metric", "auc")
assertEq(paramMap, "booster", "gbtree")
assertEq(paramMap, "seed", 1000)
assertEq(paramMap, "num_class", 2)
assertEq(paramMap, "max_depth", 5)
assertEq(paramMap, "eta", 0.03)
assertEq(paramMap, "tree_method", "hist")
assertEq(paramMap, "gamma", 0.01)
assertEq(paramMap, "max_depth", 5)
assertEq(paramMap, "min_child_weight", 10)
assertEq(paramMap, "subsample", 0.8)
assertEq(paramMap, "colsample_bytree", 0.5)
assertEq(paramMap, "colsample_bylevel", 0.6)
assertEq(paramMap, "colsample_bynode", 0.4)
assertEq(paramMap, "reg_lambda", 0.001)
assertEq(paramMap, "reg_alpha", 0.01)
assertEq(paramMap, "tree_method", "hist")
assertEq(paramMap, "sketch_eps", 0.03)
assertEq(paramMap, "scale_pos_weight", 1)
assertEq(paramMap, "grow_policy", "lossguide")
assertEq(paramMap, "max_leaves", 64)
assertEq(paramMap, "max_bin", 128)
assertEq(paramMap, "verbosity", 3)
assertEq(paramMap, "convergence_criteria", "10:200:0.8")
assertEq(mapData, "num_boost_round", 30)
assertEq(mapData, "auto_train", true)

Expand Down