diff --git a/doc/xgboost_on_sqlflow_design.md b/doc/ant-xgboost_on_sqlflow_design.md
similarity index 97%
rename from doc/xgboost_on_sqlflow_design.md
rename to doc/ant-xgboost_on_sqlflow_design.md
index b8bdd9d0b1..c7b8c6eb94 100644
--- a/doc/xgboost_on_sqlflow_design.md
+++ b/doc/ant-xgboost_on_sqlflow_design.md
@@ -1,8 +1,8 @@
-# _Design:_ xgboost on sqlflow
+# _Design:_ ant-xgboost on sqlflow
 
 ## Overview
 
-This is a design doc about why and how to support running xgboost via sqlflow as a machine learning estimator.
+This is a design doc about why and how to support running ant-xgboost via sqlflow as a machine learning estimator.
 
 We propose to build a lightweight python template for xgboost on basis of `xgblauncher`, an incubating xgboost wrapper in [ant-xgboost](https://github.com/alipay/ant-xgboost).
 
diff --git a/scripts/test_e2e.sh b/scripts/test_e2e.sh
index 194fa4a11a..47646d1202 100644
--- a/scripts/test_e2e.sh
+++ b/scripts/test_e2e.sh
@@ -48,8 +48,8 @@ export PYTHONPATH=$GOPATH/src/github.com/sql-machine-learning/sqlflow/sql/python
 sqlflowserver --datasource=${DATASOURCE} &
 # e2e test for standard SQL
 SQLFLOW_SERVER=localhost:50051 ipython sql/python/test_magic.py
-# e2e test for xgboost train and prediciton SQL.
-SQLFLOW_SERVER=localhost:50051 ipython sql/python/test_magic_xgboost.py
+# e2e test for ant-xgboost train and prediction SQL.
+SQLFLOW_SERVER=localhost:50051 ipython sql/python/test_magic_ant_xgboost.py
 # TODO(terrytangyuan): Enable this when ElasticDL is open sourced
 # e2e test for ElasticDL SQL
 # export SQLFLOW_submitter=elasticdl
diff --git a/sql/codegen_analyze.go b/sql/codegen_analyze.go
index a73c44015a..bd609c023a 100644
--- a/sql/codegen_analyze.go
+++ b/sql/codegen_analyze.go
@@ -41,7 +41,7 @@ func readFeatureNames(pr *extendedSelect, db *DB) ([]string, string, error) {
 	if strings.HasPrefix(strings.ToUpper(pr.estimator), `XGBOOST.`) {
 		// TODO(weiguo): It's a quick way to read column and label names from
 		// xgboost.*, but too heavy.
-		xgbFiller, err := newXGBoostFiller(pr, nil, db)
+		xgbFiller, err := newAntXGBoostFiller(pr, nil, db)
 		if err != nil {
 			return nil, "", err
 		}
diff --git a/sql/codegen_xgboost.go b/sql/codegen_ant_xgboost.go
similarity index 85%
rename from sql/codegen_xgboost.go
rename to sql/codegen_ant_xgboost.go
index 0dd54000d0..52c57a516e 100644
--- a/sql/codegen_xgboost.go
+++ b/sql/codegen_ant_xgboost.go
@@ -28,7 +28,7 @@ import (
 	"sqlflow.org/gomaxcompute"
 )
 
-type xgboostFiller struct {
+type antXGBoostFiller struct {
 	ModelPath string
 	xgLearningFields
 	xgColumnFields
@@ -164,15 +164,15 @@ func xgMultiSparseError(colNames []string) error {
 }
 
 func xgUnknownFCError(kw string) error {
-	return fmt.Errorf("xgUnknownFCError: feature column keyword(`%s`) is not supported by xgboost engine", kw)
+	return fmt.Errorf("xgUnknownFCError: feature column keyword(`%s`) is not supported by ant-xgboost engine", kw)
 }
 
 func xgUnsupportedColTagError() error {
-	return fmt.Errorf("xgUnsupportedColTagError: valid column tags of xgboost engine([feature_columns, group, weight])")
+	return fmt.Errorf("xgUnsupportedColTagError: valid column tags of ant-xgboost engine([feature_columns, group, weight])")
 }
 
-func uIntPartial(key string, ptrFn func(*xgboostFiller) *uint) func(*map[string][]string, *xgboostFiller) error {
-	return func(a *map[string][]string, r *xgboostFiller) error {
+func uIntPartial(key string, ptrFn func(*antXGBoostFiller) *uint) func(*map[string][]string, *antXGBoostFiller) error {
+	return func(a *map[string][]string, r *antXGBoostFiller) error {
 		// xgParseAttr will ensure the key is existing in map
 		val, _ := (*a)[key]
 		if len(val) != 1 {
@@ -190,8 +190,8 @@ func uIntPartial(key string, ptrFn func(*xgboostFiller) *uint) func(*map[string]
 	}
 }
 
-func fp32Partial(key string, ptrFn func(*xgboostFiller) *float32) func(*map[string][]string, *xgboostFiller) error {
-	return func(a *map[string][]string, r *xgboostFiller) error {
+func fp32Partial(key string, ptrFn func(*antXGBoostFiller) *float32) func(*map[string][]string, *antXGBoostFiller) error {
+	return func(a *map[string][]string, r *antXGBoostFiller) error {
 		// xgParseAttr will ensure the key is existing in map
 		val, _ := (*a)[key]
 		if len(val) != 1 {
@@ -209,8 +209,8 @@ func fp32Partial(key string, ptrFn func(*xgboostFiller) *float32) func(*map[stri
 	}
 }
 
-func boolPartial(key string, ptrFn func(*xgboostFiller) *bool) func(*map[string][]string, *xgboostFiller) error {
-	return func(a *map[string][]string, r *xgboostFiller) error {
+func boolPartial(key string, ptrFn func(*antXGBoostFiller) *bool) func(*map[string][]string, *antXGBoostFiller) error {
+	return func(a *map[string][]string, r *antXGBoostFiller) error {
 		// xgParseAttr will ensure the key is existing in map
 		val, _ := (*a)[key]
 		if len(val) != 1 {
@@ -227,8 +227,8 @@ func boolPartial(key string, ptrFn func(*xgboostFiller) *bool) func(*map[string]
 	}
 }
 
-func strPartial(key string, ptrFn func(*xgboostFiller) *string) func(*map[string][]string, *xgboostFiller) error {
-	return func(a *map[string][]string, r *xgboostFiller) error {
+func strPartial(key string, ptrFn func(*antXGBoostFiller) *string) func(*map[string][]string, *antXGBoostFiller) error {
+	return func(a *map[string][]string, r *antXGBoostFiller) error {
 		// xgParseAttr will ensure the key is existing in map
 		val, _ := (*a)[key]
 		if len(val) != 1 {
@@ -244,8 +244,8 @@ func strPartial(key string, ptrFn func(*xgboostFiller) *string) func(*map[string]
 	}
 }
 
-func sListPartial(key string, ptrFn func(*xgboostFiller) *[]string) func(*map[string][]string, *xgboostFiller) error {
-	return func(a *map[string][]string, r *xgboostFiller) error {
+func sListPartial(key string, ptrFn func(*antXGBoostFiller) *[]string) func(*map[string][]string, *antXGBoostFiller) error {
+	return func(a *map[string][]string, r *antXGBoostFiller) error {
 		// xgParseAttr will ensure the key is existing in map
 		val, _ := (*a)[key]
 		strListPtr := ptrFn(r)
@@ -258,48 +258,48 @@ func sListPartial(key string, ptrFn func(*xgboostFiller) *[]string) func(*map[st
 	}
 }
 
-var xgbTrainAttrSetterMap = map[string]func(*map[string][]string, *xgboostFiller) error{
+var xgbTrainAttrSetterMap = map[string]func(*map[string][]string, *antXGBoostFiller) error{
 	// booster params
-	"train.objective": strPartial("train.objective", func(r *xgboostFiller) *string { return &(r.Objective) }),
-	"train.eval_metric": strPartial("train.eval_metric", func(r *xgboostFiller) *string { return &(r.EvalMetric) }),
-	"train.booster": strPartial("train.booster", func(r *xgboostFiller) *string { return &(r.Booster) }),
-	"train.seed": uIntPartial("train.seed", func(r *xgboostFiller) *uint { return &(r.Seed) }),
-	"train.num_class": uIntPartial("train.num_class", func(r *xgboostFiller) *uint { return &(r.NumClass) }),
-	"train.eta": fp32Partial("train.eta", func(r *xgboostFiller) *float32 { return &(r.Eta) }),
-	"train.gamma": fp32Partial("train.gamma", func(r *xgboostFiller) *float32 { return &(r.Gamma) }),
-	"train.max_depth": uIntPartial("train.max_depth", func(r *xgboostFiller) *uint { return &(r.MaxDepth) }),
-	"train.min_child_weight": uIntPartial("train.min_child_weight", func(r *xgboostFiller) *uint { return &(r.MinChildWeight) }),
-	"train.subsample": fp32Partial("train.subsample", func(r *xgboostFiller) *float32 { return &(r.Subsample) }),
-	"train.colsample_bytree": fp32Partial("train.colsample_bytree", func(r *xgboostFiller) *float32 { return &(r.ColSampleByTree) }),
-	"train.colsample_bylevel": fp32Partial("train.colsample_bylevel", func(r *xgboostFiller) *float32 { return &(r.ColSampleByLevel) }),
-	"train.colsample_bynode": fp32Partial("train.colsample_bynode", func(r *xgboostFiller) *float32 { return &(r.ColSampleByNode) }),
-	"train.lambda": fp32Partial("train.lambda", func(r *xgboostFiller) *float32 { return &(r.Lambda) }),
-	"train.alpha": fp32Partial("train.alpha", func(r *xgboostFiller) *float32 { return &(r.Alpha) }),
-	"train.tree_method": strPartial("train.tree_method", func(r *xgboostFiller) *string { return &(r.TreeMethod) }),
-	"train.sketch_eps": fp32Partial("train.sketch_eps", func(r *xgboostFiller) *float32 { return &(r.SketchEps) }),
-	"train.scale_pos_weight": fp32Partial("train.scale_pos_weight", func(r *xgboostFiller) *float32 { return &(r.ScalePosWeight) }),
-	"train.grow_policy": strPartial("train.grow_policy", func(r *xgboostFiller) *string { return &(r.GrowPolicy) }),
-	"train.max_leaves": uIntPartial("train.max_leaves", func(r *xgboostFiller) *uint { return &(r.MaxLeaves) }),
-	"train.max_bin": uIntPartial("train.max_bin", func(r *xgboostFiller) *uint { return &(r.MaxBin) }),
-	"train.num_parallel_tree": uIntPartial("train.num_parallel_tree", func(r *xgboostFiller) *uint { return &(r.NumParallelTree) }),
-	"train.convergence_criteria": strPartial("train.convergence_criteria", func(r *xgboostFiller) *string { return &(r.ConvergenceCriteria) }),
-	"train.verbosity": uIntPartial("train.verbosity", func(r *xgboostFiller) *uint { return &(r.Verbosity) }),
+	"train.objective": strPartial("train.objective", func(r *antXGBoostFiller) *string { return &(r.Objective) }),
"train.eval_metric": strPartial("train.eval_metric", func(r *antXGBoostFiller) *string { return &(r.EvalMetric) }), + "train.booster": strPartial("train.booster", func(r *antXGBoostFiller) *string { return &(r.Booster) }), + "train.seed": uIntPartial("train.seed", func(r *antXGBoostFiller) *uint { return &(r.Seed) }), + "train.num_class": uIntPartial("train.num_class", func(r *antXGBoostFiller) *uint { return &(r.NumClass) }), + "train.eta": fp32Partial("train.eta", func(r *antXGBoostFiller) *float32 { return &(r.Eta) }), + "train.gamma": fp32Partial("train.gamma", func(r *antXGBoostFiller) *float32 { return &(r.Gamma) }), + "train.max_depth": uIntPartial("train.max_depth", func(r *antXGBoostFiller) *uint { return &(r.MaxDepth) }), + "train.min_child_weight": uIntPartial("train.min_child_weight", func(r *antXGBoostFiller) *uint { return &(r.MinChildWeight) }), + "train.subsample": fp32Partial("train.subsample", func(r *antXGBoostFiller) *float32 { return &(r.Subsample) }), + "train.colsample_bytree": fp32Partial("train.colsample_bytree", func(r *antXGBoostFiller) *float32 { return &(r.ColSampleByTree) }), + "train.colsample_bylevel": fp32Partial("train.colsample_bylevel", func(r *antXGBoostFiller) *float32 { return &(r.ColSampleByLevel) }), + "train.colsample_bynode": fp32Partial("train.colsample_bynode", func(r *antXGBoostFiller) *float32 { return &(r.ColSampleByNode) }), + "train.lambda": fp32Partial("train.lambda", func(r *antXGBoostFiller) *float32 { return &(r.Lambda) }), + "train.alpha": fp32Partial("train.alpha", func(r *antXGBoostFiller) *float32 { return &(r.Alpha) }), + "train.tree_method": strPartial("train.tree_method", func(r *antXGBoostFiller) *string { return &(r.TreeMethod) }), + "train.sketch_eps": fp32Partial("train.sketch_eps", func(r *antXGBoostFiller) *float32 { return &(r.SketchEps) }), + "train.scale_pos_weight": fp32Partial("train.scale_pos_weight", func(r *antXGBoostFiller) *float32 { return &(r.ScalePosWeight) }), + "train.grow_policy": strPartial("train.grow_policy", func(r *antXGBoostFiller) *string { return &(r.GrowPolicy) }), + "train.max_leaves": uIntPartial("train.max_leaves", func(r *antXGBoostFiller) *uint { return &(r.MaxLeaves) }), + "train.max_bin": uIntPartial("train.max_bin", func(r *antXGBoostFiller) *uint { return &(r.MaxBin) }), + "train.num_parallel_tree": uIntPartial("train.num_parallel_tree", func(r *antXGBoostFiller) *uint { return &(r.NumParallelTree) }), + "train.convergence_criteria": strPartial("train.convergence_criteria", func(r *antXGBoostFiller) *string { return &(r.ConvergenceCriteria) }), + "train.verbosity": uIntPartial("train.verbosity", func(r *antXGBoostFiller) *uint { return &(r.Verbosity) }), // xgboost train controllers - "train.num_round": uIntPartial("train.num_round", func(r *xgboostFiller) *uint { return &(r.NumRound) }), - "train.auto_train": boolPartial("train.auto_train", func(r *xgboostFiller) *bool { return &(r.AutoTrain) }), + "train.num_round": uIntPartial("train.num_round", func(r *antXGBoostFiller) *uint { return &(r.NumRound) }), + "train.auto_train": boolPartial("train.auto_train", func(r *antXGBoostFiller) *bool { return &(r.AutoTrain) }), // Label, Group, Weight and xgFeatureFields are parsed from columnClause } -var xgbPredAttrSetterMap = map[string]func(*map[string][]string, *xgboostFiller) error{ +var xgbPredAttrSetterMap = map[string]func(*map[string][]string, *antXGBoostFiller) error{ // xgboost output columns (for prediction) - "pred.append_columns": sListPartial("pred.append_columns", func(r 
-	"pred.prob_column": strPartial("pred.prob_column", func(r *xgboostFiller) *string { return &(r.ProbColumn) }),
-	"pred.detail_column": strPartial("pred.detail_column", func(r *xgboostFiller) *string { return &(r.DetailColumn) }),
-	"pred.encoding_column": strPartial("pred.encoding_column", func(r *xgboostFiller) *string { return &(r.EncodingColumn) }),
+	"pred.append_columns": sListPartial("pred.append_columns", func(r *antXGBoostFiller) *[]string { return &(r.AppendColumns) }),
+	"pred.prob_column": strPartial("pred.prob_column", func(r *antXGBoostFiller) *string { return &(r.ProbColumn) }),
+	"pred.detail_column": strPartial("pred.detail_column", func(r *antXGBoostFiller) *string { return &(r.DetailColumn) }),
+	"pred.encoding_column": strPartial("pred.encoding_column", func(r *antXGBoostFiller) *string { return &(r.EncodingColumn) }),
 	// Label, Group, Weight and xgFeatureFields are parsed from columnClause
 }
 
-func xgParseAttr(pr *extendedSelect, r *xgboostFiller) error {
+func xgParseAttr(pr *extendedSelect, r *antXGBoostFiller) error {
 	var rawAttrs map[string]*expr
 	if pr.train {
 		rawAttrs = pr.trainAttrs
@@ -324,8 +324,8 @@ func xgParseAttr(pr *extendedSelect, r *xgboostFiller) error {
 		}
 	}
 
-	// fill xgboostFiller with attrs
-	var setterMap map[string]func(*map[string][]string, *xgboostFiller) error
+	// fill antXGBoostFiller with attrs
+	var setterMap map[string]func(*map[string][]string, *antXGBoostFiller) error
 	if pr.train {
 		setterMap = xgbTrainAttrSetterMap
 	} else {
@@ -358,7 +358,7 @@ func xgParseAttr(pr *extendedSelect, r *xgboostFiller) error {
 // data example: COLUMN SPARSE("0:1.5 1:100.1f 11:-1.2", [20], " ")
 // 2. tf feature columns
 // Roughly same as TFEstimator, except output shape of feaColumns are required to be 1-dim.
-func parseFeatureColumns(columns *exprlist, r *xgboostFiller) error {
+func parseFeatureColumns(columns *exprlist, r *antXGBoostFiller) error {
 	feaCols, colSpecs, err := resolveTrainColumns(columns)
 	if err != nil {
 		return err
@@ -379,7 +379,7 @@ func parseFeatureColumns(columns *exprlist, r *xgboostFiller) error {
 // parseSparseKeyValueFeatures, parse features which is identified by `SPARSE`.
 // ex: SPARSE(col1, [100], comma)
-func parseSparseKeyValueFeatures(colSpecs []*columnSpec, r *xgboostFiller) error {
+func parseSparseKeyValueFeatures(colSpecs []*columnSpec, r *antXGBoostFiller) error {
 	var colNames []string
 	for _, spec := range colSpecs {
 		colNames = append(colNames, spec.ColumnName)
@@ -425,7 +425,7 @@ func isSimpleColumn(col featureColumn) bool {
 	return false
 }
 
-func parseDenseFeatures(feaCols []featureColumn, r *xgboostFiller) error {
+func parseDenseFeatures(feaCols []featureColumn, r *antXGBoostFiller) error {
 	allSimpleCol := true
 	for _, col := range feaCols {
 		if allSimpleCol && !isSimpleColumn(col) {
@@ -511,7 +511,7 @@ func parseSimpleColumn(field string, columns *exprlist) (*xgFeatureMeta, error)
 	return fm, nil
 }
 
-func xgParseColumns(pr *extendedSelect, filler *xgboostFiller) error {
+func xgParseColumns(pr *extendedSelect, filler *antXGBoostFiller) error {
 	for target, columns := range pr.columns {
 		switch target {
 		case "feature_columns":
@@ -553,7 +553,7 @@ func xgParseColumns(pr *extendedSelect, filler *xgboostFiller) error {
 	return nil
 }
 
-func xgParseEstimator(pr *extendedSelect, filler *xgboostFiller) error {
+func xgParseEstimator(pr *extendedSelect, filler *antXGBoostFiller) error {
 	switch strings.ToUpper(pr.estimator) {
 	case "XGBOOST.ESTIMATOR":
 		if len(filler.Objective) == 0 {
@@ -590,8 +590,8 @@ func xgParseEstimator(pr *extendedSelect, filler *xgboostFiller) error {
 	return nil
 }
 
-func newXGBoostFiller(pr *extendedSelect, ds *trainAndValDataset, db *DB) (*xgboostFiller, error) {
-	filler := &xgboostFiller{
+func newAntXGBoostFiller(pr *extendedSelect, ds *trainAndValDataset, db *DB) (*antXGBoostFiller, error) {
+	filler := &antXGBoostFiller{
 		ModelPath: pr.save,
 	}
 	filler.IsTrain = pr.train
@@ -720,7 +720,7 @@ func xgFillDatabaseInfo(r *xgDataSourceFields, db *DB) error {
 	return nil
 }
 
-func xgCreatePredictionTable(pr *extendedSelect, r *xgboostFiller, db *DB) error {
+func xgCreatePredictionTable(pr *extendedSelect, r *antXGBoostFiller, db *DB) error {
 	dropStmt := fmt.Sprintf("drop table if exists %s;", r.OutputTable)
 	if _, e := db.Exec(dropStmt); e != nil {
 		return fmt.Errorf("failed executing %s: %q", dropStmt, e)
@@ -791,7 +791,7 @@ func xgCreatePredictionTable(pr *extendedSelect, r *xgboostFiller, db *DB) error
 }
 
 func genXG(w io.Writer, pr *extendedSelect, ds *trainAndValDataset, fts fieldTypes, db *DB) error {
-	r, e := newXGBoostFiller(pr, ds, db)
+	r, e := newAntXGBoostFiller(pr, ds, db)
 	if e != nil {
 		return e
 	}
@@ -808,7 +808,7 @@ var xgTemplate = template.Must(template.New("codegenXG").Parse(xgTemplateText))
 
 const xgTemplateText = `
 from launcher.config_fields import JobType
-from sqlflow_submitter.xgboost import run_with_sqlflow
+from sqlflow_submitter.ant_xgboost import run_with_sqlflow
 
 {{if .IsTrain}}
 mode = JobType.TRAIN
diff --git a/sql/codegen_xgboost_test.go b/sql/codegen_ant_xgboost_test.go
similarity index 92%
rename from sql/codegen_xgboost_test.go
rename to sql/codegen_ant_xgboost_test.go
index 34e8bc4033..665de64e55 100644
--- a/sql/codegen_xgboost_test.go
+++ b/sql/codegen_ant_xgboost_test.go
@@ -22,7 +22,7 @@ import (
 )
 const (
-	testXGTrainSelectIris = `
+	testAntXGTrainSelectIris = `
 SELECT *
 FROM iris.train
 TRAIN xgboost.Estimator
 WITH
@@ -36,13 +36,13 @@ WITH
 COLUMN sepal_length, sepal_width, petal_length, petal_width
 LABEL class
 INTO sqlflow_models.my_xgboost_model;
 `
-	testXGAnalyzeSelectIris = `
+	testAntXGAnalyzeSelectIris = `
 SELECT * FROM iris.train
 ANALYZE sqlflow_models.my_xgboost_model
 USING TreeExplainer;
 `
-	testXGPredSelectIris = `
+	testAntXGPredSelectIris = `
 SELECT *
 FROM iris.test
 PREDICT iris.predict.result
@@ -57,10 +57,10 @@ USING sqlflow_models.my_xgboost_model;
 func TestPartials(t *testing.T) {
 	a := assert.New(t)
 	tmpMap := make(map[string][]string)
-	filler := &xgboostFiller{}
+	filler := &antXGBoostFiller{}
 
 	// test strPartial
-	part := strPartial("obj", func(r *xgboostFiller) *string { return &(r.Objective) })
+	part := strPartial("obj", func(r *antXGBoostFiller) *string { return &(r.Objective) })
 	tmpMap["obj"] = []string{"binary:logistic"}
 	e := part(&tmpMap, filler)
 	a.NoError(e)
@@ -83,7 +83,7 @@ func TestPartials(t *testing.T) {
 	a.Equal(filler.Objective, "reg:squarederror")
 
 	// test uIntPartial
-	part = uIntPartial("num_class", func(r *xgboostFiller) *uint { return &(r.NumClass) })
+	part = uIntPartial("num_class", func(r *antXGBoostFiller) *uint { return &(r.NumClass) })
 	tmpMap["num_class"] = []string{"3"}
 	e = part(&tmpMap, filler)
 	a.NoError(e)
@@ -92,7 +92,7 @@ func TestPartials(t *testing.T) {
 	a.Equal(ok, false)
 
 	// test fp32Partial
-	part = fp32Partial("eta", func(r *xgboostFiller) *float32 { return &(r.Eta) })
+	part = fp32Partial("eta", func(r *antXGBoostFiller) *float32 { return &(r.Eta) })
 	tmpMap["eta"] = []string{"-0.33"}
 	e = part(&tmpMap, filler)
 	a.NoError(e)
@@ -101,7 +101,7 @@ func TestPartials(t *testing.T) {
 	a.Equal(ok, false)
 
 	// test boolPartial
-	part = boolPartial("auto_train", func(r *xgboostFiller) *bool { return &(r.AutoTrain) })
+	part = boolPartial("auto_train", func(r *antXGBoostFiller) *bool { return &(r.AutoTrain) })
 	tmpMap["auto_train"] = []string{"false"}
 	e = part(&tmpMap, filler)
 	a.NoError(e)
@@ -114,7 +114,7 @@ func TestPartials(t *testing.T) {
 	a.Equal(filler.AutoTrain, true)
 
 	// test sListPartial
-	part = sListPartial("append_columns", func(r *xgboostFiller) *[]string { return &(r.AppendColumns) })
+	part = sListPartial("append_columns", func(r *antXGBoostFiller) *[]string { return &(r.AppendColumns) })
 	tmpMap["append_columns"] = []string{"AA", "BB", "CC"}
 	e = part(&tmpMap, filler)
 	a.NoError(e)
@@ -131,8 +131,8 @@ func TestXGBoostAttr(t *testing.T) {
 	}
 
 	parser := newParser()
-	parseAndFill := func(clause string) *xgboostFiller {
-		filler := &xgboostFiller{}
+	parseAndFill := func(clause string) *antXGBoostFiller {
+		filler := &antXGBoostFiller{}
 		r, e := parser.Parse(clause)
 		a.NoError(e)
 		e = xgParseAttr(r, filler)
@@ -235,7 +235,7 @@ WITH attr_x = XXX
 LABEL e INTO model_table;
 `
 	// test sparseKV schema
-	filler := &xgboostFiller{}
+	filler := &antXGBoostFiller{}
 	sparseKVSpec := ` COLUMN SPARSE(a, 100, comma) `
 	r, e := parser.Parse(sqlHead + sparseKVSpec + sqlTail)
 	a.NoError(e)
@@ -256,7 +256,7 @@ LABEL e INTO model_table;
 	a.EqualValues("e", filler.Label)
 
 	// test raw columns
-	filler = &xgboostFiller{}
+	filler = &antXGBoostFiller{}
 	rawColumnsSpec := " COLUMN a, b, b, c, d, c "
 	r, _ = parser.Parse(sqlHead + rawColumnsSpec + sqlTail)
 	e = xgParseColumns(r, filler)
@@ -277,7 +277,7 @@ LABEL e INTO model_table;
 	}
 
 	// test tf.feature_columns
-	filler = &xgboostFiller{}
+	filler = &antXGBoostFiller{}
 	fcSpec := " COLUMN a, b, c, EMBEDDING(CATEGORY_ID(d, 2000), 8, mean) FOR feature_columns "
 	r, _ = parser.Parse(sqlHead + fcSpec + sqlTail)
 	e = xgParseColumns(r, filler)
@@ -288,7 +288,7 @@ LABEL e INTO model_table;
 	a.True(filler.IsTensorFlowIntegrated)
 
 	// test group & weight
-	filler = &xgboostFiller{}
+	filler = &antXGBoostFiller{}
 	groupWeightSpec := " COLUMN gg FOR group COLUMN ww FOR weight "
 	r, _ = parser.Parse(sqlHead + fcSpec + groupWeightSpec + sqlTail)
 	e = xgParseColumns(r, filler)
@@ -299,7 +299,7 @@ LABEL e INTO model_table;
 	a.EqualValues("gg", filler.Group)
 	a.EqualValues("ww", filler.Weight)
 	// test xgMixSchemaError
-	filler = &xgboostFiller{}
+	filler = &antXGBoostFiller{}
 	wrongColSpec := " COLUMN SPARSE(a, 2000, comma), b, c, d "
 	r, _ = parser.Parse(sqlHead + wrongColSpec + sqlTail)
 	e = xgParseColumns(r, filler)
@@ -307,7 +307,7 @@ LABEL e INTO model_table;
 	a.EqualValues(e, xgParseColumnError("feature_columns", xgMixSchemaError()))
 
 	// test `DENSE` keyword
-	filler = &xgboostFiller{}
+	filler = &antXGBoostFiller{}
 	wrongColSpec = " COLUMN DENSE(b, 5, comma) "
 	r, _ = parser.Parse(sqlHead + wrongColSpec + sqlTail)
 	e = xgParseColumns(r, filler)
@@ -315,7 +315,7 @@ LABEL e INTO model_table;
 	a.EqualValues(e, xgParseColumnError("feature_columns", xgUnknownFCError("DENSE")))
 
 	// test xgMultiSparseError
-	filler = &xgboostFiller{}
+	filler = &antXGBoostFiller{}
 	wrongColSpec = " COLUMN SPARSE(a, 2000, comma), SPARSE(b, 100, comma) "
 	r, _ = parser.Parse(sqlHead + wrongColSpec + sqlTail)
 	e = xgParseColumns(r, filler)
@@ -323,7 +323,7 @@ LABEL e INTO model_table;
 	a.EqualValues(e, xgParseColumnError("feature_columns", xgMultiSparseError([]string{"a", "b"})))
 
 	// test xgUnsupportedColTagError
-	filler = &xgboostFiller{}
+	filler = &antXGBoostFiller{}
 	unsupportedSpec := " COLUMN gg FOR group COLUMN ww FOR xxxxx "
 	r, _ = parser.Parse(sqlHead + fcSpec + unsupportedSpec + sqlTail)
 	e = xgParseColumns(r, filler)
@@ -350,7 +350,7 @@ LABEL e INTO model_table;
 `
 	pr, e := parser.Parse(trainClause)
 	a.NoError(e)
-	filler, e := newXGBoostFiller(pr, nil, testDB)
+	filler, e := newAntXGBoostFiller(pr, nil, testDB)
 	a.NoError(e)
 
 	a.True(filler.IsTrain)
@@ -396,7 +396,7 @@ LABEL e INTO model_table;
 
 	// test with trainAndValDataset
 	ds := &trainAndValDataset{training: "TrainTable", validation: "EvalTable"}
-	filler, e = newXGBoostFiller(pr, ds, testDB)
+	filler, e = newAntXGBoostFiller(pr, ds, testDB)
 	a.NoError(e)
 	trainSlct := strings.TrimSuffix(strings.Replace(filler.StandardSelect, "\n", " ", -1), ";")
 	a.EqualValues("SELECT * FROM TrainTable", trainSlct)
@@ -413,7 +413,7 @@ LABEL e INTO model_table;
 
 	pr, e = parser.Parse(testPredictSelectIris)
 	a.NoError(e)
-	filler, e = newXGBoostFiller(pr, nil, testDB)
+	filler, e = newAntXGBoostFiller(pr, nil, testDB)
 	a.NoError(e)
 	a.Equal("class", filler.ResultColumn)
 	a.Equal("iris.predict", filler.OutputTable)
diff --git a/sql/executor.go b/sql/executor.go
index 3a098aa2f3..17187c56dd 100644
--- a/sql/executor.go
+++ b/sql/executor.go
@@ -373,7 +373,7 @@ func buildFiller(es *extendedSelect, ds *trainAndValDataset, fts fieldTypes, db
 		dataset = ds
 	}
 	if strings.HasPrefix(strings.ToUpper(es.estimator), `XGBOOST.`) {
-		return newXGBoostFiller(es, dataset, db)
+		return newAntXGBoostFiller(es, dataset, db)
 	}
 	return newFiller(es, dataset, fts, db)
 }
@@ -386,7 +386,7 @@ func train(wr *PipeWriter, tr *extendedSelect, db *DB, cwd string, modelDir stri
 	var program bytes.Buffer
 
 	if strings.HasPrefix(strings.ToUpper(tr.estimator), `XGBOOST.`) {
-		// TODO(sperlingxx): write a separate train pipeline for xgboost to support remote mode
+		// TODO(sperlingxx): write a separate train pipeline for ant-xgboost to support remote mode
 		if e := genXG(&program, tr, ds, fts, db); e != nil {
 			return fmt.Errorf("genXG %v", e)
 		}
@@ -452,7 +452,7 @@ func pred(wr *PipeWriter, pr *extendedSelect, db *DB, cwd string, modelDir strin
 	var buf bytes.Buffer
 
 	if strings.HasPrefix(strings.ToUpper(pr.estimator), `XGBOOST.`) {
-		// TODO(sperlingxx): write a separate pred pipeline for xgboost to support remote mode
+		// TODO(sperlingxx): write a separate pred pipeline for ant-xgboost to support remote mode
 		if e := genXG(&buf, pr, nil, fts, db); e != nil {
 			return fmt.Errorf("genXG %v", e)
 		}
diff --git a/sql/executor_test.go b/sql/executor_test.go
index 82f633ec4a..bf65b9d548 100644
--- a/sql/executor_test.go
+++ b/sql/executor_test.go
@@ -71,19 +71,19 @@ func TestSplitExtendedSQL(t *testing.T) {
 	a.Equal(`train a with b;`, s[0])
 }
 
-func TestExecutorTrainAnalyzePredictXGBoost(t *testing.T) {
+func TestExecutorTrainAnalyzePredictAntXGBoost(t *testing.T) {
 	a := assert.New(t)
 	modelDir, e := ioutil.TempDir("/tmp", "sqlflow_models")
 	a.Nil(e)
 	defer os.RemoveAll(modelDir)
 
 	a.NotPanics(func() {
-		stream := runExtendedSQL(testXGTrainSelectIris, testDB, modelDir, nil)
+		stream := runExtendedSQL(testAntXGTrainSelectIris, testDB, modelDir, nil)
 		a.True(goodStream(stream.ReadAll()))
-		stream = runExtendedSQL(testXGAnalyzeSelectIris, testDB, modelDir, nil)
+		stream = runExtendedSQL(testAntXGAnalyzeSelectIris, testDB, modelDir, nil)
 		a.True(goodStream(stream.ReadAll()))
-		stream = runExtendedSQL(testXGPredSelectIris, testDB, modelDir, nil)
+		stream = runExtendedSQL(testAntXGPredSelectIris, testDB, modelDir, nil)
 		a.True(goodStream(stream.ReadAll()))
 	})
 }
diff --git a/sql/python/sqlflow_submitter/xgboost/__init__.py b/sql/python/sqlflow_submitter/ant_xgboost/__init__.py
similarity index 86%
rename from sql/python/sqlflow_submitter/xgboost/__init__.py
rename to sql/python/sqlflow_submitter/ant_xgboost/__init__.py
index 47dd54a5e7..0123cb5d67 100644
--- a/sql/python/sqlflow_submitter/xgboost/__init__.py
+++ b/sql/python/sqlflow_submitter/ant_xgboost/__init__.py
@@ -12,7 +12,7 @@
 # limitations under the License.
 
 from .sqlflow_data_source import SQLFlowDataSource, SQLFlowDSConfig
-from .sqlflow_xgboost_main import run_with_sqlflow
-from .common import XGBoostError
+from .sqlflow_ant_xgboost_main import run_with_sqlflow
+from .common import AntXGBoostError
 
-__all__ = ['run_with_sqlflow', 'SQLFlowDataSource', 'SQLFlowDSConfig', 'XGBoostError']
+__all__ = ['run_with_sqlflow', 'SQLFlowDataSource', 'SQLFlowDSConfig', 'AntXGBoostError']
diff --git a/sql/python/sqlflow_submitter/xgboost/common.py b/sql/python/sqlflow_submitter/ant_xgboost/common.py
similarity index 94%
rename from sql/python/sqlflow_submitter/xgboost/common.py
rename to sql/python/sqlflow_submitter/ant_xgboost/common.py
index 098f183836..1d189a4520 100644
--- a/sql/python/sqlflow_submitter/xgboost/common.py
+++ b/sql/python/sqlflow_submitter/ant_xgboost/common.py
@@ -12,5 +12,5 @@
 # limitations under the License.
-class XGBoostError(Exception):
+class AntXGBoostError(Exception):
     pass
diff --git a/sql/python/sqlflow_submitter/xgboost/sqlflow_xgboost_main.py b/sql/python/sqlflow_submitter/ant_xgboost/sqlflow_ant_xgboost_main.py
similarity index 88%
rename from sql/python/sqlflow_submitter/xgboost/sqlflow_xgboost_main.py
rename to sql/python/sqlflow_submitter/ant_xgboost/sqlflow_ant_xgboost_main.py
index e6a0c8bf38..e50f9f8f15 100644
--- a/sql/python/sqlflow_submitter/xgboost/sqlflow_xgboost_main.py
+++ b/sql/python/sqlflow_submitter/ant_xgboost/sqlflow_ant_xgboost_main.py
@@ -16,8 +16,8 @@
 from launcher import register_data_source, config_helper, config_fields as cf, train, predict
 
-from sqlflow_submitter.xgboost.common import XGBoostError
-from sqlflow_submitter.xgboost.sqlflow_data_source import SQLFlowDSConfig, SQLFlowDataSource
+from sqlflow_submitter.ant_xgboost.common import AntXGBoostError
+from sqlflow_submitter.ant_xgboost.sqlflow_data_source import SQLFlowDSConfig, SQLFlowDataSource
 
 register_data_source('sqlflow', SQLFlowDSConfig, SQLFlowDataSource)
@@ -29,7 +29,7 @@ def run_with_sqlflow(mode: str,
                      column_config: str,
                      valid_data_source_config: str = None):
     if mode not in (cf.JobType.TRAIN, cf.JobType.PREDICT):
-        raise XGBoostError('Unknown run mode(%s) of xgboost launcher.' % mode)
+        raise AntXGBoostError('Unknown run mode(%s) of ant-xgboost launcher.' % mode)
     is_train = mode == cf.JobType.TRAIN
 
     def parse_json_str(string: str):
@@ -71,10 +71,10 @@ def parse_json_str(string: str):
             train_fields = cf.TrainFields(learning_fields, data_fields, model_fields)
             train(train_fields)
         except Exception as e:
-            raise XGBoostError('XGBoost training task failed: %s' % e)
+            raise AntXGBoostError('XGBoost training task failed: %s' % e)
     else:
         try:
             pred_fields = cf.PredictFields(data_fields, model_fields)
             predict(pred_fields)
         except Exception as e:
-            raise XGBoostError('XGBoost prediction task failed: %s' % e)
+            raise AntXGBoostError('XGBoost prediction task failed: %s' % e)
diff --git a/sql/python/sqlflow_submitter/xgboost/sqlflow_data_source.py b/sql/python/sqlflow_submitter/ant_xgboost/sqlflow_data_source.py
similarity index 95%
rename from sql/python/sqlflow_submitter/xgboost/sqlflow_data_source.py
rename to sql/python/sqlflow_submitter/ant_xgboost/sqlflow_data_source.py
index 145681caaf..c997572704 100644
--- a/sql/python/sqlflow_submitter/xgboost/sqlflow_data_source.py
+++ b/sql/python/sqlflow_submitter/ant_xgboost/sqlflow_data_source.py
@@ -19,7 +19,7 @@
 from launcher import DataSource, config_fields, XGBoostResult, XGBoostRecord
 from launcher.data_units import RecordBuilder
 
-from .common import XGBoostError
+from .common import AntXGBoostError
 from ..db import connect, db_generator, buffered_db_writer
@@ -40,7 +40,7 @@ def convert_shape(cls, value) -> typing.List:
         elif isinstance(value, typing.List):
             return value
         else:
-            raise XGBoostError('invalid shape %s of FeatureMeta' % value)
+            raise AntXGBoostError('invalid shape %s of FeatureMeta' % value)
 
 
 class SQLFlowDSConfig(typing.NamedTuple):
@@ -63,11 +63,11 @@ def __init__(self, rank: int, num_worker: int, source_conf):
         super().__init__(rank, num_worker, column_conf, source_conf)
 
         if not isinstance(source_conf, SQLFlowDSConfig):
-            raise XGBoostError("SQLFlowDataSource: invalid source conf")
+            raise AntXGBoostError("SQLFlowDataSource: invalid source conf")
 
         # TODO: support tf.feature_column transformation
         if source_conf.is_tf_integrated:
-            raise XGBoostError('So far, tf transformation is not supported in xgboost job.')
+            raise AntXGBoostError('So far, tf transformation is not supported in ant-xgboost job.')
 
         self._train = source_conf.is_train
         self._rcd_builder = RecordBuilder(column_conf.features)
@@ -144,7 +144,7 @@ def writer_maker(table_schema):
 
         if not self._train:
             if not source_conf.output_table:
-                raise XGBoostError('Output_table must be defined in xgboost prediction job.')
+                raise AntXGBoostError('Output_table must be defined in xgboost prediction job.')
 
     def _read_impl(self):
         label = None
diff --git a/sql/python/sqlflow_submitter/xgboost/sqlflow_data_source_test.py b/sql/python/sqlflow_submitter/ant_xgboost/sqlflow_data_source_test.py
similarity index 98%
rename from sql/python/sqlflow_submitter/xgboost/sqlflow_data_source_test.py
rename to sql/python/sqlflow_submitter/ant_xgboost/sqlflow_data_source_test.py
index 3af49296a2..fb6214c701 100644
--- a/sql/python/sqlflow_submitter/xgboost/sqlflow_data_source_test.py
+++ b/sql/python/sqlflow_submitter/ant_xgboost/sqlflow_data_source_test.py
@@ -15,7 +15,7 @@
 import os
 from unittest import TestCase
 
-from sqlflow_submitter.xgboost.sqlflow_data_source import SQLFlowDSConfig, SQLFlowDataSource
+from sqlflow_submitter.ant_xgboost.sqlflow_data_source import SQLFlowDSConfig, SQLFlowDataSource
 from launcher import config_helper, config_fields, register_data_source, XGBoostRecord, XGBoostResult
 from launcher.data_source import create_data_source_init_fn
 from sqlflow_submitter.db_test import execute as db_exec
diff --git a/sql/python/test_magic_xgboost.py b/sql/python/test_magic_ant_xgboost.py
similarity index 98%
rename from sql/python/test_magic_xgboost.py
rename to sql/python/test_magic_ant_xgboost.py
index b03a0def98..220a9726e0 100644
--- a/sql/python/test_magic_xgboost.py
+++ b/sql/python/test_magic_ant_xgboost.py
@@ -46,7 +46,7 @@ class TestSQLFlowMagic(unittest.TestCase):
 USING sqlflow_models.my_xgboost_model;
 """
 
-    def test_xgboost(self):
+    def test_antxgboost(self):
         ipython.run_cell_magic("sqlflow", "", self.train_statement)
         ipython.run_cell_magic("sqlflow", "", self.pred_statement)
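Reviewer note (not part of the patch itself): most of `codegen_ant_xgboost.go` above is the attribute-setter table that maps each `WITH train.*` / `pred.*` key to a typed closure over `antXGBoostFiller`. The sketch below illustrates that closure pattern in isolation, under simplified assumptions: `demoFiller` and its fields are hypothetical stand-ins rather than SQLFlow types, and the attribute map is passed by value instead of by pointer as in the patch.

```go
package main

import (
	"fmt"
	"strconv"
)

// demoFiller is a hypothetical stand-in for antXGBoostFiller with just two fields.
type demoFiller struct {
	Objective string
	NumRound  uint
}

// strPartial returns a setter that copies a single string attribute into the
// field selected by ptrFn, mirroring the strPartial helper in the patch.
func strPartial(key string, ptrFn func(*demoFiller) *string) func(map[string][]string, *demoFiller) error {
	return func(attrs map[string][]string, r *demoFiller) error {
		val := attrs[key]
		if len(val) != 1 {
			return fmt.Errorf("attribute %s expects exactly one value, got %d", key, len(val))
		}
		*ptrFn(r) = val[0]
		return nil
	}
}

// uIntPartial does the same for unsigned-integer attributes.
func uIntPartial(key string, ptrFn func(*demoFiller) *uint) func(map[string][]string, *demoFiller) error {
	return func(attrs map[string][]string, r *demoFiller) error {
		val := attrs[key]
		if len(val) != 1 {
			return fmt.Errorf("attribute %s expects exactly one value, got %d", key, len(val))
		}
		n, err := strconv.ParseUint(val[0], 10, 32)
		if err != nil {
			return err
		}
		*ptrFn(r) = uint(n)
		return nil
	}
}

// setterMap plays the role of xgbTrainAttrSetterMap: attribute name -> typed setter.
var setterMap = map[string]func(map[string][]string, *demoFiller) error{
	"train.objective": strPartial("train.objective", func(r *demoFiller) *string { return &r.Objective }),
	"train.num_round": uIntPartial("train.num_round", func(r *demoFiller) *uint { return &r.NumRound }),
}

func main() {
	// Attributes as they might arrive from a parsed SQL WITH clause.
	attrs := map[string][]string{
		"train.objective": {"multi:softmax"},
		"train.num_round": {"30"},
	}
	filler := &demoFiller{}
	for key := range attrs {
		if setter, ok := setterMap[key]; ok {
			if err := setter(attrs, filler); err != nil {
				panic(err)
			}
		}
	}
	fmt.Printf("%+v\n", filler) // &{Objective:multi:softmax NumRound:30}
}
```

The appeal of the pattern is that supporting a new `WITH` attribute only takes one more map entry, so the attribute parsing stays declarative while each setter keeps its own type conversion and validation.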