Skip to content

Commit 09a110c

Browse files
tonyyang-svail and wangkuiyi
authored and committed
pass all tests in pkg/sql (#1014)
1 parent fc728f1 commit 09a110c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

65 files changed

+262
-237
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ Here are examples for training a Tensorflow [DNNClassifer](https://www.tensorflo
3030
```sql
3131
sqlflow> SELECT *
3232
FROM iris.train
33-
TRAIN DNNClassifier
33+
TO TRAIN DNNClassifier
3434
WITH model.n_classes = 3, model.hidden_units = [10, 20]
3535
COLUMN sepal_length, sepal_width, petal_length, petal_width
3636
LABEL class

cmd/sqlflowserver/main_test.go

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -529,7 +529,7 @@ func CaseTrainSQL(t *testing.T) {
529529
a := assert.New(t)
530530
trainSQL := fmt.Sprintf(`SELECT *
531531
FROM %s.%s
532-
TRAIN DNNClassifier
532+
TO TRAIN DNNClassifier
533533
WITH model.n_classes = 3, model.hidden_units = [10, 20]
534534
COLUMN sepal_length, sepal_width, petal_length, petal_width
535535
LABEL class
@@ -587,7 +587,7 @@ func CaseTrainCustomModel(t *testing.T) {
587587
a := assert.New(t)
588588
trainSQL := `SELECT *
589589
FROM iris.train
590-
TRAIN sqlflow_models.DNNClassifier
590+
TO TRAIN sqlflow_models.DNNClassifier
591591
WITH model.n_classes = 3, model.hidden_units = [10, 20]
592592
COLUMN sepal_length, sepal_width, petal_length, petal_width
593593
LABEL class
@@ -643,7 +643,7 @@ func CaseTrainTextClassification(t *testing.T) {
643643
a := assert.New(t)
644644
trainSQL := `SELECT *
645645
FROM text_cn.train_processed
646-
TRAIN DNNClassifier
646+
TO TRAIN DNNClassifier
647647
WITH model.n_classes = 17, model.hidden_units = [10, 20]
648648
COLUMN EMBEDDING(CATEGORY_ID(news_title,16000,COMMA),128,mean)
649649
LABEL class_id
@@ -671,7 +671,7 @@ func CaseTrainTextClassificationCustomLSTM(t *testing.T) {
671671
a := assert.New(t)
672672
trainSQL := `SELECT *
673673
FROM text_cn.train_processed
674-
TRAIN sqlflow_models.StackedBiLSTMClassifier
674+
TO TRAIN sqlflow_models.StackedBiLSTMClassifier
675675
WITH model.n_classes = 17, model.stack_units = [16], train.epoch = 1, train.batch_size = 32
676676
COLUMN EMBEDDING(SEQ_CATEGORY_ID(news_title,1600,COMMA),128,mean)
677677
LABEL class_id
@@ -697,7 +697,7 @@ func CaseTrainSQLWithHyperParams(t *testing.T) {
697697
a := assert.New(t)
698698
trainSQL := `SELECT *
699699
FROM iris.train
700-
TRAIN DNNClassifier
700+
TO TRAIN DNNClassifier
701701
WITH model.n_classes = 3, model.hidden_units = [10, 20], train.batch_size = 10, train.epoch = 2
702702
COLUMN sepal_length, sepal_width, petal_length, petal_width
703703
LABEL class
@@ -723,7 +723,7 @@ func CaseTrainDeepWideModel(t *testing.T) {
723723
a := assert.New(t)
724724
trainSQL := `SELECT *
725725
FROM iris.train
726-
TRAIN DNNLinearCombinedClassifier
726+
TO TRAIN DNNLinearCombinedClassifier
727727
WITH model.n_classes = 3, model.dnn_hidden_units = [10, 20], train.batch_size = 10, train.epoch = 2
728728
COLUMN sepal_length, sepal_width FOR linear_feature_columns
729729
COLUMN petal_length, petal_width FOR dnn_feature_columns
@@ -751,7 +751,7 @@ func CaseTrainCustomModelWithHyperParams(t *testing.T) {
751751
a := assert.New(t)
752752
trainSQL := `SELECT *
753753
FROM iris.train
754-
TRAIN sqlflow_models.DNNClassifier
754+
TO TRAIN sqlflow_models.DNNClassifier
755755
WITH model.n_classes = 3, model.hidden_units = [10, 20], train.batch_size = 10, train.epoch=2
756756
COLUMN sepal_length, sepal_width, petal_length, petal_width
757757
LABEL class
@@ -777,7 +777,7 @@ func CaseSparseFeature(t *testing.T) {
777777
a := assert.New(t)
778778
trainSQL := `SELECT *
779779
FROM text_cn.train
780-
TRAIN DNNClassifier
780+
TO TRAIN DNNClassifier
781781
WITH model.n_classes = 3, model.hidden_units = [10, 20]
782782
COLUMN EMBEDDING(CATEGORY_ID(news_title,16000,COMMA),128,mean)
783783
LABEL class_id
@@ -804,7 +804,7 @@ func CaseTrainElasticDL(t *testing.T) {
804804
a := assert.New(t)
805805
trainSQL := fmt.Sprintf(`SELECT sepal_length, sepal_width, petal_length, petal_width, class
806806
FROM %s.%s
807-
TRAIN ElasticDLDNNClassifier
807+
TO TRAIN ElasticDLDNNClassifier
808808
WITH
809809
model.optimizer = "optimizer",
810810
model.loss = "loss",
@@ -866,7 +866,7 @@ func CaseTrainALPS(t *testing.T) {
866866
trainSQL := fmt.Sprintf(`SELECT deep_id, user_space_stat, user_behavior_stat, space_stat, l
867867
FROM %s.sparse_column_test
868868
LIMIT 100
869-
TRAIN DNNClassifier
869+
TO TRAIN DNNClassifier
870870
WITH model.n_classes = 2, model.hidden_units = [10, 20], train.batch_size = 10, engine.ps_num=0, engine.worker_num=0, engine.type=local
871871
COLUMN SPARSE(deep_id,15033,COMMA,int),
872872
SPARSE(user_space_stat,310,COMMA,int),
@@ -901,7 +901,7 @@ func CaseTrainALPSRemoteModel(t *testing.T) {
901901
trainSQL := fmt.Sprintf(`SELECT deep_id, user_space_stat, user_behavior_stat, space_stat, l
902902
FROM %s.sparse_column_test
903903
LIMIT 100
904-
TRAIN models.estimator.dnn_classifier.DNNClassifier
904+
TO TRAIN models.estimator.dnn_classifier.DNNClassifier
905905
WITH
906906
model.n_classes = 2, model.hidden_units = [10, 20], train.batch_size = 10, engine.ps_num=0, engine.worker_num=0, engine.type=local,
907907
gitlab.project = "Alps/sqlflow-models",
@@ -940,7 +940,7 @@ func CaseTrainALPSFeatureMap(t *testing.T) {
940940
trainSQL := fmt.Sprintf(`SELECT dense, deep, item, test_sparse_with_fm.label
941941
FROM %s.test_sparse_with_fm
942942
LIMIT 32
943-
TRAIN alipay.SoftmaxClassifier
943+
TO TRAIN alipay.SoftmaxClassifier
944944
WITH train.max_steps = 32, eval.steps=32, train.batch_size=8, engine.ps_num=0, engine.worker_num=0, engine.type = local
945945
COLUMN DENSE(dense, none, comma),
946946
DENSE(item, 1, comma, int)
@@ -991,7 +991,7 @@ func CaseTrainRegression(t *testing.T) {
991991
a := assert.New(t)
992992
trainSQL := fmt.Sprintf(`SELECT *
993993
FROM housing.train
994-
TRAIN LinearRegressor
994+
TO TRAIN LinearRegressor
995995
WITH model.label_dimension=1
996996
COLUMN f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13
997997
LABEL target
@@ -1053,7 +1053,7 @@ func CaseTrainXGBoostRegression(t *testing.T) {
10531053
trainSQL := fmt.Sprintf(`
10541054
SELECT *
10551055
FROM housing.train
1056-
TRAIN xgboost.gbtree
1056+
TO TRAIN xgboost.gbtree
10571057
WITH
10581058
objective="reg:squarederror",
10591059
train.num_boost_round = 30

doc/design/design_alps_submitter.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ The column `c1` is dense encoded and `c2` is sparse encoded, `c3` is label colum
8989
select
9090
c1, c2, c3 as class
9191
from kaggle_credit_fraud_training_data
92-
TRAIN DNNClassifier
92+
TO TRAIN DNNClassifier
9393
WITH
9494
...
9595
COLUMN
@@ -148,7 +148,7 @@ Here is an example which do `BUCKETIZED` on `c2` then `CROSS` with `c1`.
148148
select
149149
c1, c2, c3 as class
150150
from kaggle_credit_fraud_training_data
151-
TRAIN DNNClassifier
151+
TO TRAIN DNNClassifier
152152
WITH
153153
...
154154
COLUMN
@@ -162,7 +162,7 @@ Feature Expressions except for Tensorflow Feature Column API should raise an err
162162
```sql
163163
/* Not supported */
164164
select * from kaggle_credit_fraud_training_data
165-
TRAIN DNNClassifier
165+
TO TRAIN DNNClassifier
166166
WITH
167167
...
168168
COLUMN
@@ -206,7 +206,7 @@ Let's create a DNNClassifier example, the minimum parameters of the constructor
206206
select
207207
c1, c2, c3 as class
208208
from kaggle_credit_fraud_training_data
209-
TRAIN DNNClassifier
209+
TO TRAIN DNNClassifier
210210
WITH
211211
estimator.hidden_units = [10, 20],
212212
train_spec.max_steps = 2000,
@@ -223,7 +223,7 @@ For now, we will pass the result of snippet code as `feature_columns` parameters
223223
select
224224
c1, c2, c3, c4, c5 as class
225225
from kaggle_credit_fraud_training_data
226-
TRAIN DNNLinearCombinedClassifier
226+
TO TRAIN DNNLinearCombinedClassifier
227227
WITH
228228
linear_feature_columns = [fc1, fc2]
229229
dnn_feature_columns = [fc3]
@@ -234,4 +234,4 @@ COLUMN
234234
CROSS([fc1, fc2, f3]) as fc3
235235
LABEL class
236236
...
237-
```
237+
```

doc/design/design_analyzer.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,13 @@ This design doc introduces how to support the `Analyze SQL` in SQLFlow with SHAP
88

99
## User Interface
1010

11-
Users usually use a **TRAIN SQL** to train a model and then analyze the model using an **ANALYZE SQL**, the simple pipeline like:
11+
Users usually use a **TO TRAIN SQL** statement to train a model and then analyze the model using an **ANALYZE SQL** statement. A simple pipeline looks like:
1212

1313
Train SQL:
1414

1515
``` sql
1616
SELECT * FROM train_table
17-
TRAIN xgboost.Estimator
17+
TO TRAIN xgboost.Estimator
1818
WITH
1919
train.objective = "reg:linear"
2020
COLUMN x

doc/design/design_ant_xgboost.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ Comparing to python API provided by `xgboost`, it is easier to build a python co
3333
### User Experience
3434

3535
In terms of sqlflow users, xgboost is an alternative `Estimator` like `TensorFlow Estimators`.
36-
Working with xgboost is quite similar to working with TensorFlow Estimators; just change `TRAIN DNNClassifier` into `TRAIN XGBoostEstimator`.
36+
Working with xgboost is quite similar to working with TensorFlow Estimators; just change `TO TRAIN DNNClassifier` into `TO TRAIN XGBoostEstimator`.
3737

3838
In addition, xgboost specific parameters can be configured in the same way as TensorFlow parameters.
3939

@@ -44,7 +44,7 @@ Below is a demo about training/predicting via xgboost :
4444
select
4545
c1, c2, c3, c4, c5 as class
4646
from kaggle_credit_fraud_training_data
47-
TRAIN XGBoostEstimator
47+
TO TRAIN XGBoostEstimator
4848
WITH
4949
booster = "gbtree"
5050
objective = "logistic:binary"

doc/design/design_clustermodel.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@ The figure below demonstrates the overall workflow for cluster model training, w
2222

2323
In this scenario, we focus on the extraction of data patterns in unsupervised learning.
2424

25-
So, the user can use `TRAIN` keyword to training a model. The user can also specify the training hyper-parameters with the keyword `WITH` and determine whether to use pre-trained model by `USING`. The training and predicting syntax looks like:
25+
So, the user can use the `TO TRAIN` keyword to train a model. The user can also specify the training hyper-parameters with the keyword `WITH` and determine whether to use a pre-trained model with `USING`. The training and predicting syntax looks like:
2626

27-
TRAIN SQL:
27+
TO TRAIN SQL:
2828

2929
``` sql
3030
SELECT * FROM input_table
31-
TRAIN clusterModel
31+
TO TRAIN clusterModel
3232
WITH
3333
model.encode_units = [100, 7]
3434
model.n_clusters = 5

doc/design/design_database_abstraction_layer.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ SQLFlow calls Go's [standard database API](https://golang.org/pkg/database/sql/)
88

99
### Data Retrieval
1010

11-
The basic idea of SQLFlow is to extend the SELECT statement of SQL to have the TRAIN and PREDICT clauses. For more discussion, please refer to the [syntax design](/doc/design/design_syntax.md). SQLFlow translates such "extended SQL statements" into submitter programs, which forward the part from SELECT to TRAIN or PREDICT, which we call the "standard part", to the SQL engine. SQLFlow also accepts the SELECT statement without TRAIN or PREDICT clauses and would forward such "standard statements" to the engine. It is noticeable that the "standard part" or "standard statements" are not standardized. For example, various engines use different syntax for `FULL OUTER JOIN`.
11+
The basic idea of SQLFlow is to extend the SELECT statement of SQL to have the TO TRAIN and PREDICT clauses. For more discussion, please refer to the [syntax design](/doc/design/design_syntax.md). SQLFlow translates such "extended SQL statements" into submitter programs, which forward the part from SELECT to TO TRAIN or PREDICT, which we call the "standard part", to the SQL engine. SQLFlow also accepts the SELECT statement without TO TRAIN or PREDICT clauses and would forward such "standard statements" to the engine. It is noticeable that the "standard part" or "standard statements" are not standardized. For example, various engines use different syntax for `FULL OUTER JOIN`.
1212

1313
- Hive supports `FULL OUTER JOIN` directly.
1414
- MySQL doesn't have `FULL OUTER JOIN`. However, a user can emulate `FULL OUTER JOIN` using `LEFT JOIN`, `UNION` and `RIGHT JOIN`.
@@ -24,7 +24,7 @@ SELECT
2424
name,
2525
age,
2626
income
27-
FROM employee TRAIN DNNRegressor
27+
FROM employee TO TRAIN DNNRegressor
2828
WITH hidden_layers=[10,50,10]
2929
COLUMN name, agee LABEL income;
3030
```

doc/design/design_elasticdl_on_sqlflow.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ This is a design doc on integration with [ElasticDL](https://github.com/wangkuiy
1212
SELECT
1313
c1, c2, c3, c4, c5 as class
1414
FROM training_data
15-
TRAIN ElasticDLKerasClassifier
15+
TO TRAIN ElasticDLKerasClassifier
1616
WITH
1717
optimizer = "optimizer",
1818
loss = "loss",
@@ -47,7 +47,7 @@ Users can provide run-time configurations to ElasticDL job via additional parame
4747
SELECT
4848
c1, c2, c3, c4, c5 as class
4949
FROM training_data
50-
TRAIN ElasticDLKerasClassifier
50+
TO TRAIN ElasticDLKerasClassifier
5151
WITH
5252
optimizer = "optimizer",
5353
loss = "loss",
@@ -73,7 +73,7 @@ INTO trained_elasticdl_keras_classifier;
7373
Steps:
7474

7575
1. Based on `SELECT ... FROM ...`, read ODPS table and write it to [RecordIO](https://github.com/wangkuiyi/recordio) files, including both features and labels. These files will be stored in [Kubernetes Persistent Volumes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/). In the future, we will support reading ODPS table directly without having to convert it to RecordIO files.
76-
2. Generate model definition file (e.g. [cifar10_functional_api.py](https://github.com/wangkuiyi/elasticdl/blob/develop/model_zoo/cifar10_functional_api/cifar10_functional_api.py)) that will be used in `TRAIN` clause, which includes:
76+
2. Generate model definition file (e.g. [cifar10_functional_api.py](https://github.com/wangkuiyi/elasticdl/blob/develop/model_zoo/cifar10_functional_api/cifar10_functional_api.py)) that will be used in `TO TRAIN` clause, which includes:
7777

7878
- In model definition function e.g. `custom_model()`, we need to configure model input and output shapes correctly in `inputs = tf.keras.layers.Input(shape=<input_shape>)` (only when the model is defined using `tf.keras` functional APIs) and `outputs = tf.keras.layers.Dense(<num_classes>)`(based on `COLUMN ... LABEL ...`). For this MVP, users can provide `<input_shape>` and `<num_classes>` using `WITH` clause which will then get passed to the model constructor `custom_model(input_shape, num_classes)` via `--params` argument in ElasticDL high-level API. In the future, this will be inferred from the ODPS table.
7979
- Pass additional parameters from `WITH` clause to `custom_model()`'s instantiation, such as `optimizer` and `loss`.

doc/design/design_feature_derivation.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ assume that we are using all columns to train and no longer need to write
2424

2525
```sql
2626
SELECT * FROM creditcardfraud
27-
TRAIN DNNClassifier
27+
TO TRAIN DNNClassifier
2828
LABEL class
2929
INTO my_model_name;
3030
```
@@ -38,7 +38,7 @@ the SQL statement should look like:
3838

3939
```sql
4040
SELECT * FROM creditcardfraud
41-
TRAIN DNNClassifier
41+
TO TRAIN DNNClassifier
4242
COLUMN YOUR_NORMALIZE_FUNC(time)
4343
LABEL class
4444
INTO my_model_name;
@@ -64,7 +64,7 @@ statement, the SQL statement for the above case should be like:
6464

6565
```sql
6666
SELECT * FROM training_table
67-
TRAIN DNNClassifier
67+
TO TRAIN DNNClassifier
6868
COLUMN EMBEDDING(c, 128, "sum"),
6969
EMBEDDING(SPARSE(d, [1000000]), 512, "sum")
7070
LABEL label
@@ -75,7 +75,7 @@ You can also write the full description of every column like below:
7575

7676
```sql
7777
SELECT a, b, c, d, label FROM training_table
78-
TRAIN DNNClassifier
78+
TO TRAIN DNNClassifier
7979
COLUMN a, b,
8080
EMBEDDING(DENSE(c, [64]), 128, "sum"),
8181
EMBEDDING(SPARSE(d, [1000000]), 512, "sum")

doc/design/design_high-available_sqlflow_server.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -196,11 +196,11 @@ func (r *KubernetesJobRunner) fetch(jobID string) (*pb.Result, error) (
196196
197197
### Store the Trained Model
198198
199-
For example, a tinny `TRAIN` statement:
199+
For example, a tiny `TO TRAIN` statement:
200200
201201
``` sql
202202
SELECT ...
203-
TRAIN DNNClassifer
203+
TO TRAIN DNNClassifer
204204
WITH
205205
...
206206
COLUMN ...
@@ -209,7 +209,7 @@ INTO sqlflow_model
209209
210210
This SQL statement would save the model named `sqlflow_model`, which contains two parts:
211211
212-
1. The `TRAIN` statement, which would be saved as a `.mod` file.
212+
1. The `TO TRAIN` statement, which would be saved as a `.mod` file.
213213
1. The Model weights, which would be saved as a `.tar.gz` file.
214214
215215
An example of a trained model folder is as follows:

0 commit comments

Comments
 (0)