[Syntax] Change PREDICT to TO PREDICT (#1015)

tonyyang-svail · wangkuiyi · commit 32fb9df38453 · 2019-10-15T14:59:32.000-07:00
* pass all tests in pkg/sql

* pass lexer test

* pass tests at pkg/sql/
diff --git a/README.md b/README.md
@@ -44,7 +44,7 @@ Done training
 ```sql
 sqlflow> SELECT *
 FROM iris.test
-PREDICT iris.predict.class
+TO PREDICT iris.predict.class
 USING sqlflow_models.my_dnn_model;
 
 ...
diff --git a/cmd/sqlflowserver/main_test.go b/cmd/sqlflowserver/main_test.go
@@ -551,7 +551,7 @@ INTO sqlflow_models.my_dnn_model;`, caseDB, caseTrainTable)
 	ParseRow(stream)
 	predSQL := fmt.Sprintf(`SELECT *
 FROM %s.%s
-PREDICT %s.%s.class
+TO PREDICT %s.%s.class
 USING sqlflow_models.my_dnn_model;`, caseDB, caseTestTable, caseDB, casePredictTable)
 
 	stream, err = cli.Run(ctx, sqlRequest(predSQL))
@@ -610,7 +610,7 @@ INTO sqlflow_models.my_dnn_model_custom;`
 
 	predSQL := `SELECT *
 FROM iris.test
-PREDICT iris.predict.class
+TO PREDICT iris.predict.class
 USING sqlflow_models.my_dnn_model_custom;`
 
 	stream, err = cli.Run(ctx, sqlRequest(predSQL))
@@ -1014,7 +1014,7 @@ INTO sqlflow_models.my_regression_model;`)
 
 	predSQL := fmt.Sprintf(`SELECT *
 FROM housing.test
-PREDICT housing.predict.target
+TO PREDICT housing.predict.target
 USING sqlflow_models.my_regression_model;`)
 
 	stream, err = cli.Run(ctx, sqlRequest(predSQL))
@@ -1090,7 +1090,7 @@ func CasePredictXGBoostRegression(t *testing.T) {
 
 	predSQL := fmt.Sprintf(`SELECT *
 FROM housing.test
-PREDICT housing.xgb_predict.target
+TO PREDICT housing.xgb_predict.target
 USING sqlflow_models.my_xgb_regression_model;`)
 
 	stream, err := cli.Run(ctx, sqlRequest(predSQL))
diff --git a/doc/design/design_ant_xgboost.md b/doc/design/design_ant_xgboost.md
@@ -62,7 +62,7 @@ INTO sqlflow_models.xgboost_model_table;
 select 
     c1, c2, c3, c4
 from kaggle_credit_fraud_development_data
-PREDICT kaggle_credit_fraud_development_data.class
+TO PREDICT kaggle_credit_fraud_development_data.class
 USING sqlflow_models.xgboost_model_table;
 ```
 
diff --git a/doc/design/design_clustermodel.md b/doc/design/design_clustermodel.md
@@ -38,12 +38,12 @@ USING existed_pretrain_model
 INTO my_cluster_model;
 ```
 
-PREDICT SQL:
+TO PREDICT SQL:
 
 ``` sql
 SELECT *
 FROM input_table
-PREDICT output_table.group_id
+TO PREDICT output_table.group_id
 USING my_cluster_model;
 ```
 
@@ -108,7 +108,7 @@ Therefore, there are four cases in total:
 
 - In the first stage of the clustering model on SQLFlow, we plan to achieve the `first case`. We will achieve the other cases in the later. 
 
-- Users can use the trained cluster model in ` PREDICT SQL` to predict the group of input_table to get output_table.
+- Users can use the trained cluster model in ` TO PREDICT SQL` to predict the group of input_table to get output_table.
 
 - Finally, the user can perform a combined aggregation operation on the output_table based on the SQL statement to obtain a result_table, which can be saved to the local dataframe and then analyzed according to his own needs.
 
diff --git a/doc/design/design_database_abstraction_layer.md b/doc/design/design_database_abstraction_layer.md
@@ -8,7 +8,7 @@ SQLFlow calls Go's [standard database API](https://golang.org/pkg/database/sql/)
 
 ### Data Retrieval
 
-The basic idea of SQLFlow is to extend the SELECT statement of SQL to have the TO TRAIN and PREDICT clauses.  For more discussion, please refer to the [syntax design](/doc/design/design_syntax.md).  SQLFlow translates such "extended SQL statements" into submitter programs, which forward the part from SELECT to TO TRAIN or PREDICT, which we call the "standard part", to the SQL engine.  SQLFlow also accepts the SELECT statement without TO TRAIN or PREDICT clauses and would forward such "standard statements" to the engine.  It is noticeable that the "standard part" or "standard statements" are not standardized.  For example, various engines use different syntax for `FULL OUTER JOIN`.
+The basic idea of SQLFlow is to extend the SELECT statement of SQL to have the TO TRAIN and TO PREDICT clauses.  For more discussion, please refer to the [syntax design](/doc/design/design_syntax.md).  SQLFlow translates such "extended SQL statements" into submitter programs, which forward the part from SELECT to TO TRAIN or TO PREDICT, which we call the "standard part", to the SQL engine.  SQLFlow also accepts the SELECT statement without TO TRAIN or TO PREDICT clauses and would forward such "standard statements" to the engine.  It is worth noting that the "standard part" or "standard statements" are not standardized.  For example, various engines use different syntax for `FULL OUTER JOIN`.
 
 - Hive supports `FULL OUTER JOIN` directly.
 - MySQL doesn't have `FULL OUTER JOIN`. However, a user can emulates `FULL OUTER JOIN` using `LEFT JOIN`, `UNION` and `RIGHT JOIN`.
diff --git a/doc/design/design_elasticdl_on_sqlflow.md b/doc/design/design_elasticdl_on_sqlflow.md
@@ -33,7 +33,7 @@ INTO trained_elasticdl_keras_classifier;
 SELECT
     c1, c2, c3, c4
 FROM prediction_data
-PREDICT prediction_results_table
+TO PREDICT prediction_results_table
 WITH
   num_classes = 10
 USING trained_elasticdl_keras_classifier;
diff --git a/doc/design/design_submitter.md b/doc/design/design_submitter.md
@@ -36,7 +36,7 @@ type TrainDescription struct {
 
 // SELECT *
 // FROM iris.test
-// PREDICT iris.predict.class
+// TO PREDICT iris.predict.class
 // USING sqlflow_models.my_dnn_model;
 type PredDescription struct {
     StandardSelect string // e.g. SELECT * FROM iris.test
diff --git a/doc/design/design_support_multiple_sql_statements.md b/doc/design/design_support_multiple_sql_statements.md
@@ -20,7 +20,7 @@ While splitting at the client-side is relatively simple to implement. We prefer
 
 ### Splitting Technique: Hybrid Parser vs. Lexer
 
-The hybrid parser solution uses the third-party SQL parser (like [TiDB parser](https://github.com/pingcap/parser/blob/master/parser.y)) and SQLFlow parser to determine the end of an SQL statement. The third-party SQL parser first parses the extended SQL statement. It will raise error near SQLFlow extended keywords, like TO TRAIN and PREDICT. Then the SQLFlow parser starts from the error position and stops at the end of the first statement. However, this solution relies on the third-party SQL parser to report the error **accurately** on the keywords, like TO TRAIN and PREDICT, that it can't recognize.
+The hybrid parser solution uses the third-party SQL parser (like [TiDB parser](https://github.com/pingcap/parser/blob/master/parser.y)) and SQLFlow parser to determine the end of an SQL statement. The third-party SQL parser first parses the extended SQL statement. It will raise an error near SQLFlow extended keywords, like TO TRAIN and TO PREDICT. Then the SQLFlow parser starts from the error position and stops at the end of the first statement. However, this solution relies on the third-party SQL parser to report the error **accurately** on the keywords, like TO TRAIN and TO PREDICT, that it can't recognize.
 
 The lexer solution scans the entire SQL statements, finds the `;` tokens, and splits the SQL based on the position of  `;` token.
 
diff --git a/doc/design/design_syntax.md b/doc/design/design_syntax.md
@@ -131,7 +131,7 @@ Similarly, to infer the class (fraud or regular), we could
 
 ```sql
 SELECT * FROM kaggle_credit_fraud_development_data
-PREDICT kaggle_credit_fraud_development_data.class
+TO PREDICT kaggle_credit_fraud_development_data.class
 USING sqlflow_models.my_model_table;
 ```
 
diff --git a/doc/design/design_xgboost_on_sqlflow.md b/doc/design/design_xgboost_on_sqlflow.md
@@ -24,7 +24,7 @@ The following example shows how to predict using the model `my_xgb_model`.
 
 ``` sql
 SELECT * FROM test_table
-PREDICT pred_table.result
+TO PREDICT pred_table.result
 USING my_xgb_model;
 ```
 
@@ -42,4 +42,4 @@ The code generator `codegen_xgboost.go` outputs an XGBoost program in Python. It
 1. It tells the SQL engine to run the SELECT statement and retrieve the training/test data. It saves the data into a text file, which could be loaded by XGBoost using the DMatrix interface.
 1. Parse and resolve the WITH clause to fill the `xgboost.train` arguments and the XGBoost Parameters.
 1. Save the trained model on disk.
-1. For the PREDICT clause, it loads the trained model and test data and then outputs the prediction result to a SQL engine.
+1. For the TO PREDICT clause, it loads the trained model and test data and then outputs the prediction result to a SQL engine.
diff --git a/doc/language_guide.md b/doc/language_guide.md
@@ -314,7 +314,7 @@ SELECT select_expr [, select_expr ...]
 FROM table_references
   [WHERE where_condition]
   [LIMIT row_count]
-PREDICT result_table_reference
+TO PREDICT result_table_reference
 [WITH
   attr_expr [, attr_expr ...]]
 USING model_table_reference;
@@ -329,7 +329,7 @@ The [select clause](#select-clause) syntax is the same as the select clause synt
 The *predict clause* describes the result table that a prediction job should write to, the table a prediction job should load the model from, and necessary configuration attributes for a prediction job.
 
 ```
-PREDICT result_table_reference
+TO PREDICT result_table_reference
 [WITH
   attr_expr [, attr_expr ...]]
 USING model_table_reference;
@@ -343,7 +343,7 @@ For example, if we want to save the predicted result into table `iris.predict` a
 
 ```
 SELECT ...
-PREDICT iris.predict.class
+TO PREDICT iris.predict.class
 USING sqlflow.my_dnn_model;
 ```
 
diff --git a/doc/run/repl.md b/doc/run/repl.md
@@ -51,7 +51,7 @@ To predict using the trained model, we can type the following statement.
 ```sql
 sqlflow> SELECT *
 FROM iris.test
-PREDICT iris.predict.class
+TO PREDICT iris.predict.class
 USING sqlflow_models.my_dnn_model;
 
 ...
diff --git a/doc/talk/201812/pred.sql b/doc/talk/201812/pred.sql
@@ -1,6 +1,6 @@
 sqlflow> SELECT *
 FROM iris.test
-PREDICT iris.predict.class
+TO PREDICT iris.predict.class
 USING sqlflow_models.my_dnn_model;
 -----------------------------
 2018/12/16 15:05:58 tensorflowCmd: run in Docker container
diff --git a/doc/talk/201905/pred.sql b/doc/talk/201905/pred.sql
@@ -1,6 +1,6 @@
 sqlflow> SELECT *
 FROM iris.test
-PREDICT iris.predict.class
+TO PREDICT iris.predict.class
 USING sqlflow_models.my_dnn_model;
 -----------------------------
 2018/12/16 15:05:58 tensorflowCmd: run in Docker container
diff --git a/doc/talk/201906/pred.sql b/doc/talk/201906/pred.sql
@@ -1,3 +1,3 @@
 SELECT * FROM creditcardfraud
-PREDICT creditcardfraud.predict.class
-USING my_dnn_model;
+TO PREDICT creditcardfraud.predict.class
+USING my_dnn_model;
diff --git a/doc/talk/201906/sqlflow.slide b/doc/talk/201906/sqlflow.slide
@@ -39,7 +39,7 @@ The current process:
 * Extended SQL Syntax
 - Some SQL+AI solutions use user-defined functions (UDF) to add AI features to SQL.
 - SQLFlow wants to work with many SQL engines (MySQL, Hive, SparkSQL, MaxCompute). It is intractable to reimplement a set of UDF for each of these engines.
-- SQLFlow extends SQL syntax by allowing users to append a TO TRAIN or PREDICT clause to a SELECT statement.
+- SQLFlow extends SQL syntax by allowing users to append a TO TRAIN or TO PREDICT clause to a SELECT statement.
 
 * The Architecture
 
@@ -56,7 +56,7 @@ The current process:
 * Challenges
 
 - How to convert SELECT results from a table of rows into models inputs (a tensor of float values).
-- How to call SQL engines’ native parser to parse the SELECT statement before the TO TRAIN/PREDICT clause.
+- How to call SQL engines’ native parser to parse the SELECT statement before the TO TRAIN/TO PREDICT clause.
 
 * Design: Feature Derivation
 
@@ -111,7 +111,7 @@ And this will generate feature column code like:
 
 * Design: Integrating Parsers
 
-If someone already has a complex SELECT statement for data cleaning and augmentation, he could simply add a TO TRAIN or PREDICT clause to enable AI.
+If someone already has a complex SELECT statement for data cleaning and augmentation, he could simply add a TO TRAIN or TO PREDICT clause to enable AI.
 
 Challenges:
 
diff --git a/doc/tutorial/aspara2019/activepower_clustering/README.md b/doc/tutorial/aspara2019/activepower_clustering/README.md
@@ -86,9 +86,9 @@ Firstly, we can use a standard SQL to fetch the prediction data:
 SELECT * FROM activepower.train.
 ```
 
-Next, we can specify the prediction result table by PREDICT clause:
+Next, we can specify the prediction result table by the TO PREDICT clause:
 ```text
-PREDICT activepower.predict.class
+TO PREDICT activepower.predict.class
 ```
 
 Then, we can specify the trained model by USING clause:
@@ -101,7 +101,7 @@ Putting it all together, the following is the SQLFLow Prediction statement:
 %%sqlflow
 SELECT * 
 FROM activepower.train
-PREDICT activepower.predict.class
+TO PREDICT activepower.predict.class
 USING sqlflow_models.my_customized_model;
 ```
 
diff --git a/doc/tutorial/aspara2019/carprice_xgboost/README.md b/doc/tutorial/aspara2019/carprice_xgboost/README.md
@@ -106,10 +106,10 @@ First, we can specify the trained model by USING clause:
 USING sqlflow_models.my_xgb_regression_model
 ```
 
-Then, we can specify the prediction result table by PREDICT clause:
+Then, we can specify the prediction result table by the TO PREDICT clause:
 
 ```text
-PREDICT carprice.predict.msrp
+TO PREDICT carprice.predict.msrp
 ```
 
 And using a standard SQL to fetch the prediction data.
@@ -123,7 +123,7 @@ Finally, the following is the SQLFLow Prediction statement:
 ```sql
 %%sqlflow
 SELECT * FROM carprice.test
-PREDICT carprice.predict.msrp
+TO PREDICT carprice.predict.msrp
 USING sqlflow_models.my_xgb_regression_model;
 ```
 
diff --git a/doc/tutorial/aspara2019/titanic_dnn/README.md b/doc/tutorial/aspara2019/titanic_dnn/README.md
@@ -106,7 +106,7 @@ Say we want to use the model stored at `sqlflow_models.my_dnn_model`, and read t
 %%sqlflow
 SELECT *
 FROM titanic.test
-PREDICT titanic.predict.survived
+TO PREDICT titanic.predict.survived
 USING sqlflow_models.my_dnn_model;
 
 SELECT *
diff --git a/doc/tutorial/fraud-dnn.md b/doc/tutorial/fraud-dnn.md
@@ -34,7 +34,7 @@ to do predict:
 %%sqlflow
 SELECT * from creditcard.creditcard
 WHERE class=1
-PREDICT creditcard.predict.class
+TO PREDICT creditcard.predict.class
 USING creditcard.creditcard_deep_model;
 ```
 
diff --git a/doc/tutorial/housing-xgboost.md b/doc/tutorial/housing-xgboost.md
@@ -95,10 +95,10 @@ First, we can specify the trained model by `USING clause`:
 USING sqlflow_models.my_xgb_regression_model
 ```
 
-Than, we can specify the prediction result table by `PREDICT clause`:
+Then, we can specify the prediction result table by `TO PREDICT clause`:
 
 ```
-PREDICT boston.predict.medv
+TO PREDICT boston.predict.medv
 ```
 
 And using a standar SQL to fetch the prediction data:
@@ -112,7 +112,7 @@ Finally, the following is the SQLFLow Prediction statment:
 ```sql
 %%sqlflow
 SELECT * FROM boston.test
-PREDICT boston.predict.medv
+TO PREDICT boston.predict.medv
 USING sqlflow_models.my_xgb_regression_model;
 ```
 
diff --git a/doc/tutorial/imdb-stackedbilstm.md b/doc/tutorial/imdb-stackedbilstm.md
@@ -37,7 +37,7 @@ segmented by spaces. You can download the full dataset from:
     %%sqlflow
     SELECT *
     FROM imdb.test
-    PREDICT imdb.predict.class
+    TO PREDICT imdb.predict.class
     USING sqlflow_models.my_text_model_en;
     ```
 1. Then you can get predict result from table `imdb.predict`.
@@ -151,7 +151,7 @@ you may need to follow the below steps:
     %%sqlflow
     SELECT *
     FROM toutiao.test_processed
-    PREDICT toutiao.predict.class_id
+    TO PREDICT toutiao.predict.class_id
     USING sqlflow_models.my_text_model;
     ```
 1. Then you can get predict result from table `toutiao.predict`:
diff --git a/doc/walkthrough.md b/doc/walkthrough.md
@@ -14,10 +14,10 @@ SELECT * FROM a_table TO TRAIN DNNClassifier WITH learning_rate=0.01 INTO sqlflo
 And the following statement uses the trained model for prediction.
 
 ```sql
-SELECT * FROM b_table PREDICT b_table.predicted_label USING sqlflow_models.my_model;
+SELECT * FROM b_table TO PREDICT b_table.predicted_label USING sqlflow_models.my_model;
 ```
 
-Please be aware that the part in the above statements before the extended keyword TO TRAIN and PREDICT is a standard SQL statement. This feature simplifies the implementation of the SQLFlow system.
+Please be aware that the part in the above statements before the extended keywords TO TRAIN and TO PREDICT is a standard SQL statement. This feature simplifies the implementation of the SQLFlow system.
 
 ## System Implementation
 
diff --git a/parser/README.md b/parser/README.md
@@ -7,10 +7,10 @@ Typical SQLFlow statements are like the following.
 ```sql
 SELECT * FROM training_data TO TRAIN DNNClassifier LABEL kind INTO my_model;
 
-SELECT * FROM testing_data PREDICT testing_data.predicted_kind USING my_model;
+SELECT * FROM testing_data TO PREDICT testing_data.predicted_kind USING my_model;
 ```
 
-The point is, assuming someone already has a SELECT statement for data cleaning and augmentation, (s)he could add a TO TRAIN or PREDICT clause to enable AI, no matter how complex the statement is or if it is hundreds of lines of code including nested SELECT.
+The point is, assuming someone already has a SELECT statement for data cleaning and augmentation, (s)he could add a TO TRAIN or TO PREDICT clause to enable AI, no matter how complex the statement is or if it is hundreds of lines of code including nested SELECT.
 
 A significant challenge for SQLFlow here is to parse the SQL. Many SQL engines, such as MySQL, Oracle, Hive, SparkSQL, Flink, claim they are compatible with ANSI SQL but most have unique features.
 
@@ -49,7 +49,7 @@ func Parse(sql string) (acceptable bool, err error) {
         return false, errRight 
     }
 
-    // The left part is a SELECT and the right part is TO TRAIN or PREDICT.
+    // The left part is a SELECT and the right part is TO TRAIN or TO PREDICT.
     return false, nil 
 }
 ```
diff --git a/pkg/sql/codegen_alps_test.go b/pkg/sql/codegen_alps_test.go
@@ -93,7 +93,7 @@ func TestPredALPSFiller(t *testing.T) {
 	os.Setenv("OSS_ID", "sqlflow_id")
 	os.Setenv("OSS_ENDPOINT", "http://sqlflow-oss-endpoint")
 	predStatement := `SELECT predict_fun(concat(",", col_1, col_2)) AS (info, score) FROM db.table
-		PREDICT db.predict_result
+		TO PREDICT db.predict_result
 		USING sqlflow_model;`
 
 	r, e := parser.Parse(predStatement)
diff --git a/pkg/sql/codegen_elasticdl_test.go b/pkg/sql/codegen_elasticdl_test.go
@@ -94,7 +94,7 @@ func TestPredElasticDLFiller(t *testing.T) {
 	a := assert.New(t)
 	parser := newParser()
 	predStatement := `SELECT sepal_length, sepal_width, petal_length, petal_width FROM iris.test
-		PREDICT prediction_results_table
+		TO PREDICT prediction_results_table
 		WITH
 			model.num_classes = 10
 		USING trained_elasticdl_keras_classifier;`
diff --git a/pkg/sql/codegen_test.go b/pkg/sql/codegen_test.go
@@ -39,7 +39,7 @@ INTO sqlflow_models.my_dnn_model;
 	testPredictSelectIris = `
 SELECT *
 FROM iris.test
-predict iris.predict.class
+TO PREDICT iris.predict.class
 USING sqlflow_models.my_dnn_model;
 `
 	testClusteringTrain = testSelectIris + `
@@ -55,7 +55,7 @@ INTO sqlflow_models.my_clustering_model;
 	testClusteringPredict = `
 SELECT *
 FROM iris.test
-PREDICT iris.predict.class
+TO PREDICT iris.predict.class
 USING sqlflow_models.my_clustering_model;
 `
 )
diff --git a/pkg/sql/codegen_xgboost_test.go b/pkg/sql/codegen_xgboost_test.go
diff --git a/pkg/sql/executor_test.go b/pkg/sql/executor_test.go
diff --git a/pkg/sql/lexer_test.go b/pkg/sql/lexer_test.go
diff --git a/pkg/sql/parser.go b/pkg/sql/parser.go
diff --git a/pkg/sql/parser_test.go b/pkg/sql/parser_test.go
diff --git a/pkg/sql/sql.y b/pkg/sql/sql.y
diff --git a/pkg/sql/verifier_test.go b/pkg/sql/verifier_test.go