Skip to content

Commit 5c82169

Browse files
yanboliang authored and mengxr committed
[SPARK-11958][SPARK-11957][ML][DOC] SQLTransformer user guide and example code
Add ```SQLTransformer``` user guide, example code and make Scala API doc more clear. Author: Yanbo Liang <[email protected]> Closes #10006 from yanboliang/spark-11958. (cherry picked from commit 4a39b5a) Signed-off-by: Xiangrui Meng <[email protected]>
1 parent 8652fc0 commit 5c82169

File tree

5 files changed

+212
-2
lines changed

5 files changed

+212
-2
lines changed

docs/ml-features.md

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -756,6 +756,65 @@ for more details on the API.
756756
</div>
757757
</div>
758758

759+
## SQLTransformer
760+
761+
`SQLTransformer` implements the transformations which are defined by a SQL statement.
762+
Currently we only support SQL syntax like `"SELECT ... FROM __THIS__ ..."`
763+
where `"__THIS__"` represents the underlying table of the input dataset.
764+
The select clause specifies the fields, constants, and expressions to display in
765+
the output; it can be any select clause that Spark SQL supports. Users can also
766+
use Spark SQL built-in functions and UDFs to operate on these selected columns.
767+
For example, `SQLTransformer` supports statements like:
768+
769+
* `SELECT a, a + b AS a_b FROM __THIS__`
770+
* `SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5`
771+
* `SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b`
772+
773+
**Examples**
774+
775+
Assume that we have the following DataFrame with columns `id`, `v1` and `v2`:
776+
777+
~~~~
778+
id | v1 | v2
779+
----|-----|-----
780+
0 | 1.0 | 3.0
781+
2 | 2.0 | 5.0
782+
~~~~
783+
784+
This is the output of the `SQLTransformer` with statement `"SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__"`:
785+
786+
~~~~
787+
id | v1 | v2 | v3 | v4
788+
----|-----|-----|-----|-----
789+
0 | 1.0 | 3.0 | 4.0 | 3.0
790+
2 | 2.0 | 5.0 | 7.0 |10.0
791+
~~~~
792+
793+
<div class="codetabs">
794+
<div data-lang="scala" markdown="1">
795+
796+
Refer to the [SQLTransformer Scala docs](api/scala/index.html#org.apache.spark.ml.feature.SQLTransformer)
797+
for more details on the API.
798+
799+
{% include_example scala/org/apache/spark/examples/ml/SQLTransformerExample.scala %}
800+
</div>
801+
802+
<div data-lang="java" markdown="1">
803+
804+
Refer to the [SQLTransformer Java docs](api/java/org/apache/spark/ml/feature/SQLTransformer.html)
805+
for more details on the API.
806+
807+
{% include_example java/org/apache/spark/examples/ml/JavaSQLTransformerExample.java %}
808+
</div>
809+
810+
<div data-lang="python" markdown="1">
811+
812+
Refer to the [SQLTransformer Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.SQLTransformer) for more details on the API.
813+
814+
{% include_example python/ml/sql_transformer.py %}
815+
</div>
816+
</div>
817+
759818
## VectorAssembler
760819

761820
`VectorAssembler` is a transformer that combines a given list of columns into a single vector
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.examples.ml;
19+
20+
// $example on$
21+
import java.util.Arrays;
22+
23+
import org.apache.spark.SparkConf;
24+
import org.apache.spark.api.java.JavaRDD;
25+
import org.apache.spark.api.java.JavaSparkContext;
26+
import org.apache.spark.ml.feature.SQLTransformer;
27+
import org.apache.spark.sql.DataFrame;
28+
import org.apache.spark.sql.Row;
29+
import org.apache.spark.sql.RowFactory;
30+
import org.apache.spark.sql.SQLContext;
31+
import org.apache.spark.sql.types.*;
32+
// $example off$
33+
34+
public class JavaSQLTransformerExample {
35+
public static void main(String[] args) {
36+
37+
SparkConf conf = new SparkConf().setAppName("JavaSQLTransformerExample");
38+
JavaSparkContext jsc = new JavaSparkContext(conf);
39+
SQLContext sqlContext = new SQLContext(jsc);
40+
41+
// $example on$
42+
JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
43+
RowFactory.create(0, 1.0, 3.0),
44+
RowFactory.create(2, 2.0, 5.0)
45+
));
46+
StructType schema = new StructType(new StructField [] {
47+
new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
48+
new StructField("v1", DataTypes.DoubleType, false, Metadata.empty()),
49+
new StructField("v2", DataTypes.DoubleType, false, Metadata.empty())
50+
});
51+
DataFrame df = sqlContext.createDataFrame(jrdd, schema);
52+
53+
SQLTransformer sqlTrans = new SQLTransformer().setStatement(
54+
"SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__");
55+
56+
sqlTrans.transform(df).show();
57+
// $example off$
58+
}
59+
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#
2+
# Licensed to the Apache Software Foundation (ASF) under one or more
3+
# contributor license agreements. See the NOTICE file distributed with
4+
# this work for additional information regarding copyright ownership.
5+
# The ASF licenses this file to You under the Apache License, Version 2.0
6+
# (the "License"); you may not use this file except in compliance with
7+
# the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
18+
from __future__ import print_function
19+
20+
from pyspark import SparkContext
21+
# $example on$
22+
from pyspark.ml.feature import SQLTransformer
23+
# $example off$
24+
from pyspark.sql import SQLContext
25+
26+
if __name__ == "__main__":
    # Set up the SparkContext and the SQLContext that SQLTransformer requires.
    sc = SparkContext(appName="SQLTransformerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    # Input: two rows with columns (id, v1, v2).
    input_rows = [
        (0, 1.0, 3.0),
        (2, 2.0, 5.0),
    ]
    df = sqlContext.createDataFrame(input_rows, ["id", "v1", "v2"])
    # __THIS__ is substituted with the table backing the input DataFrame;
    # the statement derives v3 = v1 + v2 and v4 = v1 * v2.
    sqlTrans = SQLTransformer(
        statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    sqlTrans.transform(df).show()
    # $example off$

    sc.stop()
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
// scalastyle:off println
19+
package org.apache.spark.examples.ml
20+
21+
// $example on$
22+
import org.apache.spark.ml.feature.SQLTransformer
23+
// $example off$
24+
import org.apache.spark.sql.SQLContext
25+
import org.apache.spark.{SparkConf, SparkContext}
26+
27+
28+
/**
 * Example demonstrating [[org.apache.spark.ml.feature.SQLTransformer]]:
 * transforms a DataFrame with a SQL statement in which `__THIS__` represents
 * the underlying table of the input dataset.
 */
object SQLTransformerExample {
  // Explicit `: Unit =` instead of deprecated procedure syntax.
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SQLTransformerExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    // Input: two rows with columns (id, v1, v2).
    val df = sqlContext.createDataFrame(
      Seq((0, 1.0, 3.0), (2, 2.0, 5.0))).toDF("id", "v1", "v2")

    // The statement derives v3 = v1 + v2 and v4 = v1 * v2 in addition to all
    // input columns; __THIS__ refers to the DataFrame being transformed.
    val sqlTrans = new SQLTransformer().setStatement(
      "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

    sqlTrans.transform(df).show()
    // $example off$

    // Release cluster resources; the original example leaked the context
    // (the sibling Python example calls sc.stop()).
    sc.stop()
  }
}
45+
// scalastyle:on println

mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,16 @@ import org.apache.spark.sql.types.StructType
2727

2828
/**
2929
* :: Experimental ::
30-
* Implements the transforms which are defined by SQL statement.
31-
* Currently we only support SQL syntax like 'SELECT ... FROM __THIS__'
30+
* Implements the transformations which are defined by SQL statement.
31+
* Currently we only support SQL syntax like 'SELECT ... FROM __THIS__ ...'
3232
* where '__THIS__' represents the underlying table of the input dataset.
33+
* The select clause specifies the fields, constants, and expressions to display in
34+
* the output; it can be any select clause that Spark SQL supports. Users can also
35+
* use Spark SQL built-in functions and UDFs to operate on these selected columns.
36+
* For example, [[SQLTransformer]] supports statements like:
37+
* - SELECT a, a + b AS a_b FROM __THIS__
38+
* - SELECT a, SQRT(b) AS b_sqrt FROM __THIS__ where a > 5
39+
* - SELECT a, b, SUM(c) AS c_sum FROM __THIS__ GROUP BY a, b
3340
*/
3441
@Experimental
3542
@Since("1.6.0")

0 commit comments

Comments
 (0)