
Commit a728bf2

Example rename.
1 parent e8aa3d3 commit a728bf2

9 files changed, +51 -44 lines changed

examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ public static void main(String[] args) {

     // Prepare training data.
     // We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans
-    // into SchemaRDDs, where it uses the bean metadata to infer the schema.
+    // into DataFrames, where it uses the bean metadata to infer the schema.
     List<LabeledPoint> localTraining = Lists.newArrayList(
       new LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
       new LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),

examples/src/main/java/org/apache/spark/examples/sql/JavaSparkSQL.java

Lines changed: 8 additions & 8 deletions
@@ -80,7 +80,7 @@ public Person call(String line) {
     // SQL can be run over RDDs that have been registered as tables.
     DataFrame teenagers = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");

-    // The results of SQL queries are SchemaRDDs and support all the normal RDD operations.
+    // The results of SQL queries are DataFrames and support all the normal RDD operations.
     // The columns of a row in the result can be accessed by ordinal.
     List<String> teenagerNames = teenagers.toJavaRDD().map(new Function<Row, String>() {
       @Override
@@ -93,12 +93,12 @@ public String call(Row row) {
     }

     System.out.println("=== Data source: Parquet File ===");
-    // JavaSchemaRDDs can be saved as parquet files, maintaining the schema information.
+    // DataFrames can be saved as parquet files, maintaining the schema information.
     schemaPeople.saveAsParquetFile("people.parquet");

     // Read in the parquet file created above.
     // Parquet files are self-describing so the schema is preserved.
-    // The result of loading a parquet file is also a JavaSchemaRDD.
+    // The result of loading a parquet file is also a DataFrame.
     DataFrame parquetFile = sqlCtx.parquetFile("people.parquet");

     //Parquet files can also be registered as tables and then used in SQL statements.
@@ -119,7 +119,7 @@ public String call(Row row) {
     // A JSON dataset is pointed by path.
     // The path can be either a single text file or a directory storing text files.
     String path = "examples/src/main/resources/people.json";
-    // Create a JavaSchemaRDD from the file(s) pointed by path
+    // Create a DataFrame from the file(s) pointed to by path
     DataFrame peopleFromJsonFile = sqlCtx.jsonFile(path);

     // Because the schema of a JSON dataset is automatically inferred, to write queries,
@@ -130,13 +130,13 @@ public String call(Row row) {
     // |-- age: IntegerType
     // |-- name: StringType

-    // Register this JavaSchemaRDD as a table.
+    // Register this DataFrame as a table.
     peopleFromJsonFile.registerTempTable("people");

     // SQL statements can be run by using the sql methods provided by sqlCtx.
     DataFrame teenagers3 = sqlCtx.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19");

-    // The results of SQL queries are JavaSchemaRDDs and support all the normal RDD operations.
+    // The results of SQL queries are DataFrames and support all the normal RDD operations.
     // The columns of a row in the result can be accessed by ordinal.
     teenagerNames = teenagers3.toJavaRDD().map(new Function<Row, String>() {
       @Override
@@ -146,14 +146,14 @@ public String call(Row row) {
       System.out.println(name);
     }

-    // Alternatively, a JavaSchemaRDD can be created for a JSON dataset represented by
+    // Alternatively, a DataFrame can be created for a JSON dataset represented by
     // a RDD[String] storing one JSON object per string.
     List<String> jsonData = Arrays.asList(
       "{\"name\":\"Yin\",\"address\":{\"city\":\"Columbus\",\"state\":\"Ohio\"}}");
     JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
     DataFrame peopleFromJsonRDD = sqlCtx.jsonRDD(anotherPeopleRDD.rdd());

-    // Take a look at the schema of this new JavaSchemaRDD.
+    // Take a look at the schema of this new DataFrame.
     peopleFromJsonRDD.printSchema();
     // The schema of anotherPeople is ...
     // root
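
For readers following the rename, here is a minimal Scala sketch of the same flow the Java example exercises, assuming the Spark 1.3-era SQLContext API used throughout this commit (jsonFile, registerTempTable, sql); the object name and JSON path are illustrative, not part of the commit:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}

object SparkSQLSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SparkSQLSketch"))
    val sqlContext = new SQLContext(sc)

    // Load a JSON file into a DataFrame and register it as a temp table.
    val people = sqlContext.jsonFile("examples/src/main/resources/people.json")
    people.registerTempTable("people")

    // The result of a SQL query is a DataFrame, and it still supports the
    // normal RDD operations such as map and collect.
    val teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
    teenagers.map(row => "Name: " + row.getString(0)).collect().foreach(println)

    sc.stop()
  }
}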

examples/src/main/python/mllib/dataset_example.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@
 #

 """
-An example of how to use SchemaRDD as a dataset for ML. Run with::
+An example of how to use DataFrame as a dataset for ML. Run with::
     bin/spark-submit examples/src/main/python/mllib/dataset_example.py
 """

examples/src/main/python/sql.py

Lines changed: 8 additions & 8 deletions
@@ -30,26 +30,26 @@
 some_rdd = sc.parallelize([Row(name="John", age=19),
                            Row(name="Smith", age=23),
                            Row(name="Sarah", age=18)])
-# Infer schema from the first row, create a SchemaRDD and print the schema
-some_schemardd = sqlContext.inferSchema(some_rdd)
-some_schemardd.printSchema()
+# Infer schema from the first row, create a DataFrame and print the schema
+some_df = sqlContext.inferSchema(some_rdd)
+some_df.printSchema()

 # Another RDD is created from a list of tuples
 another_rdd = sc.parallelize([("John", 19), ("Smith", 23), ("Sarah", 18)])
 # Schema with two fields - person_name and person_age
 schema = StructType([StructField("person_name", StringType(), False),
                      StructField("person_age", IntegerType(), False)])
-# Create a SchemaRDD by applying the schema to the RDD and print the schema
-another_schemardd = sqlContext.applySchema(another_rdd, schema)
-another_schemardd.printSchema()
+# Create a DataFrame by applying the schema to the RDD and print the schema
+another_df = sqlContext.applySchema(another_rdd, schema)
+another_df.printSchema()
 # root
 # |-- age: integer (nullable = true)
 # |-- name: string (nullable = true)

 # A JSON dataset is pointed to by path.
 # The path can be either a single text file or a directory storing text files.
 path = os.path.join(os.environ['SPARK_HOME'], "examples/src/main/resources/people.json")
-# Create a SchemaRDD from the file(s) pointed to by path
+# Create a DataFrame from the file(s) pointed to by path
 people = sqlContext.jsonFile(path)
 # root
 # |-- person_name: string (nullable = false)
@@ -61,7 +61,7 @@
 # |-- age: IntegerType
 # |-- name: StringType

-# Register this SchemaRDD as a table.
+# Register this DataFrame as a table.
 people.registerAsTable("people")

 # SQL statements can be run by using the sql methods provided by sqlContext
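
The applySchema path above has a direct Scala counterpart. A hedged sketch, assuming a Spark 1.3-era spark-shell where sc and sqlContext are already in scope and the type classes live under org.apache.spark.sql.types (an assumption about this commit's package layout):

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// An RDD of generic Rows, paralleling the Python tuples above.
val rowRDD = sc.parallelize(Seq(Row("John", 19), Row("Smith", 23), Row("Sarah", 18)))

// Explicit schema with the same two fields.
val schema = StructType(Seq(
  StructField("person_name", StringType, nullable = false),
  StructField("person_age", IntegerType, nullable = false)))

// Apply the schema to the RDD to get a DataFrame, then print the schema.
val anotherDf = sqlContext.applySchema(rowRDD, schema)
anotherDf.printSchema()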

examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ object SimpleParamsExample {

     // Prepare training data.
     // We use LabeledPoint, which is a case class. Spark SQL can convert RDDs of Java Beans
-    // into SchemaRDDs, where it uses the bean metadata to infer the schema.
+    // into DataFrames, where it uses the bean metadata to infer the schema.
     val training = sparkContext.parallelize(Seq(
       LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
       LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0)),
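
As the comment notes, Spark SQL infers the schema from the class itself. A small sketch of that inference, assuming a spark-shell session with the 1.3-era implicit RDD-to-DataFrame conversion imported (the exact import path is an assumption):

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import sqlContext.implicits._  // assumed 1.3-era implicit conversion

val training = sc.parallelize(Seq(
  LabeledPoint(1.0, Vectors.dense(0.0, 1.1, 0.1)),
  LabeledPoint(0.0, Vectors.dense(2.0, 1.0, -1.0))))

// toDF uses the LabeledPoint case class metadata to infer (label, features).
val trainingDf = training.toDF
trainingDf.printSchema()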

examples/src/main/scala/org/apache/spark/examples/mllib/DatasetExample.scala

Lines changed: 11 additions & 11 deletions
@@ -47,7 +47,7 @@ object DatasetExample {
     val defaultParams = Params()

     val parser = new OptionParser[Params]("DatasetExample") {
-      head("Dataset: an example app using SchemaRDD as a Dataset for ML.")
+      head("Dataset: an example app using DataFrame as a Dataset for ML.")
       opt[String]("input")
         .text(s"input path to dataset")
         .action((x, c) => c.copy(input = x))
@@ -80,20 +80,20 @@ object DatasetExample {
     }
     println(s"Loaded ${origData.count()} instances from file: ${params.input}")

-    // Convert input data to SchemaRDD explicitly.
-    val schemaRDD: DataFrame = origData
-    println(s"Inferred schema:\n${schemaRDD.schema.prettyJson}")
-    println(s"Converted to SchemaRDD with ${schemaRDD.count()} records")
+    // Convert input data to DataFrame explicitly.
+    val df: DataFrame = origData.toDF
+    println(s"Inferred schema:\n${df.schema.prettyJson}")
+    println(s"Converted to DataFrame with ${df.count()} records")

-    // Select columns, using implicit conversion to SchemaRDD.
-    val labelsSchemaRDD: DataFrame = origData.select("label")
-    val labels: RDD[Double] = labelsSchemaRDD.map { case Row(v: Double) => v }
+    // Select columns, using implicit conversion to DataFrames.
+    val labelsDf: DataFrame = origData.select("label")
+    val labels: RDD[Double] = labelsDf.map { case Row(v: Double) => v }
     val numLabels = labels.count()
     val meanLabel = labels.fold(0.0)(_ + _) / numLabels
     println(s"Selected label column with average value $meanLabel")

-    val featuresSchemaRDD: DataFrame = origData.select("features")
-    val features: RDD[Vector] = featuresSchemaRDD.map { case Row(v: Vector) => v }
+    val featuresDf: DataFrame = origData.select("features")
+    val features: RDD[Vector] = featuresDf.map { case Row(v: Vector) => v }
     val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())(
       (summary, feat) => summary.add(feat),
       (sum1, sum2) => sum1.merge(sum2))
@@ -103,7 +103,7 @@ object DatasetExample {
     tmpDir.deleteOnExit()
     val outputDir = new File(tmpDir, "dataset").toString
     println(s"Saving to $outputDir as Parquet file.")
-    schemaRDD.saveAsParquetFile(outputDir)
+    df.saveAsParquetFile(outputDir)

     println(s"Loading Parquet file with UDT from $outputDir.")
     val newDataset = sqlContext.parquetFile(outputDir)
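
The toDF and Row-pattern-matching idioms this diff adopts generalize beyond the example. A sketch, continuing the assumed spark-shell session from the LabeledPoint snippet above:

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}

// Convert explicitly, as the diff now does with origData.toDF.
val df: DataFrame = training.toDF

// select() returns a DataFrame; pattern matching on Row recovers a typed RDD.
val labels: RDD[Double] = df.select("label").map { case Row(v: Double) => v }
println(s"Mean label: ${labels.fold(0.0)(_ + _) / labels.count()}")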

sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala

Lines changed: 5 additions & 0 deletions
@@ -458,6 +458,11 @@ class DataFrame protected[sql](
    */
   override def head(): Row = head(1).head

+  /**
+   * Return the first row. Alias for head().
+   */
+  override def first(): Row = head()
+
   override def map[R: ClassTag](f: Row => R): RDD[R] = {
     rdd.map(f)
   }
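
Usage-wise, the new method is pure sugar: first() delegates to head(), so on any DataFrame the two calls return the same Row. Continuing the assumed shell session:

// first() simply delegates to head(), so these are interchangeable.
val viaFirst: Row = df.first()
val viaHead: Row = df.head()
assert(viaFirst == viaHead)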

sql/core/src/main/scala/org/apache/spark/sql/api.scala

Lines changed: 2 additions & 0 deletions
@@ -54,6 +54,8 @@ trait RDDApi[T] {

   def count(): Long

+  def first(): T
+
   def repartition(numPartitions: Int): DataFrame
 }
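
To see why DataFrame needed the change in the previous file, here is a toy stand-in for the trait (not Spark code): once first() joins the contract, every implementer must supply it, and delegating the way DataFrame delegates to head() is the natural implementation.

// Toy illustration only; names are hypothetical.
trait MiniRDDApi[T] {
  def count(): Long
  def first(): T
}

class SeqApi[T](data: Seq[T]) extends MiniRDDApi[T] {
  override def count(): Long = data.size.toLong
  override def first(): T = data.head  // same delegation pattern as DataFrame.first()
}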

sql/core/src/test/scala/org/apache/spark/sql/DslQuerySuite.scala

Lines changed: 14 additions & 14 deletions
@@ -34,11 +34,11 @@ class DslQuerySuite extends QueryTest {
       testData.collect().toSeq)
   }

-  // test("repartition") {
-  //   checkAnswer(
-  //     testData.select('key).repartition(10).select('key),
-  //     testData.select('key).collect().toSeq)
-  // }
+  test("repartition") {
+    checkAnswer(
+      testData.select('key).repartition(10).select('key),
+      testData.select('key).collect().toSeq)
+  }

   test("agg") {
     checkAnswer(
@@ -266,15 +266,15 @@ class DslQuerySuite extends QueryTest {
     checkAnswer(lowerCaseData.intersect(upperCaseData), Nil)
   }

-  // test("udf") {
-  //   val foo = (a: Int, b: String) => a.toString + b
-  //
-  //   checkAnswer(
-  //     // SELECT *, foo(key, value) FROM testData
-  //     testData.select(Star(None), foo.call('key, 'value)).limit(3),
-  //     Row(1, "1", "11") :: Row(2, "2", "22") :: Row(3, "3", "33") :: Nil
-  //   )
-  // }
+  test("udf") {
+    val foo = (a: Int, b: String) => a.toString + b
+
+    checkAnswer(
+      // SELECT *, foo(key, value) FROM testData
+      testData.select($"*", callUDF(foo, 'key, 'value)).limit(3),
+      Row(1, "1", "11") :: Row(2, "2", "22") :: Row(3, "3", "33") :: Nil
+    )
+  }

   test("sqrt") {
     checkAnswer(
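
The revived udf test also shows the DSL's new spelling: $"*" replaces Star(None) and callUDF(foo, ...) replaces foo.call(...). A condensed sketch of the same call, lifted from the test itself (testData, the column symbols, and the expected rows come from the suite's existing fixtures and imports):

val foo = (a: Int, b: String) => a.toString + b

// SELECT *, foo(key, value) FROM testData LIMIT 3
val result = testData
  .select($"*", callUDF(foo, 'key, 'value))
  .limit(3)
  .collect()
// Expected: Row(1, "1", "11"), Row(2, "2", "22"), Row(3, "3", "33")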
