Commit d27174b

Merge branch 'master' of https://github.com/apache/spark into MaxPermSize
2 parents: 42a8c3f + 640f63b

52 files changed (+1528, -337 lines)

R/pkg/R/DataFrame.R

Lines changed: 4 additions & 3 deletions
@@ -150,7 +150,7 @@ setMethod("isLocal",
             callJMethod(x@sdf, "isLocal")
           })
 
-#' ShowDF
+#' showDF
 #'
 #' Print the first numRows rows of a DataFrame
 #'
@@ -170,7 +170,8 @@ setMethod("isLocal",
 setMethod("showDF",
           signature(x = "DataFrame"),
           function(x, numRows = 20) {
-            callJMethod(x@sdf, "showString", numToInt(numRows))
+            s <- callJMethod(x@sdf, "showString", numToInt(numRows))
+            cat(s)
           })
 
 #' show
@@ -187,7 +188,7 @@ setMethod("showDF",
 #' sqlCtx <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
 #' df <- jsonFile(sqlCtx, path)
-#' show(df)
+#' df
 #'}
 setMethod("show", "DataFrame",
           function(object) {
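
For reference, the string that showDF() now writes with cat() comes from the JVM-side showString() call, the same formatter behind DataFrame.show() in Scala. A minimal Scala sketch of the equivalent behavior, assuming the sqlContext provided by spark-shell and an illustrative JSON path:

    // Mirrors the R example: df <- jsonFile(sqlCtx, path); the path is illustrative.
    val df = sqlContext.jsonFile("path/to/file.json")

    // Prints the first 20 rows in the same "+----+-------+" table layout that
    // SparkR's showDF() now emits on the console.
    df.show(20)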

R/pkg/R/group.R

Lines changed: 1 addition & 3 deletions
@@ -102,9 +102,7 @@ setMethod("agg",
               }
             }
             jcols <- lapply(cols, function(c) { c@jc })
-            # the GroupedData.agg(col, cols*) API does not contain grouping Column
-            sdf <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "aggWithGrouping",
-                               x@sgd, listToSeq(jcols))
+            sdf <- callJMethod(x@sgd, "agg", jcols[[1]], listToSeq(jcols[-1]))
           } else {
             stop("agg can only support Column or character")
           }

R/pkg/inst/tests/test_sparkSQL.R

Lines changed: 2 additions & 1 deletion
@@ -653,7 +653,8 @@ test_that("toJSON() returns an RDD of the correct values", {
 
 test_that("showDF()", {
   df <- jsonFile(sqlCtx, jsonPath)
-  expect_output(showDF(df), "+----+-------+\n| age| name|\n+----+-------+\n|null|Michael|\n| 30| Andy|\n| 19| Justin|\n+----+-------+\n")
+  s <- capture.output(showDF(df))
+  expect_output(s , "+----+-------+\n| age| name|\n+----+-------+\n|null|Michael|\n| 30| Andy|\n| 19| Justin|\n+----+-------+\n")
 })
 
 test_that("isLocal()", {

README.md

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ can be run using:
     ./dev/run-tests
 
 Please see the guidance on how to
-[run all automated tests](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark#ContributingtoSpark-AutomatedTesting).
+[run tests for a module, or individual tests](https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools).
 
 ## A Note About Hadoop Versions
 
core/src/main/scala/org/apache/spark/ui/JettyUtils.scala

Lines changed: 6 additions & 0 deletions
@@ -78,6 +78,9 @@ private[spark] object JettyUtils extends Logging {
         } catch {
           case e: IllegalArgumentException =>
             response.sendError(HttpServletResponse.SC_BAD_REQUEST, e.getMessage)
+          case e: Exception =>
+            logWarning(s"GET ${request.getRequestURI} failed: $e", e)
+            throw e
         }
       }
       // SPARK-5983 ensure TRACE is not supported
@@ -217,6 +220,9 @@ private[spark] object JettyUtils extends Logging {
     val pool = new QueuedThreadPool
     pool.setDaemon(true)
     server.setThreadPool(pool)
+    val errorHandler = new ErrorHandler()
+    errorHandler.setShowStacks(true)
+    server.addBean(errorHandler)
     server.setHandler(collection)
     try {
       server.start()
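
The ErrorHandler wiring above is plain Jetty rather than anything Spark-specific. A minimal standalone Scala sketch of the same pattern, assuming a Jetty 8/9 dependency on the classpath; the port, object name, and deliberately failing handler are all illustrative:

    import javax.servlet.http.{HttpServletRequest, HttpServletResponse}

    import org.eclipse.jetty.server.{Request, Server}
    import org.eclipse.jetty.server.handler.{AbstractHandler, ErrorHandler}

    object ShowStacksSketch {
      def main(args: Array[String]): Unit = {
        val server = new Server(8080)

        // A handler that always fails, just to exercise Jetty's error page.
        server.setHandler(new AbstractHandler {
          override def handle(target: String, baseRequest: Request,
              request: HttpServletRequest, response: HttpServletResponse): Unit = {
            throw new RuntimeException("simulated failure")
          }
        })

        // Same idea as the JettyUtils change: register an ErrorHandler that
        // includes stack traces in the generated error responses.
        val errorHandler = new ErrorHandler()
        errorHandler.setShowStacks(true)
        server.addBean(errorHandler)

        server.start()
        server.join()
      }
    }

With setShowStacks(true), the error page produced for the throwing handler carries the full stack trace, which is the point of the change for diagnosing failures in the Spark UI servlets.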

docs/running-on-yarn.md

Lines changed: 9 additions & 0 deletions
@@ -220,6 +220,15 @@ Most of the configs are the same for Spark on YARN as for other deployment modes
   Otherwise, the client process will exit after submission.
   </td>
 </tr>
+<tr>
+  <td><code>spark.yarn.executor.nodeLabelExpression</code></td>
+  <td>(none)</td>
+  <td>
+  A YARN node label expression that restricts the set of nodes executors will be scheduled on.
+  Only versions of YARN greater than or equal to 2.6 support node label expressions, so when
+  running against earlier versions, this property will be ignored.
+  </td>
+</tr>
 </table>
 
 # Launching Spark on YARN
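
The new property is read like any other Spark configuration key. A minimal Scala sketch of setting it programmatically, assuming a YARN 2.6+ cluster and a hypothetical node label named "gpu":

    import org.apache.spark.{SparkConf, SparkContext}

    val conf = new SparkConf()
      .setAppName("node-label-demo") // illustrative app name
      // Schedule executors only on nodes carrying the "gpu" label (label name is an assumption).
      .set("spark.yarn.executor.nodeLabelExpression", "gpu")

    // The master (yarn-client / yarn-cluster) is normally supplied by spark-submit.
    val sc = new SparkContext(conf)

The same key can also be passed on the command line with spark-submit's --conf option; against YARN versions before 2.6 it is silently ignored, as the table above notes.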

docs/sql-programming-guide.md

Lines changed: 70 additions & 5 deletions
@@ -367,11 +367,18 @@ val people = sc.textFile("examples/src/main/resources/people.txt").map(_.split("
 people.registerTempTable("people")
 
 // SQL statements can be run by using the sql methods provided by sqlContext.
-val teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
+val teenagers = sqlContext.sql("SELECT name, age FROM people WHERE age >= 13 AND age <= 19")
 
 // The results of SQL queries are DataFrames and support all the normal RDD operations.
-// The columns of a row in the result can be accessed by ordinal.
+// The columns of a row in the result can be accessed by field index:
 teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
+
+// or by field name:
+teenagers.map(t => "Name: " + t.getAs[String]("name")).collect().foreach(println)
+
+// row.getValuesMap[T] retrieves multiple columns at once into a Map[String, T]
+teenagers.map(_.getValuesMap[Any](List("name", "age"))).collect().foreach(println)
+// Map("name" -> "Justin", "age" -> 19)
 {% endhighlight %}
 
 </div>
@@ -470,7 +477,7 @@ parts = lines.map(lambda l: l.split(","))
 people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))
 
 # Infer the schema, and register the DataFrame as a table.
-schemaPeople = sqlContext.inferSchema(people)
+schemaPeople = sqlContext.createDataFrame(people)
 schemaPeople.registerTempTable("people")
 
 # SQL can be run over DataFrames that have been registered as a table.
@@ -538,7 +545,7 @@ peopleDataFrame.registerTempTable("people")
 val results = sqlContext.sql("SELECT name FROM people")
 
 // The results of SQL queries are DataFrames and support all the normal RDD operations.
-// The columns of a row in the result can be accessed by ordinal.
+// The columns of a row in the result can be accessed by field index or by field name.
 results.map(t => "Name: " + t(0)).collect().foreach(println)
 {% endhighlight %}
 
@@ -1594,6 +1601,64 @@ options.
 
 # Migration Guide
 
+## Upgrading from Spark SQL 1.3 to 1.4
+
+Based on user feedback, we changed the default behavior of `DataFrame.groupBy().agg()` to retain the grouping columns in the resulting `DataFrame`. To keep the behavior in 1.3, set `spark.sql.retainGroupColumns` to `false`.
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+{% highlight scala %}
+
+// In 1.3.x, in order for the grouping column "department" to show up,
+// it must be included explicitly as part of the agg function call.
+df.groupBy("department").agg($"department", max("age"), sum("expense"))
+
+// In 1.4+, grouping column "department" is included automatically.
+df.groupBy("department").agg(max("age"), sum("expense"))
+
+// Revert to 1.3 behavior (not retaining grouping column) by:
+sqlContext.setConf("spark.sql.retainGroupColumns", "false")
+
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+{% highlight java %}
+
+// In 1.3.x, in order for the grouping column "department" to show up,
+// it must be included explicitly as part of the agg function call.
+df.groupBy("department").agg(col("department"), max("age"), sum("expense"));
+
+// In 1.4+, grouping column "department" is included automatically.
+df.groupBy("department").agg(max("age"), sum("expense"));
+
+// Revert to 1.3 behavior (not retaining grouping column) by:
+sqlContext.setConf("spark.sql.retainGroupColumns", "false");
+
+{% endhighlight %}
+</div>
+
+<div data-lang="python" markdown="1">
+{% highlight python %}
+
+import pyspark.sql.functions as func
+
+# In 1.3.x, in order for the grouping column "department" to show up,
+# it must be included explicitly as part of the agg function call.
+df.groupBy("department").agg("department"), func.max("age"), func.sum("expense"))
+
+# In 1.4+, grouping column "department" is included automatically.
+df.groupBy("department").agg(func.max("age"), func.sum("expense"))
+
+# Revert to 1.3.x behavior (not retaining grouping column) by:
+sqlContext.setConf("spark.sql.retainGroupColumns", "false")
+
+{% endhighlight %}
+</div>
+
+</div>
+
+
 ## Upgrading from Spark SQL 1.0-1.2 to 1.3
 
 In Spark 1.3 we removed the "Alpha" label from Spark SQL and as part of this did a cleanup of the
@@ -1651,7 +1716,7 @@ moved into the udf object in `SQLContext`.
 
 <div class="codetabs">
 <div data-lang="scala" markdown="1">
-{% highlight java %}
+{% highlight scala %}
 
 sqlContext.udf.register("strLen", (s: String) => s.length())
 
docs/submitting-applications.md

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ for applications that involve the REPL (e.g. Spark shell).
 Alternatively, if your application is submitted from a machine far from the worker machines (e.g.
 locally on your laptop), it is common to use `cluster` mode to minimize network latency between
 the drivers and the executors. Note that `cluster` mode is currently not supported for
-Mesos clusters or Python applications.
+Mesos clusters. Currently only YARN supports cluster mode for Python applications.
 
 For Python applications, simply pass a `.py` file in the place of `<application-jar>` instead of a JAR,
 and add Python `.zip`, `.egg` or `.py` files to the search path with `--py-files`.

examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeExample.scala

Lines changed: 1 addition & 1 deletion
@@ -112,7 +112,7 @@ object DecisionTreeExample {
         .text(s"input path to test dataset. If given, option fracTest is ignored." +
           s" default: ${defaultParams.testInput}")
         .action((x, c) => c.copy(testInput = x))
-      opt[String]("<dataFormat>")
+      opt[String]("dataFormat")
         .text("data format: libsvm (default), dense (deprecated in Spark v1.1)")
         .action((x, c) => c.copy(dataFormat = x))
       arg[String]("<input>")

examples/src/main/scala/org/apache/spark/examples/ml/GBTExample.scala

Lines changed: 1 addition & 1 deletion
@@ -111,7 +111,7 @@ object GBTExample {
         .text(s"input path to test dataset. If given, option fracTest is ignored." +
           s" default: ${defaultParams.testInput}")
         .action((x, c) => c.copy(testInput = x))
-      opt[String]("<dataFormat>")
+      opt[String]("dataFormat")
        .text("data format: libsvm (default), dense (deprecated in Spark v1.1)")
        .action((x, c) => c.copy(dataFormat = x))
      arg[String]("<input>")
