3 changes: 3 additions & 0 deletions R/pkg/NAMESPACE
@@ -194,6 +194,7 @@ exportMethods("%<=>%",
"acos",
"add_months",
"alias",
"approx_count_distinct",
"approxCountDistinct",
"approxQuantile",
"array_contains",
@@ -252,6 +253,7 @@ exportMethods("%<=>%",
"dayofweek",
"dayofyear",
"decode",
"degrees",
"dense_rank",
"desc",
"element_at",
@@ -334,6 +336,7 @@ exportMethods("%<=>%",
"posexplode",
"posexplode_outer",
"quarter",
"radians",
"rand",
"randn",
"rank",
73 changes: 64 additions & 9 deletions R/pkg/R/functions.R
@@ -112,7 +112,7 @@ NULL
#' df <- createDataFrame(cbind(model = rownames(mtcars), mtcars))
#' tmp <- mutate(df, v1 = log(df$mpg), v2 = cbrt(df$disp),
#' v3 = bround(df$wt, 1), v4 = bin(df$cyl),
#' v5 = hex(df$wt), v6 = toDegrees(df$gear),
#' v5 = hex(df$wt), v6 = degrees(df$gear),
#' v7 = atan2(df$cyl, df$am), v8 = hypot(df$cyl, df$am),
#' v9 = pmod(df$hp, df$cyl), v10 = shiftLeft(df$disp, 1),
#' v11 = conv(df$hp, 10, 16), v12 = sign(df$vs - 0.5),
@@ -320,23 +320,37 @@ setMethod("acos",
})

#' @details
#' \code{approxCountDistinct}: Returns the approximate number of distinct items in a group.
#' \code{approx_count_distinct}: Returns the approximate number of distinct items in a group.
#'
#' @rdname column_aggregate_functions
#' @aliases approxCountDistinct approxCountDistinct,Column-method
#' @aliases approx_count_distinct approx_count_distinct,Column-method
#' @examples
#'
#' \dontrun{
#' head(select(df, approxCountDistinct(df$gear)))
#' head(select(df, approxCountDistinct(df$gear, 0.02)))
#' head(select(df, approx_count_distinct(df$gear)))
#' head(select(df, approx_count_distinct(df$gear, 0.02)))
#' head(select(df, countDistinct(df$gear, df$cyl)))
#' head(select(df, n_distinct(df$gear)))
#' head(distinct(select(df, "gear")))}
Member: we only need one set - they both are @rdname column_aggregate_functions so will duplicate all other examples

Member Author: Thanks, @HyukjinKwon fixed this. Pending tests, does the change look OK to you on the R side, @felixcheung?

#' @note approx_count_distinct(Column) since 3.0.0
setMethod("approx_count_distinct",
signature(x = "Column"),
function(x) {
jc <- callJStatic("org.apache.spark.sql.functions", "approx_count_distinct", x@jc)
column(jc)
})

#' @details
#' \code{approxCountDistinct}: Returns the approximate number of distinct items in a group.
#'
#' @rdname column_aggregate_functions
#' @aliases approxCountDistinct approxCountDistinct,Column-method
#' @note approxCountDistinct(Column) since 1.4.0
setMethod("approxCountDistinct",
signature(x = "Column"),
function(x) {
jc <- callJStatic("org.apache.spark.sql.functions", "approxCountDistinct", x@jc)
.Deprecated("approx_count_distinct")
jc <- callJStatic("org.apache.spark.sql.functions", "approx_count_distinct", x@jc)
column(jc)
})
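
A quick usage sketch (not part of the diff; assumes an active SparkR session). With the redirect above, both spellings call the same JVM function and only the old name warns; the exact warning text comes from base R's .Deprecated and may differ slightly:

df <- createDataFrame(data.frame(gear = c(3, 4, 4, 5)))

# New name, no warning:
head(select(df, approx_count_distinct(df$gear)))

# Old name returns the same estimate but now warns, roughly:
#   'approxCountDistinct' is deprecated. Use 'approx_count_distinct' instead.
head(select(df, approxCountDistinct(df$gear)))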

@@ -1651,7 +1665,22 @@ setMethod("tanh",
setMethod("toDegrees",
signature(x = "Column"),
function(x) {
jc <- callJStatic("org.apache.spark.sql.functions", "toDegrees", x@jc)
.Deprecated("degrees")
jc <- callJStatic("org.apache.spark.sql.functions", "degrees", x@jc)
column(jc)
})

#' @details
#' \code{degrees}: Converts an angle measured in radians to an approximately equivalent angle
#' measured in degrees.
#'
#' @rdname column_math_functions
#' @aliases degrees degrees,Column-method
#' @note degrees since 3.0.0
setMethod("degrees",
Member: degrees and radians will need to be added to NAMESPACE file for export

signature(x = "Column"),
function(x) {
jc <- callJStatic("org.apache.spark.sql.functions", "degrees", x@jc)
column(jc)
})

@@ -1665,7 +1694,22 @@ setMethod("toDegrees",
setMethod("toRadians",
signature(x = "Column"),
function(x) {
jc <- callJStatic("org.apache.spark.sql.functions", "toRadians", x@jc)
.Deprecated("radians")
jc <- callJStatic("org.apache.spark.sql.functions", "radians", x@jc)
column(jc)
})

#' @details
#' \code{radians}: Converts an angle measured in degrees to an approximately equivalent angle
#' measured in radians.
#'
#' @rdname column_math_functions
#' @aliases radians radians,Column-method
#' @note radians since 3.0.0
setMethod("radians",
signature(x = "Column"),
function(x) {
jc <- callJStatic("org.apache.spark.sql.functions", "radians", x@jc)
column(jc)
})
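
A short usage sketch for the two new math helpers (not part of the diff; assumes an active SparkR session):

df <- createDataFrame(data.frame(angle = c(0, pi / 2, pi)))

# Convert radians to degrees and back; the nested call round-trips the values.
head(select(df, degrees(df$angle), radians(degrees(df$angle))))

# toDegrees(df$angle) and toRadians(df$angle) still work in 3.0, but now warn
# that degrees()/radians() should be used instead.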

@@ -2065,13 +2109,24 @@ setMethod("pmod", signature(y = "Column"),

#' @param rsd maximum estimation error allowed (default = 0.05).
#'
#' @rdname column_aggregate_functions
#' @aliases approx_count_distinct,Column-method
#' @note approx_count_distinct(Column, numeric) since 3.0.0
setMethod("approx_count_distinct",
signature(x = "Column"),
function(x, rsd = 0.05) {
jc <- callJStatic("org.apache.spark.sql.functions", "approx_count_distinct", x@jc, rsd)
column(jc)
})
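
rsd is the maximum relative standard deviation allowed in the estimate: a smaller value makes Spark's HyperLogLog++ sketch larger and the count more precise, at the cost of memory. A short sketch (not part of the diff; assumes an active SparkR session):

df <- createDataFrame(data.frame(gear = c(3, 4, 4, 5)))

# Default precision (rsd = 0.05):
head(select(df, approx_count_distinct(df$gear)))

# Tighter estimate, with a larger aggregation buffer:
head(select(df, approx_count_distinct(df$gear, 0.01)))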

#' @rdname column_aggregate_functions
#' @aliases approxCountDistinct,Column-method
#' @note approxCountDistinct(Column, numeric) since 1.4.0
setMethod("approxCountDistinct",
signature(x = "Column"),
function(x, rsd = 0.05) {
jc <- callJStatic("org.apache.spark.sql.functions", "approxCountDistinct", x@jc, rsd)
.Deprecated("approx_count_distinct")
jc <- callJStatic("org.apache.spark.sql.functions", "approx_count_distinct", x@jc, rsd)
column(jc)
})

12 changes: 12 additions & 0 deletions R/pkg/R/generics.R
@@ -746,6 +746,10 @@ setGeneric("windowOrderBy", function(col, ...) { standardGeneric("windowOrderBy"
#' @name NULL
setGeneric("add_months", function(y, x) { standardGeneric("add_months") })

#' @rdname column_aggregate_functions
#' @name NULL
setGeneric("approx_count_distinct", function(x, ...) { standardGeneric("approx_count_distinct") })
Member Author: @felixcheung might want to check if I'm handling these R changes correctly.

Member: My concern is that these are breaking changes in a version without having them deprecated first... could we leave the old ones in to redirect, and add .Deprecated?

Member Author: I think my comment didn't get connected to this one -- @felixcheung, what do you think about the argument that these were almost surely meant to be deprecated along with their counterparts in Scala/Python? Leaving them in would make this inconsistent. As degrees, radians, and approxCountDistinct are reasonably niche and have a direct replacement that's compatible with older versions, I feel this is OK for 3.0.

Member: I think it's super lightweight to have an approxCountDistinct that calls approx_count_distinct with a deprecation warning. My thought was that the R API has not always been in sync or complete compared to Python, and a breaking API change - i.e. the job will fail - seems a bit drastic even in a major release.

Member Author: It is, but then again that's exactly what was deprecated and removed in Python and Scala. Major versions can have breaking changes. Yes, R isn't always in sync, but that's a bug, not a feature. Let me surface this on dev@ as I think it's going to come up a few more times.
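
A hypothetical SparkR 3.0 session to make the trade-off concrete (the warning and error text are assumptions based on base R's .Deprecated and its standard missing-function error, not output captured from this PR):

df <- createDataFrame(data.frame(gear = c(3, 4, 4, 5)))

# With the redirect kept, as this PR now does, old user code keeps running and warns:
head(select(df, approxCountDistinct(df$gear)))
# Warning: 'approxCountDistinct' is deprecated. Use 'approx_count_distinct' instead.

# If the old generic were removed outright, the same call would instead fail the job:
# Error: could not find function "approxCountDistinct"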


#' @rdname column_aggregate_functions
#' @name NULL
setGeneric("approxCountDistinct", function(x, ...) { standardGeneric("approxCountDistinct") })
@@ -1287,10 +1291,18 @@ setGeneric("substring_index", function(x, delim, count) { standardGeneric("subst
#' @name NULL
setGeneric("sumDistinct", function(x) { standardGeneric("sumDistinct") })

#' @rdname column_math_functions
#' @name NULL
setGeneric("degrees", function(x) { standardGeneric("degrees") })

#' @rdname column_math_functions
#' @name NULL
setGeneric("toDegrees", function(x) { standardGeneric("toDegrees") })

#' @rdname column_math_functions
#' @name NULL
setGeneric("radians", function(x) { standardGeneric("radians") })

#' @rdname column_math_functions
#' @name NULL
setGeneric("toRadians", function(x) { standardGeneric("toRadians") })
4 changes: 2 additions & 2 deletions R/pkg/tests/fulltests/test_sparkSQL.R
@@ -1379,7 +1379,7 @@ test_that("column operators", {

test_that("column functions", {
c <- column("a")
c1 <- abs(c) + acos(c) + approxCountDistinct(c) + ascii(c) + asin(c) + atan(c)
c1 <- abs(c) + acos(c) + approx_count_distinct(c) + ascii(c) + asin(c) + atan(c)
c2 <- avg(c) + base64(c) + bin(c) + bitwiseNOT(c) + cbrt(c) + ceil(c) + cos(c)
c3 <- cosh(c) + count(c) + crc32(c) + hash(c) + exp(c)
c4 <- explode(c) + expm1(c) + factorial(c) + first(c) + floor(c) + hex(c)
c7 <- mean(c) + min(c) + month(c) + negate(c) + posexplode(c) + quarter(c)
c8 <- reverse(c) + rint(c) + round(c) + rtrim(c) + sha1(c) + monotonically_increasing_id()
c9 <- signum(c) + sin(c) + sinh(c) + size(c) + stddev(c) + soundex(c) + sqrt(c) + sum(c)
c10 <- sumDistinct(c) + tan(c) + tanh(c) + toDegrees(c) + toRadians(c)
c10 <- sumDistinct(c) + tan(c) + tanh(c) + degrees(c) + radians(c)
c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c)
c12 <- variance(c) + ltrim(c, "a") + rtrim(c, "b") + trim(c, "c")
c13 <- lead("col", 1) + lead(c, 1) + lag("col", 1) + lag(c, 1)
2 changes: 0 additions & 2 deletions core/src/main/scala/org/apache/spark/BarrierTaskContext.scala
@@ -158,8 +158,6 @@ class BarrierTaskContext private[spark] (

override def isInterrupted(): Boolean = taskContext.isInterrupted()

override def isRunningLocally(): Boolean = taskContext.isRunningLocally()

override def addTaskCompletionListener(listener: TaskCompletionListener): this.type = {
taskContext.addTaskCompletionListener(listener)
this
7 changes: 0 additions & 7 deletions core/src/main/scala/org/apache/spark/TaskContext.scala
@@ -96,13 +96,6 @@ abstract class TaskContext extends Serializable {
*/
def isInterrupted(): Boolean

/**
* Returns true if the task is running locally in the driver program.
* @return false
*/
@deprecated("Local execution was removed, so this always returns false", "2.0.0")
def isRunningLocally(): Boolean

/**
* Adds a (Java friendly) listener to be executed on task completion.
* This will be called in all situations - success, failure, or cancellation. Adding a listener
2 changes: 0 additions & 2 deletions core/src/main/scala/org/apache/spark/TaskContextImpl.scala
@@ -157,8 +157,6 @@ private[spark] class TaskContextImpl(
@GuardedBy("this")
override def isCompleted(): Boolean = synchronized(completed)

override def isRunningLocally(): Boolean = false

override def isInterrupted(): Boolean = reasonIfKilled.isDefined

override def getLocalProperty(key: String): String = localProperties.getProperty(key)
core/src/main/scala/org/apache/spark/executor/ShuffleWriteMetrics.scala
@@ -56,14 +56,4 @@ class ShuffleWriteMetrics private[spark] () extends Serializable {
private[spark] def decRecordsWritten(v: Long): Unit = {
_recordsWritten.setValue(recordsWritten - v)
}

// Legacy methods for backward compatibility.
// TODO: remove these once we make this class private.
@deprecated("use bytesWritten instead", "2.0.0")
def shuffleBytesWritten: Long = bytesWritten
@deprecated("use writeTime instead", "2.0.0")
def shuffleWriteTime: Long = writeTime
@deprecated("use recordsWritten instead", "2.0.0")
def shuffleRecordsWritten: Long = recordsWritten

}
core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala
@@ -47,33 +47,3 @@ case class AccumulableInfo private[spark] (
private[spark] val countFailedValues: Boolean,
// TODO: use this to identify internal task metrics instead of encoding it in the name
private[spark] val metadata: Option[String] = None)


/**
* A collection of deprecated constructors. This will be removed soon.
*/
object AccumulableInfo {

@deprecated("do not create AccumulableInfo", "2.0.0")
def apply(
id: Long,
name: String,
update: Option[String],
value: String,
internal: Boolean): AccumulableInfo = {
new AccumulableInfo(
id, Option(name), update, Option(value), internal, countFailedValues = false)
}

@deprecated("do not create AccumulableInfo", "2.0.0")
def apply(id: Long, name: String, update: Option[String], value: String): AccumulableInfo = {
new AccumulableInfo(
id, Option(name), update, Option(value), internal = false, countFailedValues = false)
}

@deprecated("do not create AccumulableInfo", "2.0.0")
def apply(id: Long, name: String, value: String): AccumulableInfo = {
new AccumulableInfo(
id, Option(name), None, Option(value), internal = false, countFailedValues = false)
}
}
22 changes: 0 additions & 22 deletions mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala
@@ -47,18 +47,6 @@ import org.apache.spark.util.{Utils, VersionUtils}
private[util] sealed trait BaseReadWrite {
private var optionSparkSession: Option[SparkSession] = None

/**
* Sets the Spark SQLContext to use for saving/loading.
*
* @deprecated Use session instead. This method will be removed in 3.0.0.
*/
@Since("1.6.0")
@deprecated("Use session instead. This method will be removed in 3.0.0.", "2.0.0")
def context(sqlContext: SQLContext): this.type = {
optionSparkSession = Option(sqlContext.sparkSession)
this
}

/**
* Sets the Spark Session to use for saving/loading.
*/
@@ -215,10 +203,6 @@ abstract class MLWriter extends BaseReadWrite with Logging {
// override for Java compatibility
@Since("1.6.0")
override def session(sparkSession: SparkSession): this.type = super.session(sparkSession)

// override for Java compatibility
@Since("1.6.0")
override def context(sqlContext: SQLContext): this.type = super.session(sqlContext.sparkSession)
}

/**
@@ -281,9 +265,6 @@ class GeneralMLWriter(stage: PipelineStage) extends MLWriter with Logging {

// override for Java compatibility
override def session(sparkSession: SparkSession): this.type = super.session(sparkSession)

// override for Java compatibility
override def context(sqlContext: SQLContext): this.type = super.session(sqlContext.sparkSession)
Contributor: @srowen This public method seems not to have been deprecated before removal, and is available in 2.4.5.

scala> import org.apache.spark.ml.util.GeneralMLWriter
import org.apache.spark.ml.util.GeneralMLWriter

scala> new GeneralMLWriter(null).context(spark.sqlContext)
res0: org.apache.spark.ml.util.GeneralMLWriter = org.apache.spark.ml.util.GeneralMLWriter@26b150cd

There is no deprecation warning above. Does it matter?

Member (@HyukjinKwon, Feb 26, 2020): This seems properly deprecated in MLWriter as its parent:
https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.util.GeneralMLWriter@context(sqlContext:org.apache.spark.sql.SQLContext):GeneralMLWriter.this.type
and the Scaladoc explicitly shows this at GeneralMLWriter too. Seems right to remove together if we should in MLWriter.

Member Author: Yeah it was deprecated in 2.0.0 and marked for removal in 3.0.0.

/**
 * Sets the Spark SQLContext to use for saving/loading.
 *
 * @deprecated Use session instead. This method will be removed in 3.0.0.
 */
@Since("1.6.0")
@deprecated("Use session instead. This method will be removed in 3.0.0.", "2.0.0")

I think ideally the Java overload and subclass overrides would be marked deprecated too, but they implicitly are. If there were a case that this is actually used, we could revive it, but just wondering how often people would be using save + SQLContext?

Contributor: I never use this method, I was just checking it. Thanks!

}

/**
@@ -352,9 +333,6 @@ abstract class MLReader[T] extends BaseReadWrite {

// override for Java compatibility
override def session(sparkSession: SparkSession): this.type = super.session(sparkSession)

// override for Java compatibility
override def context(sqlContext: SQLContext): this.type = super.session(sqlContext.sparkSession)
}

/**
mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
@@ -134,31 +134,6 @@ class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Doubl
@Since("1.1.0")
def fMeasure(label: Double): Double = fMeasure(label, 1.0)

/**
* Returns precision
*/
@Since("1.1.0")
@deprecated("Use accuracy.", "2.0.0")
lazy val precision: Double = accuracy

/**
* Returns recall
* (equals to precision for multiclass classifier
* because sum of all false positives is equal to sum
* of all false negatives)
*/
@Since("1.1.0")
@deprecated("Use accuracy.", "2.0.0")
lazy val recall: Double = accuracy

/**
* Returns f-measure
* (equals to precision and recall because precision equals recall)
*/
@Since("1.1.0")
@deprecated("Use accuracy.", "2.0.0")
lazy val fMeasure: Double = accuracy

/**
* Returns accuracy
* (equals to the total number of correctly classified instances
mllib/src/test/scala/org/apache/spark/mllib/evaluation/MulticlassMetricsSuite.scala
@@ -77,9 +77,6 @@ class MulticlassMetricsSuite extends SparkFunSuite with MLlibTestSparkContext {

assert(math.abs(metrics.accuracy -
(2.0 + 3.0 + 1.0) / ((2 + 3 + 1) + (1 + 1 + 1))) < delta)
assert(math.abs(metrics.accuracy - metrics.precision) < delta)
assert(math.abs(metrics.accuracy - metrics.recall) < delta)
assert(math.abs(metrics.accuracy - metrics.fMeasure) < delta)
assert(math.abs(metrics.accuracy - metrics.weightedRecall) < delta)
assert(math.abs(metrics.weightedTruePositiveRate -
((4.0 / 9) * tpRate0 + (4.0 / 9) * tpRate1 + (1.0 / 9) * tpRate2)) < delta)
18 changes: 18 additions & 0 deletions project/MimaExcludes.scala
@@ -36,6 +36,24 @@ object MimaExcludes {

// Exclude rules for 3.0.x
lazy val v30excludes = v24excludes ++ Seq(
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.TaskContext.isRunningLocally"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.executor.ShuffleWriteMetrics.shuffleBytesWritten"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.executor.ShuffleWriteMetrics.shuffleWriteTime"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.executor.ShuffleWriteMetrics.shuffleRecordsWritten"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.scheduler.AccumulableInfo.apply"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.functions.approxCountDistinct"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.functions.toRadians"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.functions.toDegrees"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.functions.monotonicallyIncreasingId"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.SQLContext.clearActive"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.SQLContext.getOrCreate"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.SQLContext.setActive"),
ProblemFilters.exclude[IncompatibleMethTypeProblem]("org.apache.spark.sql.SQLContext.this"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.mllib.evaluation.MulticlassMetrics.fMeasure"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.mllib.evaluation.MulticlassMetrics.recall"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.mllib.evaluation.MulticlassMetrics.precision"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.util.MLWriter.context"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.util.MLReader.context"),
// [SPARK-25737] Remove JavaSparkContextVarargsWorkaround
ProblemFilters.exclude[MissingTypesProblem]("org.apache.spark.api.java.JavaSparkContext"),
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.api.java.JavaSparkContext.union"),