@@ -3661,7 +3661,8 @@ setMethod("getNumPartitions",
 #' isStreaming
 #'
 #' Returns TRUE if this SparkDataFrame contains one or more sources that continuously return data
-#' as it arrives.
+#' as it arrives. A dataset that reads data from a streaming source must be executed as a
+#' \code{StreamingQuery} using \code{write.stream}.
 #'
 #' @param x A SparkDataFrame
 #' @return TRUE if this SparkDataFrame is from a streaming source
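To illustrate the sentence added above, here is a minimal sketch of checking `isStreaming` and then starting the query with `write.stream`; the socket source on localhost:9999 and the console sink settings are hypothetical examples, not part of this change:

```r
# Sketch: a streaming SparkDataFrame must be started as a StreamingQuery via write.stream.
lines <- read.stream("socket", host = "localhost", port = 9999)
isStreaming(lines)  # TRUE for a streaming source

# A direct head()/collect() on `lines` would fail; start a query instead:
q <- write.stream(lines, "console", outputMode = "append")
stopQuery(q)
```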
@@ -3707,7 +3708,17 @@ setMethod("isStreaming",
 #' @param df a streaming SparkDataFrame.
 #' @param source a name for external data source.
 #' @param outputMode one of 'append', 'complete', 'update'.
-#' @param ... additional argument(s) passed to the method.
+#' @param partitionBy a name or a list of names of columns to partition the output by on the file
+#'        system. If specified, the output is laid out on the file system similar to Hive's
+#'        partitioning scheme.
+#' @param trigger.processingTime a processing time interval as a string, e.g. '5 seconds',
+#'        '1 minute'. This is a trigger that runs a query periodically based on the processing
+#'        time. If the value is '0 seconds', the query will run as fast as possible; this is the
+#'        default. Only one trigger can be set.
+#' @param trigger.once a logical, must be set to \code{TRUE}. This is a trigger that processes only
+#'        one batch of data in a streaming query then terminates the query. Only one trigger can be
+#'        set.
+#' @param ... additional external data source specific named options.
 #'
 #' @family SparkDataFrame functions
 #' @seealso \link{read.stream}
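As a sketch of the Hive-style layout that the new `partitionBy` documentation refers to, assuming a hypothetical output path and a data frame with `year` and `month` columns:

```r
# Hypothetical paths; assumes df has "year" and "month" columns.
q <- write.stream(df, "parquet", path = "/tmp/events", checkpointLocation = "/tmp/cp",
                  partitionBy = c("year", "month"))
# Expected Hive-style directory layout under /tmp/events, e.g.:
#   /tmp/events/year=2017/month=11/part-....parquet
#   /tmp/events/year=2017/month=12/part-....parquet
```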
@@ -3725,7 +3736,8 @@ setMethod("isStreaming",
 #' # console
 #' q <- write.stream(wordCounts, "console", outputMode = "complete")
 #' # text stream
-#' q <- write.stream(df, "text", path = "/home/user/out", checkpointLocation = "/home/user/cp")
+#' q <- write.stream(df, "text", path = "/home/user/out", checkpointLocation = "/home/user/cp",
+#'                   partitionBy = c("year", "month"), trigger.processingTime = "30 seconds")
 #' # memory stream
 #' q <- write.stream(wordCounts, "memory", queryName = "outs", outputMode = "complete")
 #' head(sql("SELECT * from outs"))
@@ -3737,7 +3749,8 @@ setMethod("isStreaming",
 #' @note experimental
 setMethod("write.stream",
           signature(df = "SparkDataFrame"),
-          function(df, source = NULL, outputMode = NULL, ...) {
+          function(df, source = NULL, outputMode = NULL, partitionBy = NULL,
+                   trigger.processingTime = NULL, trigger.once = NULL, ...) {
             if (!is.null(source) && !is.character(source)) {
               stop("source should be character, NULL or omitted. It is the data source specified ",
                    "in 'spark.sql.sources.default' configuration by default.")
@@ -3748,12 +3761,43 @@ setMethod("write.stream",
             if (is.null(source)) {
               source <- getDefaultSqlSource()
             }
+            cols <- NULL
+            if (!is.null(partitionBy)) {
+              if (!all(sapply(partitionBy, function(c) { is.character(c) }))) {
+                stop("All partitionBy column names should be characters.")
+              }
+              cols <- as.list(partitionBy)
+            }
+            jtrigger <- NULL
+            if (!is.null(trigger.processingTime) && !is.na(trigger.processingTime)) {
+              if (!is.null(trigger.once)) {
+                stop("Multiple triggers not allowed.")
+              }
+              interval <- as.character(trigger.processingTime)
+              if (nchar(interval) == 0) {
+                stop("Value for trigger.processingTime must be a non-empty string.")
+              }
+              jtrigger <- handledCallJStatic("org.apache.spark.sql.streaming.Trigger",
+                                             "ProcessingTime",
+                                             interval)
+            } else if (!is.null(trigger.once) && !is.na(trigger.once)) {
+              if (!is.logical(trigger.once) || !trigger.once) {
+                stop("Value for trigger.once must be TRUE.")
+              }
+              jtrigger <- callJStatic("org.apache.spark.sql.streaming.Trigger", "Once")
+            }
             options <- varargsToStrEnv(...)
             write <- handledCallJMethod(df@sdf, "writeStream")
             write <- callJMethod(write, "format", source)
             if (!is.null(outputMode)) {
               write <- callJMethod(write, "outputMode", outputMode)
             }
+            if (!is.null(cols)) {
+              write <- callJMethod(write, "partitionBy", cols)
+            }
+            if (!is.null(jtrigger)) {
+              write <- callJMethod(write, "trigger", jtrigger)
+            }
             write <- callJMethod(write, "options", options)
             ssq <- handledCallJMethod(write, "start")
             streamingQuery(ssq)
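A short usage sketch of the trigger handling added above; the paths are hypothetical. `trigger.once = TRUE` runs a single batch and then terminates the query, and supplying both triggers hits the "Multiple triggers not allowed." error:

```r
# Hypothetical paths. trigger.once processes exactly one batch, then the query stops.
q <- write.stream(df, "parquet", path = "/tmp/out", checkpointLocation = "/tmp/cp",
                  trigger.once = TRUE)
awaitTermination(q, 30000)  # wait up to 30 seconds (milliseconds)

# Rejected with "Multiple triggers not allowed.":
# write.stream(df, "console", trigger.processingTime = "5 seconds", trigger.once = TRUE)
```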
@@ -3967,3 +4011,47 @@ setMethod("broadcast",
             sdf <- callJStatic("org.apache.spark.sql.functions", "broadcast", x@sdf)
             dataFrame(sdf)
           })
+
+#' withWatermark
+#'
+#' Defines an event time watermark for this streaming SparkDataFrame. A watermark tracks a point in
+#' time before which we assume no more late data is going to arrive.
+#'
+#' Spark will use this watermark for several purposes:
+#' \itemize{
+#'  \item To know when a given time window aggregation can be finalized and thus can be emitted
+#' when using output modes that do not allow updates.
+#'  \item To minimize the amount of state that we need to keep for on-going aggregations.
+#' }
+#' The current watermark is computed by looking at the \code{MAX(eventTime)} seen across
+#' all of the partitions in the query minus a user specified \code{delayThreshold}. Due to the cost
+#' of coordinating this value across partitions, the actual watermark used is only guaranteed
+#' to be at least \code{delayThreshold} behind the actual event time. In some cases we may still
+#' process records that arrive more than \code{delayThreshold} late.
+#'
+#' @param x a streaming SparkDataFrame
+#' @param eventTime a string specifying the name of the Column that contains the event time of the
+#'                  row.
+#' @param delayThreshold a string specifying the minimum delay to wait for data to arrive late,
+#'                       relative to the latest record that has been processed in the form of an
+#'                       interval (e.g. "1 minute" or "5 hours"). NOTE: This should not be negative.
+#' @return a SparkDataFrame.
+#' @aliases withWatermark,SparkDataFrame,character,character-method
+#' @family SparkDataFrame functions
+#' @rdname withWatermark
+#' @name withWatermark
+#' @export
+#' @examples
+#' \dontrun{
+#' sparkR.session()
+#' schema <- structType(structField("time", "timestamp"), structField("value", "double"))
+#' df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1)
+#' df <- withWatermark(df, "time", "10 minutes")
+#' }
+#' @note withWatermark since 2.3.0
+setMethod("withWatermark",
+          signature(x = "SparkDataFrame", eventTime = "character", delayThreshold = "character"),
+          function(x, eventTime, delayThreshold) {
+            sdf <- callJMethod(x@sdf, "withWatermark", eventTime, delayThreshold)
+            dataFrame(sdf)
+          })
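For context on how the new method composes with the rest of the streaming API, a sketch of a windowed aggregation gated by the watermark; `jsonDir` is a hypothetical input directory, as in the roxygen example above:

```r
# Sketch: watermarked windowed count written in "append" mode.
schema <- structType(structField("time", "timestamp"), structField("value", "double"))
df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1)
df <- withWatermark(df, "time", "10 minutes")

# With a watermark, "append" output mode can emit a window once it is considered final.
counts <- count(groupBy(df, window(df$time, "5 minutes")))
q <- write.stream(counts, "memory", queryName = "counts", outputMode = "append")
head(sql("SELECT * FROM counts"))
```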