@@ -2271,12 +2271,13 @@ setMethod("dropDuplicates",
22712271
22722272# ' Join
22732273# '
2274- # ' Join two SparkDataFrames based on the given join expression.
2274+ # ' Joins two SparkDataFrames based on the given join expression.
22752275# '
22762276# ' @param x A SparkDataFrame
22772277# ' @param y A SparkDataFrame
22782278# ' @param joinExpr (Optional) The expression used to perform the join. joinExpr must be a
2279- # ' Column expression. If joinExpr is omitted, join() will perform a Cartesian join
2279+ # ' Column expression. If joinExpr is omitted, an inner join is attempted by default and an error is
2280+ # ' thrown if it would be a Cartesian product. For a Cartesian join, use crossJoin instead.
22802281# ' @param joinType The type of join to perform. The following join types are available:
22812282# ' 'inner', 'outer', 'full', 'fullouter', 'leftouter', 'left_outer', 'left',
22822283# ' 'right_outer', 'rightouter', 'right', and 'leftsemi'. The default joinType is "inner".
@@ -2285,23 +2286,24 @@ setMethod("dropDuplicates",
22852286# ' @aliases join,SparkDataFrame,SparkDataFrame-method
22862287# ' @rdname join
22872288# ' @name join
2288- # ' @seealso \link{merge}
2289+ # ' @seealso \link{merge} \link{crossJoin}
22892290# ' @export
22902291# ' @examples
22912292# '\dontrun{
22922293# ' sparkR.session()
22932294# ' df1 <- read.json(path)
22942295# ' df2 <- read.json(path2)
2295- # ' join(df1, df2) # Performs a Cartesian
22962296# ' join(df1, df2, df1$col1 == df2$col2) # Performs an inner join based on expression
22972297# ' join(df1, df2, df1$col1 == df2$col2, "right_outer")
2298+ # ' join(df1, df2) # Attempts an inner join
22982299# ' }
22992300# ' @note join since 1.4.0
23002301setMethod ("join ",
23012302 signature(x = " SparkDataFrame" , y = " SparkDataFrame" ),
23022303 function (x , y , joinExpr = NULL , joinType = NULL ) {
23032304 if (is.null(joinExpr )) {
2304- sdf <- callJMethod(x @ sdf , " crossJoin" , y @ sdf )
2305+ # this may not fail until the planner checks for Cartesian join later on.
2306+ sdf <- callJMethod(x @ sdf , " join" , y @ sdf )
23052307 } else {
23062308 if (class(joinExpr ) != " Column" ) stop(" joinExpr must be a Column" )
23072309 if (is.null(joinType )) {
@@ -2322,22 +2324,52 @@ setMethod("join",
23222324 dataFrame(sdf )
23232325 })
23242326
2327+ # ' CrossJoin
2328+ # '
2329+ # ' Returns the Cartesian product of two SparkDataFrames.
2330+ # '
2331+ # ' @param x A SparkDataFrame
2332+ # ' @param y A SparkDataFrame
2333+ # ' @return A SparkDataFrame containing the result of the join operation.
2334+ # ' @family SparkDataFrame functions
2335+ # ' @aliases crossJoin,SparkDataFrame,SparkDataFrame-method
2336+ # ' @rdname crossJoin
2337+ # ' @name crossJoin
2338+ # ' @seealso \link{merge} \link{join}
2339+ # ' @export
2340+ # ' @examples
2341+ # '\dontrun{
2342+ # ' sparkR.session()
2343+ # ' df1 <- read.json(path)
2344+ # ' df2 <- read.json(path2)
2345+ # ' crossJoin(df1, df2) # Performs a Cartesian join
2346+ # ' }
2347+ # ' @note crossJoin since 2.1.0
2348+ setMethod ("crossJoin ",
2349+ signature(x = " SparkDataFrame" , y = " SparkDataFrame" ),
2350+ function (x , y ) {
2351+ sdf <- callJMethod(x @ sdf , " crossJoin" , y @ sdf )
2352+ dataFrame(sdf )
2353+ })
2354+
23252355# ' Merges two data frames
23262356# '
23272357# ' @name merge
2328- # ' @param x the first data frame to be joined
2329- # ' @param y the second data frame to be joined
2358+ # ' @param x the first data frame to be joined.
2359+ # ' @param y the second data frame to be joined.
23302360# ' @param by a character vector specifying the join columns. If by is not
23312361# ' specified, the common column names in \code{x} and \code{y} will be used.
2362+ # ' If by or both by.x and by.y are explicitly set to NULL or of length 0, the Cartesian
2363+ # ' Product of x and y will be returned.
23322364# ' @param by.x a character vector specifying the joining columns for x.
23332365# ' @param by.y a character vector specifying the joining columns for y.
23342366# ' @param all a boolean value setting \code{all.x} and \code{all.y}
23352367# ' if any of them are unset.
23362368# ' @param all.x a boolean value indicating whether all the rows in x should
2337- # ' be including in the join
2369+ # ' be included in the join.
23382370# ' @param all.y a boolean value indicating whether all the rows in y should
2339- # ' be including in the join
2340- # ' @param sort a logical argument indicating whether the resulting columns should be sorted
2371+ # ' be included in the join.
2372+ # ' @param sort a logical argument indicating whether the resulting columns should be sorted.
23412373# ' @param suffixes a string vector of length 2 used to make colnames of
23422374# ' \code{x} and \code{y} unique.
23432375# ' The first element is appended to each colname of \code{x}.
@@ -2351,20 +2383,21 @@ setMethod("join",
23512383# ' @family SparkDataFrame functions
23522384# ' @aliases merge,SparkDataFrame,SparkDataFrame-method
23532385# ' @rdname merge
2354- # ' @seealso \link{join}
2386+ # ' @seealso \link{join} \link{crossJoin}
23552387# ' @export
23562388# ' @examples
23572389# '\dontrun{
23582390# ' sparkR.session()
23592391# ' df1 <- read.json(path)
23602392# ' df2 <- read.json(path2)
2361- # ' merge(df1, df2) # Performs a Cartesian
2393+ # ' merge(df1, df2) # Performs an inner join by common columns
23622394# ' merge(df1, df2, by = "col1") # Performs an inner join based on expression
23632395# ' merge(df1, df2, by.x = "col1", by.y = "col2", all.y = TRUE)
23642396# ' merge(df1, df2, by.x = "col1", by.y = "col2", all.x = TRUE)
23652397# ' merge(df1, df2, by.x = "col1", by.y = "col2", all.x = TRUE, all.y = TRUE)
23662398# ' merge(df1, df2, by.x = "col1", by.y = "col2", all = TRUE, sort = FALSE)
23672399# ' merge(df1, df2, by = "col1", all = TRUE, suffixes = c("-X", "-Y"))
2400+ # ' merge(df1, df2, by = NULL) # Performs a Cartesian join
23682401# ' }
23692402# ' @note merge since 1.5.0
23702403setMethod ("merge ",
@@ -2401,7 +2434,7 @@ setMethod("merge",
24012434 joinY <- by
24022435 } else {
24032436 # if by or both by.x and by.y have length 0, use Cartesian Product
2404- joinRes <- join (x , y )
2437+ joinRes <- crossJoin (x , y )
24052438 return (joinRes )
24062439 }
24072440
0 commit comments