Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.aggregate._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.plans.{Inner, JoinType}
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, ScalaReflection, SqlParser}
import org.apache.spark.sql.execution.{EvaluatePython, ExplainCommand, FileRelation, LogicalRDD, QueryExecution, Queryable, SQLExecution}
import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, LogicalRelation}
Expand Down Expand Up @@ -499,10 +499,8 @@ class DataFrame private[sql](
// Analyze the self join. The assumption is that the analyzer will disambiguate left vs right
// by creating a new instance for one of the branch.
val joined = sqlContext.executePlan(
Join(logicalPlan, right.logicalPlan, joinType = Inner, None)).analyzed.asInstanceOf[Join]
Join(logicalPlan, right.logicalPlan, JoinType(joinType), None)).analyzed.asInstanceOf[Join]

// Project only one of the join columns.
val joinedCols = usingColumns.map(col => withPlan(joined.right).resolve(col))
val condition = usingColumns.map { col =>
catalyst.expressions.EqualTo(
withPlan(joined.left).resolve(col),
Expand All @@ -511,9 +509,26 @@ class DataFrame private[sql](
catalyst.expressions.And(cond, eqTo)
}

// Project only one of the join columns.
val joinedCols = JoinType(joinType) match {
case Inner | LeftOuter | LeftSemi =>
usingColumns.map(col => withPlan(joined.left).resolve(col))
case RightOuter =>
usingColumns.map(col => withPlan(joined.right).resolve(col))
case FullOuter =>
usingColumns.map { col =>
val leftCol = withPlan(joined.left).resolve(col)
val rightCol = withPlan(joined.right).resolve(col)
Alias(Coalesce(Seq(leftCol, rightCol)), col)()
}
}
// The nullability of the output of `joined` could differ from that of the original
// columns, so we can only compare them by exprId
val joinRefs = condition.map(_.references.toSeq.map(_.exprId)).getOrElse(Nil)
val resultCols = joinedCols ++ joined.output.filterNot(e => joinRefs.contains(e.exprId))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Here we can use an AttributeSet for convenience:

val joinRefs = AttributeSet(condition.toSeq.flatMap(_.references))
val resultCols = joinedCols ++ joined.output.filterNot(joinRefs.contains)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe not — as the comment says, we can't compare `Attribute`s here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but that is exactly what an AttributeSet is for. It is a set that only compares the id.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@liancheng @marmbrus Oh, I see, missed that, thanks!

withPlan {
Project(
joined.output.filterNot(joinedCols.contains(_)),
resultCols,
Join(
joined.left,
joined.right,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,28 @@ class DataFrameJoinSuite extends QueryTest with SharedSQLContext {
}

test("join - join using multiple columns and specifying join type") {
    // Two frames sharing the join keys ("int", "str") plus one non-key column each.
    // Only row (1, "1") matches across both frames; (3, "3") exists only in df and
    // (5, "5") only in df2, so the outer-join variants exercise null-filling.
    val df = Seq((1, 2, "1"), (3, 4, "3")).toDF("int", "int2", "str")
    val df2 = Seq((1, 3, "1"), (5, 6, "5")).toDF("int", "int2", "str")

    // Inner: only the matching key pair survives; join columns appear once.
    checkAnswer(
      df.join(df2, Seq("int", "str"), "inner"),
      Row(1, "1", 2, 3) :: Nil)

    // Left outer: unmatched left row (3, "3") keeps nulls for df2's columns.
    checkAnswer(
      df.join(df2, Seq("int", "str"), "left"),
      Row(1, "1", 2, 3) :: Row(3, "3", 4, null) :: Nil)

    // Right outer: unmatched right row (5, "5") keeps nulls for df's columns.
    checkAnswer(
      df.join(df2, Seq("int", "str"), "right"),
      Row(1, "1", 2, 3) :: Row(5, "5", null, 6) :: Nil)

    // Full outer: join columns are coalesced, so keys from either side survive.
    checkAnswer(
      df.join(df2, Seq("int", "str"), "outer"),
      Row(1, "1", 2, 3) :: Row(3, "3", 4, null) :: Row(5, "5", null, 6) :: Nil)

    // Left semi: only left columns are produced, filtered to matching keys.
    checkAnswer(
      df.join(df2, Seq("int", "str"), "left_semi"),
      Row(1, "1", 2) :: Nil)
  }

test("join - join using self join") {
Expand Down