-
Notifications
You must be signed in to change notification settings - Fork 28.9k
[SPARK-8186][SPARK-8187][SPARK-8194][SPARK-8198][SQL] functions: date_add, date_sub, add_months, months_between, time-interval calculation #7589
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
c506661
1a68e03
87c4b77
42df486
e8a639a
522e91a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.InternalRow | |
| import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback | ||
| import org.apache.spark.sql.catalyst.util.DateTimeUtils | ||
| import org.apache.spark.sql.types._ | ||
| import org.apache.spark.unsafe.types.UTF8String | ||
| import org.apache.spark.unsafe.types.{Interval, UTF8String} | ||
|
|
||
| /** | ||
| * Returns the current date at the start of query evaluation. | ||
|
|
@@ -62,6 +62,53 @@ case class CurrentTimestamp() extends LeafExpression with CodegenFallback { | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Adds a number of days to startdate. | ||
| */ | ||
| case class DateAdd(startDate: Expression, days: Expression) | ||
| extends BinaryExpression with ImplicitCastInputTypes { | ||
|
|
||
| override def left: Expression = startDate | ||
| override def right: Expression = days | ||
|
|
||
| override def inputTypes: Seq[AbstractDataType] = Seq(DateType, IntegerType) | ||
|
|
||
| override def dataType: DataType = DateType | ||
|
|
||
| override def nullSafeEval(start: Any, d: Any): Any = { | ||
| start.asInstanceOf[Int] + d.asInstanceOf[Int] | ||
| } | ||
|
|
||
| override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { | ||
| nullSafeCodeGen(ctx, ev, (sd, d) => { | ||
| s"""${ev.primitive} = $sd + $d;""" | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Subtracts a number of days to startdate. | ||
| */ | ||
| case class DateSub(startDate: Expression, days: Expression) | ||
| extends BinaryExpression with ImplicitCastInputTypes { | ||
| override def left: Expression = startDate | ||
| override def right: Expression = days | ||
|
|
||
| override def inputTypes: Seq[AbstractDataType] = Seq(DateType, IntegerType) | ||
|
|
||
| override def dataType: DataType = DateType | ||
|
|
||
| override def nullSafeEval(start: Any, d: Any): Any = { | ||
| start.asInstanceOf[Int] - d.asInstanceOf[Int] | ||
| } | ||
|
|
||
| override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { | ||
| nullSafeCodeGen(ctx, ev, (sd, d) => { | ||
| s"""${ev.primitive} = $sd - $d;""" | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| case class Hour(child: Expression) extends UnaryExpression with ImplicitCastInputTypes { | ||
|
|
||
| override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType) | ||
|
|
@@ -258,3 +305,125 @@ case class DateFormatClass(left: Expression, right: Expression) extends BinaryEx | |
| }) | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Time Adds Interval. | ||
| */ | ||
| case class TimeAdd(left: Expression, right: Expression) | ||
| extends BinaryExpression with ExpectsInputTypes { | ||
|
|
||
| override def toString: String = s"$left + $right" | ||
| override def inputTypes: Seq[AbstractDataType] = | ||
| Seq(TypeCollection(DateType, TimestampType), IntervalType) | ||
|
|
||
| override def dataType: DataType = TimestampType | ||
|
|
||
| override def nullSafeEval(start: Any, interval: Any): Any = { | ||
| val itvl = interval.asInstanceOf[Interval] | ||
| left.dataType match { | ||
| case DateType => | ||
| DateTimeUtils.dateAddFullInterval( | ||
| start.asInstanceOf[Int], itvl.months, itvl.microseconds) | ||
| case TimestampType => | ||
| DateTimeUtils.timestampAddFullInterval( | ||
| start.asInstanceOf[Long], itvl.months, itvl.microseconds) | ||
| } | ||
| } | ||
|
|
||
| override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { | ||
| val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") | ||
| left.dataType match { | ||
| case DateType => | ||
| defineCodeGen(ctx, ev, (sd, i) => { | ||
| s"""$dtu.dateAddFullInterval($sd, $i.months, $i.microseconds)""" | ||
| }) | ||
| case TimestampType => // TimestampType | ||
| defineCodeGen(ctx, ev, (sd, i) => { | ||
| s"""$dtu.timestampAddFullInterval($sd, $i.months, $i.microseconds)""" | ||
| }) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Time Subtracts Interval. | ||
| */ | ||
| case class TimeSub(left: Expression, right: Expression) | ||
| extends BinaryExpression with ExpectsInputTypes { | ||
|
|
||
| override def toString: String = s"$left - $right" | ||
| override def inputTypes: Seq[AbstractDataType] = | ||
| Seq(TypeCollection(DateType, TimestampType), IntervalType) | ||
|
|
||
| override def dataType: DataType = TimestampType | ||
|
|
||
| override def nullSafeEval(start: Any, interval: Any): Any = { | ||
| val itvl = interval.asInstanceOf[Interval] | ||
| left.dataType match { | ||
| case DateType => | ||
| DateTimeUtils.dateAddFullInterval( | ||
| start.asInstanceOf[Int], 0 - itvl.months, 0 - itvl.microseconds) | ||
| case TimestampType => | ||
| DateTimeUtils.timestampAddFullInterval( | ||
| start.asInstanceOf[Long], 0 - itvl.months, 0 - itvl.microseconds) | ||
| } | ||
| } | ||
|
|
||
| override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { | ||
| val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") | ||
| left.dataType match { | ||
| case DateType => | ||
| defineCodeGen(ctx, ev, (sd, i) => { | ||
| s"""$dtu.dateAddFullInterval($sd, 0 - $i.months, 0 - $i.microseconds)""" | ||
| }) | ||
| case TimestampType => // TimestampType | ||
| defineCodeGen(ctx, ev, (sd, i) => { | ||
| s"""$dtu.timestampAddFullInterval($sd, 0 - $i.months, 0 - $i.microseconds)""" | ||
| }) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Returns the date that is num_months after start_date. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. shall we make argument names consistent with comment? |
||
| */ | ||
| case class AddMonths(left: Expression, right: Expression) | ||
| extends BinaryExpression with ImplicitCastInputTypes { | ||
|
|
||
| override def inputTypes: Seq[AbstractDataType] = Seq(DateType, IntegerType) | ||
|
|
||
| override def dataType: DataType = DateType | ||
|
|
||
| override def nullSafeEval(start: Any, months: Any): Any = { | ||
| DateTimeUtils.dateAddYearMonthInterval(start.asInstanceOf[Int], months.asInstanceOf[Int]) | ||
| } | ||
|
|
||
| override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { | ||
| val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") | ||
| defineCodeGen(ctx, ev, (sd, m) => { | ||
| s"""$dtu.dateAddYearMonthInterval($sd, $m)""" | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Returns number of months between dates date1 and date2. | ||
| */ | ||
| case class MonthsBetween(left: Expression, right: Expression) | ||
| extends BinaryExpression with ImplicitCastInputTypes { | ||
|
|
||
| override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType, TimestampType) | ||
|
|
||
| override def dataType: DataType = DoubleType | ||
|
|
||
| override def nullSafeEval(t1: Any, t2: Any): Any = { | ||
| DateTimeUtils.monthsBetween(t1.asInstanceOf[Long], t2.asInstanceOf[Long]) | ||
| } | ||
|
|
||
| override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { | ||
| val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") | ||
| defineCodeGen(ctx, ev, (l, r) => { | ||
| s"""$dtu.monthsBetween($l, $r)""" | ||
| }) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -573,4 +573,109 @@ object DateTimeUtils { | |
| dayInYear - 334 | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Returns the date value for the first day of the given month. | ||
| * The month is expressed in months since 1.1.1970, starting from 0. | ||
| */ | ||
| def getDaysFromMonths(months: Int): Int = { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 |
||
| val yearSinceEpoch = if (months < 0) months / 12 - 1 else months / 12 | ||
| val monthInYearFromZero = months - yearSinceEpoch * 12 | ||
| val daysFromYears = getDaysFromYears(yearSinceEpoch) | ||
| val febDays = if (isLeapYear(1970 + yearSinceEpoch)) 29 else 28 | ||
| val daysForMonths = Seq(31, febDays, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is a common part from #6986 , have changed there.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. when you are updating this pr, you could reuse |
||
| daysForMonths.slice(0, monthInYearFromZero).sum + daysFromYears | ||
| } | ||
|
|
||
| /** | ||
| * Returns the date value for January 1 of the given year. | ||
| * The year is expressed in years since 1.1.1970, starting from 0. | ||
| */ | ||
| def getDaysFromYears(years: Int): Int = { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| val remainYear = (years % 400 + 400) % 400 | ||
| val cycles = (years - remainYear) / 400 | ||
| val numLeaps = if (remainYear < 130) { | ||
| remainYear / 4 | ||
| } else if (remainYear < 230) { | ||
| remainYear / 4 - 1 | ||
| } else if (remainYear < 330) { | ||
| remainYear / 4 - 2 | ||
| } else { | ||
| remainYear / 4 - 3 | ||
| } | ||
| cycles * (365 * 400 + 397) + remainYear * 365 + numLeaps | ||
| } | ||
|
|
||
| /** | ||
| * Add date and year-month interval. | ||
| * Returns a date value, expressed in days since 1.1.1970. | ||
| */ | ||
| def dateAddYearMonthInterval(days: Int, months: Int): Int = { | ||
| val currentMonth = (getYear(days) - 1970) * 12 + getMonth(days) - 1 + months | ||
| val currentMonthInYear = if (currentMonth < 0) { | ||
| ((currentMonth % 12) + 12) % 12 | ||
| } else { | ||
| currentMonth % 12 | ||
| } | ||
| val currentYear = if (currentMonth < 0) (currentMonth / 12) - 1 else currentMonth / 12 | ||
| val febDays = if (isLeapYear(1970 + currentYear)) 29 else 28 | ||
| val daysForMonths = Seq(31, febDays, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) | ||
| getDaysFromMonths(currentMonth) + math.min( | ||
| getDayOfMonth(days), daysForMonths(currentMonthInYear)) - 1 | ||
| } | ||
|
|
||
| /** | ||
| * Add date and full interval. | ||
| * Returns a timestamp value, expressed in microseconds since 1.1.1970 00:00:00. | ||
| */ | ||
| def dateAddFullInterval(days: Int, months: Int, microseconds: Long): Long = { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why not just use Interval as the second argument? |
||
| daysToMillis(dateAddYearMonthInterval(days, months)) * 1000L + microseconds | ||
| } | ||
|
|
||
| /** | ||
| * Add timestamp and full interval. | ||
| * Returns a timestamp value, expressed in microseconds since 1.1.1970 00:00:00. | ||
| */ | ||
| def timestampAddFullInterval(micros: Long, months: Int, microseconds: Long): Long = { | ||
| val days = millisToDays(micros / 1000L) | ||
| dateAddFullInterval(days, months, microseconds) + micros - daysToMillis(days) * 1000L | ||
| } | ||
|
|
||
| /** | ||
| * Returns the last dayInMonth in the month it belongs to. The date is expressed | ||
| * in days since 1.1.1970. the return value starts from 1. | ||
| */ | ||
| def getLastDayInMonthOfMonth(date: Int): Int = { | ||
| val month = getMonth(date) | ||
|
|
||
| val febDay = if (isLeapYear(getYear(date))) 29 else 28 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. only do this if month == 2 |
||
| val days = Seq(31, febDay, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Make it static |
||
| days(month - 1) | ||
| } | ||
|
|
||
| /** | ||
| * Returns number of months between time1 and time2. time1 and time2 are expressed in | ||
| * microseconds since 1.1.1970 | ||
| */ | ||
| def monthsBetween(time1: Long, time2: Long): Double = { | ||
| val millis1 = time1.asInstanceOf[Long] / 1000L | ||
| val millis2 = time2.asInstanceOf[Long] / 1000L | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no need to cast here |
||
| val date1 = millisToDays(millis1) | ||
| val date2 = millisToDays(millis2) | ||
| val dayInMonth1 = getDayOfMonth(date1) | ||
| val dayInMonth2 = getDayOfMonth(date2) | ||
| val lastDayMonth1 = getLastDayInMonthOfMonth(date1) | ||
| val lastDayMonth2 = getLastDayInMonthOfMonth(date2) | ||
| val months1 = getYear(date1) * 12 + getMonth(date1) - 1 | ||
| val months2 = getYear(date2) * 12 + getMonth(date2) - 1 | ||
| val timeInDay1 = time1 - daysToMillis(date1) * 1000L | ||
| val timeInDay2 = time2 - daysToMillis(date2) * 1000L | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it's a little wired here, can't we just use?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. daysToMillis will also consider timezone offset.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we really need to consider timezone here? time1 and time2 are in same timezone, isn't it?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we are calculating month, so we need to use date to see if they are of same day in month or both last day of month, and to get day info we need to consider offset. |
||
| if (dayInMonth1 == dayInMonth2 || (lastDayMonth1 == dayInMonth1 && | ||
| lastDayMonth2 == dayInMonth2)) { | ||
| (months1 - months2).toDouble | ||
| } else { | ||
| val timesBetween = (timeInDay1 - timeInDay2).toDouble / (MILLIS_PER_DAY * 1000) | ||
| (months1 - months2).toDouble + (dayInMonth1 - dayInMonth2 + timesBetween) / 31.0 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we round this to 8 digits?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sounds reasonable. |
||
| } | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,7 +22,9 @@ import java.text.SimpleDateFormat | |
| import java.util.Calendar | ||
|
|
||
| import org.apache.spark.SparkFunSuite | ||
| import org.apache.spark.sql.catalyst.util.DateTimeUtils | ||
| import org.apache.spark.sql.types.{StringType, TimestampType, DateType} | ||
| import org.apache.spark.unsafe.types.Interval | ||
|
|
||
| class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { | ||
|
|
||
|
|
@@ -246,4 +248,57 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { | |
| } | ||
| } | ||
|
|
||
| test("date_add") { | ||
| checkEvaluation( | ||
| DateAdd(Literal(Date.valueOf("2016-02-28")), Literal(1)), | ||
| DateTimeUtils.fromJavaDate(Date.valueOf("2016-02-29"))) | ||
| checkEvaluation( | ||
| DateAdd(Literal(Date.valueOf("2016-02-28")), Literal(-365)), | ||
| DateTimeUtils.fromJavaDate(Date.valueOf("2015-02-28"))) | ||
| } | ||
|
|
||
| test("date_sub") { | ||
| checkEvaluation( | ||
| DateSub(Literal(Date.valueOf("2015-01-01")), Literal(1)), | ||
| DateTimeUtils.fromJavaDate(Date.valueOf("2014-12-31"))) | ||
| checkEvaluation( | ||
| DateSub(Literal(Date.valueOf("2015-01-01")), Literal(-1)), | ||
| DateTimeUtils.fromJavaDate(Date.valueOf("2015-01-02"))) | ||
| } | ||
|
|
||
| test("time_add") { | ||
| checkEvaluation( | ||
| TimeAdd(Literal(Date.valueOf("2016-01-29")), Literal(new Interval(1, 0))), | ||
| DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2016-02-29 00:00:00"))) | ||
| checkEvaluation( | ||
| TimeAdd(Literal(Date.valueOf("2016-01-31")), Literal(new Interval(1, 2000000.toLong))), | ||
| DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2016-02-29 00:00:02"))) | ||
| } | ||
|
|
||
| test("time_sub") { | ||
| checkEvaluation( | ||
| TimeSub(Literal(Timestamp.valueOf("2016-03-31 10:00:00")), Literal(new Interval(1, 0))), | ||
| DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2016-02-29 10:00:00"))) | ||
| checkEvaluation( | ||
| TimeSub( | ||
| Literal(Timestamp.valueOf("2016-03-30 00:00:01")), | ||
| Literal(new Interval(1, 2000000.toLong))), | ||
| DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2016-02-28 23:59:59"))) | ||
| } | ||
|
|
||
| test("add_months") { | ||
| checkEvaluation(AddMonths(Literal(Date.valueOf( | ||
| "2015-01-30")), Literal(1)), DateTimeUtils.fromJavaDate(Date.valueOf("2015-02-28"))) | ||
| checkEvaluation(AddMonths(Literal(Date.valueOf( | ||
| "2016-03-30")), Literal(-1)), DateTimeUtils.fromJavaDate(Date.valueOf("2016-02-29"))) | ||
| } | ||
|
|
||
| test("months_between") { | ||
| checkEvaluation(MonthsBetween(Literal(Timestamp.valueOf( | ||
| "2015-01-30 11:52:00")), Literal(Timestamp.valueOf("2015-01-30 11:50:00"))), 0.0) | ||
| checkEvaluation(MonthsBetween(Literal(Timestamp.valueOf( | ||
| "2015-01-31 00:00:00")), Literal(Timestamp.valueOf("2015-03-31 22:00:00"))), -2.0) | ||
| checkEvaluation(MonthsBetween(Literal(Timestamp.valueOf( | ||
| "2015-03-31 22:00:00")), Literal(Timestamp.valueOf("2015-02-28 00:00:00"))), 1.0) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. More test cast here to guarantee the behaviour of months_between. It's complicated and many conner cases exist I think
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 |
||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i think we want to support:
timestamp + interval returns timestamp
date + interval returns date
date + int returns date ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, will update today. BTW, How do we tell the difference if we are doing
string + interval/int,
should we just take strings as date?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
string can get implicitly cast to date or timestamp, can't it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, but then we have to decide it is date or timestamp, and that need to be done at run time.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i think we can just put timestamp first, and as a result implicitly cast string to timestamp.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think date_add('2015-07-22', 1) return '2015-07-23 00:00:00' is not so natural...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I just checked with hive 1.2.
hive> select date_add("2015-01-01 00:11:22", 2);
2015-01-03
hive> select date_add(cast("2015-01-01 00:11:22" as timestamp), 2);
2015-01-03
shall we use the same pattern?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
date_addis used for adding days, where does hive supporttimestamp + interval? in+or another special operator?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hive and oracle use + sign
select birthday + interval 3 days from xxx;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK that makes sense. Let's use date for string then.