Skip to content

Commit fc01ddb

Browse files
committed
Add support for Hive's LOAD DATA expr
1 parent 334a5bf commit fc01ddb

File tree

7 files changed

+277
-11
lines changed

7 files changed

+277
-11
lines changed

src/ast/mod.rs

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3317,6 +3317,22 @@ pub enum Statement {
33173317
channel: Ident,
33183318
payload: Option<String>,
33193319
},
3320+
/// ```sql
3321+
/// LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE tablename
3322+
/// [PARTITION (partcol1=val1, partcol2=val2 ...)]
3323+
/// [INPUTFORMAT 'inputformat' SERDE 'serde']
3324+
/// ```
3325+
/// Loading files into tables
3326+
///
3327+
/// See Hive <https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=27362036#LanguageManualDML-Loadingfilesintotables>
3328+
LoadData {
3329+
local: bool,
3330+
inpath: String,
3331+
overwrite: bool,
3332+
table_name: ObjectName,
3333+
partitioned: Option<Vec<Expr>>,
3334+
table_format: Option<HiveLoadDataOption>,
3335+
},
33203336
}
33213337

33223338
impl fmt::Display for Statement {
@@ -3919,6 +3935,36 @@ impl fmt::Display for Statement {
39193935
Ok(())
39203936
}
39213937
Statement::CreateTable(create_table) => create_table.fmt(f),
3938+
Statement::LoadData {
3939+
local,
3940+
inpath,
3941+
overwrite,
3942+
table_name,
3943+
partitioned,
3944+
table_format,
3945+
} => {
3946+
write!(
3947+
f,
3948+
"LOAD DATA {local}INPATH '{inpath}' {overwrite}INTO TABLE {table_name}",
3949+
local = if *local { "LOCAL " } else { "" },
3950+
inpath = inpath,
3951+
overwrite = if *overwrite { "OVERWRITE " } else { "" },
3952+
table_name = table_name,
3953+
)?;
3954+
if let Some(ref parts) = &partitioned {
3955+
if !parts.is_empty() {
3956+
write!(f, " PARTITION ({})", display_comma_separated(parts))?;
3957+
}
3958+
}
3959+
if let Some(HiveLoadDataOption {
3960+
serde,
3961+
input_format,
3962+
}) = &table_format
3963+
{
3964+
write!(f, " INPUTFORMAT {input_format} SERDE {serde}")?;
3965+
}
3966+
Ok(())
3967+
}
39223968
Statement::CreateVirtualTable {
39233969
name,
39243970
if_not_exists,
@@ -5811,6 +5857,14 @@ pub enum HiveRowFormat {
58115857
DELIMITED { delimiters: Vec<HiveRowDelimiter> },
58125858
}
58135859

5860+
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
5861+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
5862+
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
5863+
pub struct HiveLoadDataOption {
5864+
pub serde: Expr,
5865+
pub input_format: Expr,
5866+
}
5867+
58145868
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
58155869
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
58165870
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]

src/dialect/duckdb.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,4 +66,9 @@ impl Dialect for DuckDbDialect {
6666
fn supports_explain_with_utility_options(&self) -> bool {
6767
true
6868
}
69+
70+
/// See DuckDB <https://duckdb.org/docs/sql/statements/load_and_install.html#load>
71+
fn supports_load_extension(&self) -> bool {
72+
true
73+
}
6974
}

src/dialect/hive.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,4 +51,9 @@ impl Dialect for HiveDialect {
5151
fn require_interval_qualifier(&self) -> bool {
5252
true
5353
}
54+
55+
/// See Hive <https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=27362036#LanguageManualDML-Loadingfilesintotables>
56+
fn supports_load_data(&self) -> bool {
57+
true
58+
}
5459
}

src/dialect/mod.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -601,6 +601,16 @@ pub trait Dialect: Debug + Any {
601601
false
602602
}
603603

604+
/// Returns true if the dialect supports the `LOAD DATA` statement
605+
fn supports_load_data(&self) -> bool {
606+
false
607+
}
608+
609+
/// Returns true if the dialect supports the `LOAD extension` statement
610+
fn supports_load_extension(&self) -> bool {
611+
false
612+
}
613+
604614
/// Returns true if this dialect expects the the `TOP` option
605615
/// before the `ALL`/`DISTINCT` options in a `SELECT` statement.
606616
fn supports_top_before_distinct(&self) -> bool {

src/keywords.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,7 @@ define_keywords!(
387387
INITIALLY,
388388
INNER,
389389
INOUT,
390+
INPATH,
390391
INPUT,
391392
INPUTFORMAT,
392393
INSENSITIVE,

src/parser/mod.rs

Lines changed: 48 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,16 +27,15 @@ use core::{
2727

2828
use log::debug;
2929

30-
use recursion::RecursionCounter;
31-
use IsLateral::*;
32-
use IsOptional::*;
33-
3430
use crate::ast::helpers::stmt_create_table::{CreateTableBuilder, CreateTableConfiguration};
3531
use crate::ast::Statement::CreatePolicy;
3632
use crate::ast::*;
3733
use crate::dialect::*;
3834
use crate::keywords::{Keyword, ALL_KEYWORDS};
3935
use crate::tokenizer::*;
36+
use recursion::RecursionCounter;
37+
use IsLateral::*;
38+
use IsOptional::*;
4039

4140
mod alter;
4241

@@ -543,10 +542,7 @@ impl<'a> Parser<'a> {
543542
Keyword::INSTALL if dialect_of!(self is DuckDbDialect | GenericDialect) => {
544543
self.parse_install()
545544
}
546-
// `LOAD` is duckdb specific https://duckdb.org/docs/extensions/overview
547-
Keyword::LOAD if dialect_of!(self is DuckDbDialect | GenericDialect) => {
548-
self.parse_load()
549-
}
545+
Keyword::LOAD => self.parse_load(),
550546
// `OPTIMIZE` is clickhouse specific https://clickhouse.tech/docs/en/sql-reference/statements/optimize/
551547
Keyword::OPTIMIZE if dialect_of!(self is ClickHouseDialect | GenericDialect) => {
552548
self.parse_optimize_table()
@@ -11040,6 +11036,22 @@ impl<'a> Parser<'a> {
1104011036
}
1104111037
}
1104211038

11039+
pub fn parse_load_data_table_format(
11040+
&mut self,
11041+
) -> Result<Option<HiveLoadDataOption>, ParserError> {
11042+
if self.parse_keyword(Keyword::INPUTFORMAT) {
11043+
let input_format = self.parse_expr()?;
11044+
self.expect_keyword(Keyword::SERDE)?;
11045+
let serde = self.parse_expr()?;
11046+
Ok(Some(HiveLoadDataOption {
11047+
input_format,
11048+
serde,
11049+
}))
11050+
} else {
11051+
Ok(None)
11052+
}
11053+
}
11054+
1104311055
/// Parse an UPDATE statement, returning a `Box`ed SetExpr
1104411056
///
1104511057
/// This is used to reduce the size of the stack frames in debug builds
@@ -12042,10 +12054,35 @@ impl<'a> Parser<'a> {
1204212054
Ok(Statement::Install { extension_name })
1204312055
}
1204412056

12045-
/// `LOAD [extension_name]`
12057+
/// Parse a SQL LOAD statement
1204612058
pub fn parse_load(&mut self) -> Result<Statement, ParserError> {
12047-
let extension_name = self.parse_identifier(false)?;
12048-
Ok(Statement::Load { extension_name })
12059+
if self.dialect.supports_load_extension() {
12060+
let extension_name = self.parse_identifier(false)?;
12061+
Ok(Statement::Load { extension_name })
12062+
} else if self.parse_keyword(Keyword::DATA) && self.dialect.supports_load_data() {
12063+
let local = self.parse_one_of_keywords(&[Keyword::LOCAL]).is_some();
12064+
self.expect_keyword(Keyword::INPATH)?;
12065+
let inpath = self.parse_literal_string()?;
12066+
let overwrite = self.parse_one_of_keywords(&[Keyword::OVERWRITE]).is_some();
12067+
self.expect_keyword(Keyword::INTO)?;
12068+
self.expect_keyword(Keyword::TABLE)?;
12069+
let table_name = self.parse_object_name(false)?;
12070+
let partitioned = self.parse_insert_partition()?;
12071+
let table_format = self.parse_load_data_table_format()?;
12072+
Ok(Statement::LoadData {
12073+
local,
12074+
inpath,
12075+
overwrite,
12076+
table_name,
12077+
partitioned,
12078+
table_format,
12079+
})
12080+
} else {
12081+
self.expected(
12082+
"Expected: dialect supports `LOAD DATA` or `LOAD extension` to parse `LOAD` statements",
12083+
self.peek_token(),
12084+
)
12085+
}
1204912086
}
1205012087

1205112088
/// ```sql

tests/sqlparser_common.rs

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11523,6 +11523,160 @@ fn parse_notify_channel() {
1152311523
}
1152411524
}
1152511525

11526+
#[test]
11527+
fn parse_load_data() {
11528+
let dialects = all_dialects_where(|d| d.supports_load_data());
11529+
11530+
match dialects
11531+
.verified_stmt("LOAD DATA INPATH '/local/path/to/data.txt' INTO TABLE test.my_table")
11532+
{
11533+
Statement::LoadData {
11534+
local,
11535+
inpath,
11536+
overwrite,
11537+
table_name,
11538+
partitioned,
11539+
table_format,
11540+
} => {
11541+
assert_eq!(false, local);
11542+
assert_eq!("/local/path/to/data.txt", inpath);
11543+
assert_eq!(false, overwrite);
11544+
assert_eq!(
11545+
ObjectName(vec![Ident::new("test"), Ident::new("my_table")]),
11546+
table_name
11547+
);
11548+
assert_eq!(None, partitioned);
11549+
assert_eq!(None, table_format);
11550+
}
11551+
_ => unreachable!(),
11552+
};
11553+
11554+
// with OVERWRITE keyword
11555+
match dialects
11556+
.verified_stmt("LOAD DATA INPATH '/local/path/to/data.txt' OVERWRITE INTO TABLE my_table")
11557+
{
11558+
Statement::LoadData {
11559+
local,
11560+
inpath,
11561+
overwrite,
11562+
table_name,
11563+
partitioned,
11564+
table_format,
11565+
} => {
11566+
assert_eq!(false, local);
11567+
assert_eq!("/local/path/to/data.txt", inpath);
11568+
assert_eq!(true, overwrite);
11569+
assert_eq!(ObjectName(vec![Ident::new("my_table")]), table_name);
11570+
assert_eq!(None, partitioned);
11571+
assert_eq!(None, table_format);
11572+
}
11573+
_ => unreachable!(),
11574+
};
11575+
11576+
// with LOCAL keyword
11577+
match dialects
11578+
.verified_stmt("LOAD DATA LOCAL INPATH '/local/path/to/data.txt' INTO TABLE test.my_table")
11579+
{
11580+
Statement::LoadData {
11581+
local,
11582+
inpath,
11583+
overwrite,
11584+
table_name,
11585+
partitioned,
11586+
table_format,
11587+
} => {
11588+
assert_eq!(true, local);
11589+
assert_eq!("/local/path/to/data.txt", inpath);
11590+
assert_eq!(false, overwrite);
11591+
assert_eq!(
11592+
ObjectName(vec![Ident::new("test"), Ident::new("my_table")]),
11593+
table_name
11594+
);
11595+
assert_eq!(None, partitioned);
11596+
assert_eq!(None, table_format);
11597+
}
11598+
_ => unreachable!(),
11599+
};
11600+
11601+
// with PARTITION clause
11602+
match dialects.verified_stmt("LOAD DATA LOCAL INPATH '/local/path/to/data.txt' INTO TABLE my_table PARTITION (year = 2024, month = 11)") {
11603+
Statement::LoadData {local, inpath, overwrite, table_name, partitioned, table_format} => {
11604+
assert_eq!(true, local);
11605+
assert_eq!("/local/path/to/data.txt", inpath);
11606+
assert_eq!(false, overwrite);
11607+
assert_eq!(ObjectName(vec![Ident::new("my_table")]), table_name);
11608+
assert_eq!(Some(vec![
11609+
Expr::BinaryOp{
11610+
left: Box::new(Expr::Identifier(Ident::new("year"))),
11611+
op: BinaryOperator::Eq,
11612+
right: Box::new(Expr::Value(Value::Number("2024".parse().unwrap(), false))),
11613+
},
11614+
Expr::BinaryOp{
11615+
left: Box::new(Expr::Identifier(Ident::new("month"))),
11616+
op: BinaryOperator::Eq,
11617+
right: Box::new(Expr::Value(Value::Number("11".parse().unwrap(), false))),
11618+
}]), partitioned);
11619+
assert_eq!(None, table_format);
11620+
}
11621+
_ => unreachable!(),
11622+
};
11623+
11624+
// with PARTITION clause
11625+
match dialects.verified_stmt("LOAD DATA LOCAL INPATH '/local/path/to/data.txt' OVERWRITE INTO TABLE good.my_table PARTITION (year = 2024, month = 11) INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'") {
11626+
Statement::LoadData {local, inpath, overwrite, table_name, partitioned, table_format} => {
11627+
assert_eq!(true, local);
11628+
assert_eq!("/local/path/to/data.txt", inpath);
11629+
assert_eq!(true, overwrite);
11630+
assert_eq!(ObjectName(vec![Ident::new("good"), Ident::new("my_table")]), table_name);
11631+
assert_eq!(Some(vec![
11632+
Expr::BinaryOp{
11633+
left: Box::new(Expr::Identifier(Ident::new("year"))),
11634+
op: BinaryOperator::Eq,
11635+
right: Box::new(Expr::Value(Value::Number("2024".parse().unwrap(), false))),
11636+
},
11637+
Expr::BinaryOp{
11638+
left: Box::new(Expr::Identifier(Ident::new("month"))),
11639+
op: BinaryOperator::Eq,
11640+
right: Box::new(Expr::Value(Value::Number("11".parse().unwrap(), false))),
11641+
}]), partitioned);
11642+
assert_eq!(Some(HiveLoadDataOption {serde: Expr::Value(Value::SingleQuotedString("org.apache.hadoop.hive.serde2.OpenCSVSerde".to_string())), input_format: Expr::Value(Value::SingleQuotedString("org.apache.hadoop.mapred.TextInputFormat".to_string()))}), table_format);
11643+
}
11644+
_ => unreachable!(),
11645+
};
11646+
11647+
let dialects = all_dialects_where(|d| !d.supports_load_data() && d.supports_load_extension());
11648+
11649+
assert_eq!(
11650+
dialects
11651+
.parse_sql_statements(
11652+
"LOAD DATA LOCAL INPATH '/local/path/to/data.txt' INTO TABLE test.my_table"
11653+
)
11654+
.unwrap_err(),
11655+
ParserError::ParserError("Expected: end of statement, found: LOCAL".to_string())
11656+
);
11657+
11658+
assert_eq!(
11659+
dialects
11660+
.parse_sql_statements(
11661+
"LOAD DATA INPATH '/local/path/to/data.txt' INTO TABLE test.my_table"
11662+
)
11663+
.unwrap_err(),
11664+
ParserError::ParserError("Expected: end of statement, found: INPATH".to_string())
11665+
);
11666+
11667+
let dialects = all_dialects_where(|d| !d.supports_load_data() && !d.supports_load_extension());
11668+
11669+
assert_eq!(
11670+
dialects.parse_sql_statements("LOAD DATA LOCAL INPATH '/local/path/to/data.txt' INTO TABLE test.my_table").unwrap_err(),
11671+
ParserError::ParserError("Expected: Expected: dialect supports `LOAD DATA` or `LOAD extension` to parse `LOAD` statements, found: LOCAL".to_string())
11672+
);
11673+
11674+
assert_eq!(
11675+
dialects.parse_sql_statements("LOAD DATA INPATH '/local/path/to/data.txt' INTO TABLE test.my_table").unwrap_err(),
11676+
ParserError::ParserError("Expected: Expected: dialect supports `LOAD DATA` or `LOAD extension` to parse `LOAD` statements, found: INPATH".to_string())
11677+
);
11678+
}
11679+
1152611680
#[test]
1152711681
fn test_select_top() {
1152811682
let dialects = all_dialects_where(|d| d.supports_top_before_distinct());

0 commit comments

Comments
 (0)