From 05ff96b9d3326be46a3adc2f9a363d535b632a83 Mon Sep 17 00:00:00 2001 From: Alex Gaynor Date: Wed, 13 Nov 2024 11:53:13 -0500 Subject: [PATCH] Initial work to add xlsx file loading Doesn't compile, and violates the project name. Alas, it's useful! --- Cargo.lock | 188 +++++++++++++++++++++++++++++++++++++++++++++++++- Cargo.toml | 1 + src/lib.rs | 2 +- src/loader.rs | 68 ++++++++++++++++-- src/main.rs | 17 +++-- 5 files changed, 266 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 912c47a3..62a8014b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + [[package]] name = "ahash" version = "0.8.11" @@ -78,6 +84,15 @@ version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" +[[package]] +name = "arbitrary" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223" +dependencies = [ + "derive_arbitrary", +] + [[package]] name = "autocfg" version = "1.4.0" @@ -96,6 +111,27 @@ version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "calamine" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "138646b9af2c5d7f1804ea4bf93afc597737d2bd4f7341d67c48b03316976eb1" +dependencies = [ + "byteorder", + "codepage", + "encoding_rs", + "log", + "quick-xml", + "serde", + "zip", +] + [[package]] name = "cfg-if" version = "1.0.0" @@ -157,6 +193,15 @@ dependencies = [ "error-code", ] +[[package]] +name = "codepage" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4" +dependencies = [ + "encoding_rs", +] + [[package]] name = "colorchoice" version = "1.0.3" @@ -188,6 +233,21 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" + [[package]] name = "crossterm" version = "0.28.1" @@ -236,6 +296,7 @@ name = "csvsql" version = "0.1.0" dependencies = [ "anyhow", + "calamine", "clap", "comfy-table", "csv", @@ -246,6 +307,17 @@ dependencies = [ "rustyline", ] +[[package]] +name = "derive_arbitrary" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "dirs" version = "5.0.1" @@ -267,18 +339,44 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "encode_unicode" version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + [[package]] name = "endian-type" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + [[package]] name = "errno" version = "0.3.9" @@ -318,6 +416,16 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "flate2" +version = "1.0.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -338,13 +446,19 @@ dependencies = [ "ahash", ] +[[package]] +name = "hashbrown" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3" + [[package]] name = "hashlink" version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ba4ff7128dee98c7dc9794b6a411377e1404dba1c97deb8d1a55297bd25d8af" dependencies = [ - "hashbrown", + "hashbrown 0.14.5", ] [[package]] @@ -362,6 +476,16 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "indexmap" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +dependencies = [ + "equivalent", + "hashbrown 0.15.1", +] + [[package]] name = "indicatif" version = "0.17.9" @@ -444,6 +568,12 @@ dependencies = [ "scopeguard", ] +[[package]] +name = "lockfree-object-pool" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9374ef4228402d4b7e403e5838cb880d9ee663314b0a900d5a6aabf0c213552e" + [[package]] name = "log" version = "0.4.22" @@ -456,6 +586,15 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + [[package]] name = "nibble_vec" version = "0.1.0" @@ -539,6 +678,16 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "encoding_rs", + "memchr", +] + [[package]] name = "quote" version = "1.0.37" @@ -694,6 +843,12 @@ dependencies = [ "syn", ] +[[package]] +name = "simd-adler32" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" + [[package]] name = "smallvec" version = "1.13.2" @@ -1058,3 +1213,34 @@ dependencies = [ "quote", "syn", ] + +[[package]] +name = "zip" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc5e4288ea4057ae23afc69a4472434a87a2495cafce6632fd1c4ec9f5cf3494" +dependencies = [ + "arbitrary", + "crc32fast", + "crossbeam-utils", + "displaydoc", + "flate2", + "indexmap", + "memchr", + "thiserror", + "zopfli", +] + +[[package]] +name = "zopfli" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5019f391bac5cf252e93bbcc53d039ffd62c7bfb7c150414d61369afe57e946" +dependencies = [ + "bumpalo", + "crc32fast", + "lockfree-object-pool", + "log", + "once_cell", + "simd-adler32", +] diff --git a/Cargo.toml b/Cargo.toml index 880d5d93..7d6e33c2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,3 +14,4 @@ rustyline = "15" clap = { version = "4.5.21", features = ["derive"] } dirs = "5.0" anyhow = "1.0.93" +calamine = "0.26.1" diff --git a/src/lib.rs b/src/lib.rs index e21ac8cb..b4cd3d19 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,3 @@ pub mod loader; -pub use crate::loader::{CsvLoader, ExactSizeIterable, Loader}; +pub use crate::loader::{CsvLoader, ExactSizeIterable, Loader, XlsxLoader}; diff --git a/src/loader.rs b/src/loader.rs index 62c7a791..98211994 100644 --- a/src/loader.rs +++ b/src/loader.rs @@ -1,12 +1,14 @@ use std::fs::File; use std::iter; +use calamine::Reader; + pub trait ExactSizeIterable { - fn iter(&self) -> impl iter::ExactSizeIterator; + fn iter(&self) -> impl iter::ExactSizeIterator>; } impl ExactSizeIterable for csv::ByteRecord { - fn iter(&self) -> impl iter::ExactSizeIterator { + fn iter(&self) -> impl iter::ExactSizeIterator> { self.into_iter() } } @@ -27,7 +29,7 @@ pub trait Loader { fn progress_position(&self) -> u64; /// Returns the names of fields, as they exist in the underlying data. - fn raw_fields(&mut self) -> anyhow::Result>; + fn raw_fields(&mut self) -> anyhow::Result>>; fn next_record(&mut self) -> Option>; } @@ -68,7 +70,7 @@ impl Loader for CsvLoader<'_> { self.records.reader().position().byte() } - fn raw_fields(&mut self) -> anyhow::Result> { + fn raw_fields(&mut self) -> anyhow::Result>> { Ok(self.records.reader_mut().headers()?.iter()) } @@ -80,3 +82,61 @@ impl Loader for CsvLoader<'_> { } } } + +pub struct XlsxLoader<'a> { + path: &'a str, + data_range: calamine::Range, + pos: usize +} + +impl<'a> XlsxLoader<'a> { + pub fn new(path: &'a str) -> anyhow::Result { + let mut wb: calamine::Xlsx<_> = calamine::open_workbook(path)?; + let data_range = wb + .worksheet_range_at(0) + .ok_or_else(|| anyhow::anyhow!("No worksheet in xlsx"))??; + + Ok(XlsxLoader { path, data_range, pos: 0 }) + } +} + +pub struct XlsxRecord(Vec>); + +impl ExactSizeIterable for XlsxRecord { + fn iter(&self) -> impl iter::ExactSizeIterator> { + self.0.iter() + } +} + +impl Loader for XlsxLoader<'_> { + type RecordType = XlsxRecord; + + fn name(&self) -> &str { + self.path + } + + fn progress_size(&self) -> u64 { + self.data_range.rows().len().try_into().unwrap() + } + + fn progress_position(&self) -> u64 { + todo!() + } + + fn raw_fields(&mut self) -> anyhow::Result>> { + Ok(self + .data_range + .headers() + .ok_or_else(|| anyhow::anyhow!("No rows in xlsx"))? + .into_iter()) + } + + fn next_record(&mut self) -> Option> { + self.pos += 1; + let row = self.data_range.rows().skip(self.pos).next()?; + let record = XlsxRecord(row.iter().map(|v| { + b"abc".to_vec() + }).collect()); + Some(Ok(record)) + } +} diff --git a/src/main.rs b/src/main.rs index 11cfd1d1..a0f78ac5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,5 +1,6 @@ use clap::Parser; use std::iter; +use std::path::Path; use std::sync::LazyLock; use csvsql::ExactSizeIterable; @@ -41,9 +42,16 @@ fn _load_table_from_path( path: &str, delimiter: u8, ) -> anyhow::Result> { - let loader = csvsql::CsvLoader::new(path, delimiter)?; - - _load_table_from_loader(db, table_name, loader) + match Path::new(path).extension().map(|e| e.as_encoded_bytes()) { + Some(b"xlsx") => { + let loader = csvsql::XlsxLoader::new(path)?; + _load_table_from_loader(db, table_name, loader) + } + _ => { + let loader = csvsql::CsvLoader::new(path, delimiter)?; + _load_table_from_loader(db, table_name, loader) + } + } } fn _load_table_from_loader( @@ -57,7 +65,7 @@ fn _load_table_from_loader( let normalized_cols = loader .raw_fields()? - .map(normalize_col) + .map(|c| normalize_col(c.as_ref())) .fold(vec![], |mut v, orig_col| { let mut col = orig_col.clone(); let mut i = 1; @@ -92,6 +100,7 @@ fn _load_table_from_loader( let record = record?; let row = record.iter(); let row_len = row.len(); + let row = row.map(|v| v.as_ref()); if row_len > normalized_cols.len() { anyhow::bail!( "Too many fields on row {}, fields: {:?}",