From 7b82a6f507e97935449ee71c4d07bc587c159d49 Mon Sep 17 00:00:00 2001 From: Abraham Egnor Date: Tue, 3 Jun 2025 11:34:57 -0400 Subject: [PATCH] RUST-1798 use simd to optimize utf8 validation Co-authored-by: Liyixin95 <601947961@qq.com> --- Cargo.lock | 7 +++++++ Cargo.toml | 1 + src/raw/error.rs | 6 ++---- src/raw/mod.rs | 2 +- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cf5216a3..4881faa0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -140,6 +140,7 @@ dependencies = [ "serde_json", "serde_path_to_error", "serde_with", + "simdutf8", "time", "uuid", ] @@ -959,6 +960,12 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + [[package]] name = "strsim" version = "0.11.1" diff --git a/Cargo.toml b/Cargo.toml index 44c2eb37..b2eaf819 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -65,6 +65,7 @@ serde_with = { version = "3.1.0", optional = true } time = { version = "0.3.9", features = ["formatting", "parsing", "macros", "large-dates"] } bitvec = "1.0.1" serde_path_to_error = { version = "0.1.16", optional = true } +simdutf8 = "0.1.5" [target.'cfg(all(target_arch = "wasm32", target_os = "unknown"))'.dependencies] js-sys = "0.3" diff --git a/src/raw/error.rs b/src/raw/error.rs index e6ab8fbc..a9912d90 100644 --- a/src/raw/error.rs +++ b/src/raw/error.rs @@ -1,5 +1,3 @@ -use std::str::Utf8Error; - use crate::spec::ElementType; /// An error that occurs when attempting to parse raw BSON bytes. @@ -44,7 +42,7 @@ pub enum ErrorKind { MalformedValue { message: String }, /// Improper UTF-8 bytes were found when proper UTF-8 was expected. - Utf8EncodingError(Utf8Error), + Utf8EncodingError, } impl std::fmt::Display for Error { @@ -60,7 +58,7 @@ impl std::fmt::Display for Error { ErrorKind::MalformedValue { message } => { write!(f, "{}malformed value: {:?}", prefix, message) } - ErrorKind::Utf8EncodingError(e) => write!(f, "{}utf-8 encoding error: {}", prefix, e), + ErrorKind::Utf8EncodingError => write!(f, "{}utf-8 encoding error", prefix), } } } diff --git a/src/raw/mod.rs b/src/raw/mod.rs index d6bcc2c9..4e1dd0d6 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -271,7 +271,7 @@ fn read_lenencode(buf: &[u8]) -> Result<&str> { } fn try_to_str(data: &[u8]) -> Result<&str> { - std::str::from_utf8(data).map_err(|e| Error::new(ErrorKind::Utf8EncodingError(e))) + simdutf8::basic::from_utf8(data).map_err(|_| Error::new(ErrorKind::Utf8EncodingError)) } fn usize_try_from_i32(i: i32) -> Result {