Skip to content
Merged
4 changes: 2 additions & 2 deletions fuzz/generate_corpus.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use bson::{doc, Bson, Decimal128};
use bson::{cstr, doc, Bson, Decimal128};
use std::{
fs,
io::{Error, ErrorKind},
Expand Down Expand Up @@ -64,7 +64,7 @@ fn generate_type_marker_cases(dir: &Path) -> std::io::Result<()> {
"bool": true,
"date": bson::DateTime::now(),
"null": Bson::Null,
"regex": Bson::RegularExpression(bson::Regex { pattern: "pattern".into(), options: "i".into() }),
"regex": Bson::RegularExpression(bson::Regex { pattern: cstr!("pattern").into(), options: cstr!("i").into() }),
"int32": 123i32,
"timestamp": bson::Timestamp { time: 12345, increment: 1 },
"int64": 123i64,
Expand Down
18 changes: 9 additions & 9 deletions serde-tests/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use serde_json::json;

use super::AllTypes;

use bson::{doc, Bson, JavaScriptCodeWithScope, RawArrayBuf, RawBson, RawDocumentBuf};
use bson::{cstr, doc, Bson, JavaScriptCodeWithScope, RawArrayBuf, RawBson, RawDocumentBuf};

use serde::{Deserialize, Serialize};

Expand Down Expand Up @@ -99,18 +99,18 @@ fn owned_raw_bson() {
});

let mut doc_buf = RawDocumentBuf::new();
doc_buf.append("a", "key").unwrap();
doc_buf.append("number", 12).unwrap();
doc_buf.append("bool", false).unwrap();
doc_buf.append("nu", RawBson::Null).unwrap();
doc_buf.append(cstr!("a"), "key");
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This really is the whole story in this one line - the failure point has been shifted from writing bytes to the document buffer (append and everything that used it) to constructing the string, and that in turn can now be done either at run-time or at compile-time if it's just a literal.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be possible to provide support for the c"string" syntax here, and more generally interop with the equivalent std::ffi types? I think that would be slightly more ergonomic for string literals since it wouldn't require a macro import.

I'm also wondering how users can construct strings with interior null bytes - is that ever doable with a &static str?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately std::ffi::CStr and the c"string" syntax don't don't require that the text be valid UTF8, so there'd still have to be a validation step at encode time if we used those :/. I added documentation to the bson CStr types to call out the differences (and make usage more clear).

Zero bytes are actually completely valid UTF8 under normal circumstances (although why they'd be in there is anyone's guess), and can be constructed in string literals via \0:

let embedded_zero = "foo\0bar";
println!("{embedded_zero}\n{:?}", embedded_zero.as_bytes());

prints

foobar
[102, 111, 111, 0, 98, 97, 114]

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

std::ffi::CStr and the c"string" syntax don't don't require that the text be valid UTF8

For my understanding because my knowledge of these string details is pretty fuzzy - is there somewhere we're enforcing valid UTF-8 for CStr/CString upon construction? I see that we're checking for null bytes in validate_cstr, but don't understand how that's different from the constraints of a std::ffi::CStr. I do get a compiler error for something like:

let string = c"abc\0def";

So I'm wondering what would be valid input for c"..." but not cstr!("...").

(I also missed the difference in inclusion/exclusion of the terminating null byte, thanks for adding that to the docs!)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't need to explicitly validate UTF-8 for these types because they only accept &str / String, which are already required to be UTF-8 :). In the code where we're parsing things from a bytestream we've already got the validation code, and likewise any user constructing keys or regexes from strings will already have had to validate the data.

I added some examples to the rustdoc to make it more clear where the validation for bson::raw::CStr is more strict than either str or std::ffi::CStr - for the latter tl;dr is that out of range byte sequences like c"\xc3\x28" are perfectly valid but cstr!("\xc3\x28") won't compile because the string literal "\xc3\x28" itself is invalid.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

excellent, that all makes sense. thank you for the explanations and docs!

doc_buf.append(cstr!("number"), 12);
doc_buf.append(cstr!("bool"), false);
doc_buf.append(cstr!("nu"), RawBson::Null);

let mut array_buf = RawArrayBuf::new();
array_buf.push(1).unwrap();
array_buf.push("string").unwrap();
array_buf.push(1);
array_buf.push("string");

let mut bson_doc = RawDocumentBuf::new();
bson_doc.append("first", true).unwrap();
bson_doc.append("second", "string").unwrap();
bson_doc.append(cstr!("first"), true);
bson_doc.append(cstr!("second"), "string");

let expected = Foo {
doc_buf,
Expand Down
29 changes: 14 additions & 15 deletions serde-tests/test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use std::{
};

use bson::{
cstr,
doc,
oid::ObjectId,
spec::BinarySubtype,
Expand Down Expand Up @@ -835,8 +836,8 @@ fn raw_regex() {

let bytes = bson::serialize_to_vec(&doc! {
"r": Regex {
pattern: "a[b-c]d".to_string(),
options: "ab".to_string(),
pattern: cstr!("a[b-c]d").into(),
options: cstr!("ab").into(),
},
})
.expect("raw_regex");
Expand Down Expand Up @@ -927,8 +928,8 @@ impl AllTypes {
};
let date = DateTime::now();
let regex = Regex {
pattern: "hello".to_string(),
options: "x".to_string(),
pattern: cstr!("hello").into(),
options: cstr!("x").into(),
};
let timestamp = Timestamp {
time: 123,
Expand Down Expand Up @@ -1058,8 +1059,8 @@ fn all_raw_types_rmp() {
scope: doc! { "x": 1 },
},
"regex": Regex {
pattern: "pattern".to_string(),
options: "opt".to_string()
pattern: cstr!("pattern").into(),
options: cstr!("opt").into()
}
})
.unwrap();
Expand Down Expand Up @@ -1254,24 +1255,22 @@ fn owned_raw_types() {

let f = Foo {
subdoc: RawDocumentBuf::from_iter([
("a key", RawBson::String("a value".to_string())),
("an objectid", RawBson::ObjectId(oid)),
("a date", RawBson::DateTime(dt)),
(cstr!("a key"), RawBson::String("a value".to_string())),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Building from a key-value list is one place where the repeated cstr! is more of a hassle; I could potentially see providing a convenience try_from_iter method that accepts &str keys and returns a Result (i.e. like this used to be).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense, seems like something we can add if there's user demand for it

(cstr!("an objectid"), RawBson::ObjectId(oid)),
(cstr!("a date"), RawBson::DateTime(dt)),
(
"code_w_scope",
cstr!("code_w_scope"),
RawBson::JavaScriptCodeWithScope(raw_code_w_scope.clone()),
),
("decimal128", RawBson::Decimal128(d128)),
])
.unwrap(),
(cstr!("decimal128"), RawBson::Decimal128(d128)),
]),
array: RawArrayBuf::from_iter([
RawBson::String("a string".to_string()),
RawBson::ObjectId(oid),
RawBson::DateTime(dt),
RawBson::JavaScriptCodeWithScope(raw_code_w_scope),
RawBson::Decimal128(d128),
])
.unwrap(),
]),
};

let expected = doc! {
Expand Down
36 changes: 24 additions & 12 deletions src/bson.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ use std::{
use serde_json::{json, Value};

pub use crate::document::Document;
use crate::{base64, oid, spec::ElementType, Binary, Decimal128};
use crate::{base64, oid, raw::CString, spec::ElementType, Binary, Decimal128};

/// Possible BSON value types.
#[derive(Clone, Default, PartialEq)]
Expand Down Expand Up @@ -268,6 +268,12 @@ impl From<String> for Bson {
}
}

impl From<crate::raw::CString> for Bson {
fn from(a: crate::raw::CString) -> Bson {
Bson::String(a.into_string())
}
}

impl From<Document> for Bson {
fn from(a: Document) -> Bson {
Bson::Document(a)
Expand Down Expand Up @@ -480,14 +486,14 @@ impl Bson {
Bson::Boolean(v) => json!(v),
Bson::Null => Value::Null,
Bson::RegularExpression(Regex { pattern, options }) => {
let mut chars: Vec<_> = options.chars().collect();
let mut chars: Vec<_> = options.as_str().chars().collect();
chars.sort_unstable();

let options: String = chars.into_iter().collect();

json!({
"$regularExpression": {
"pattern": pattern,
"pattern": pattern.into_string(),
"options": options,
}
})
Expand Down Expand Up @@ -619,7 +625,7 @@ impl Bson {
ref pattern,
ref options,
}) => {
let mut chars: Vec<_> = options.chars().collect();
let mut chars: Vec<_> = options.as_str().chars().collect();
chars.sort_unstable();

let options: String = chars.into_iter().collect();
Expand Down Expand Up @@ -842,7 +848,9 @@ impl Bson {
if let Ok(regex) = doc.get_document("$regularExpression") {
if let Ok(pattern) = regex.get_str("pattern") {
if let Ok(options) = regex.get_str("options") {
return Bson::RegularExpression(Regex::new(pattern, options));
if let Ok(regex) = Regex::from_strings(pattern, options) {
return Bson::RegularExpression(regex);
}
}
}
}
Expand Down Expand Up @@ -1147,7 +1155,7 @@ impl Timestamp {
#[derive(Debug, Clone, Eq, PartialEq, Hash)]
pub struct Regex {
/// The regex pattern to match.
pub pattern: String,
pub pattern: CString,

/// The options for the regex.
///
Expand All @@ -1156,18 +1164,22 @@ pub struct Regex {
/// multiline matching, 'x' for verbose mode, 'l' to make \w, \W, etc. locale dependent,
/// 's' for dotall mode ('.' matches everything), and 'u' to make \w, \W, etc. match
/// unicode.
pub options: String,
pub options: CString,
}

impl Regex {
pub(crate) fn new(pattern: impl AsRef<str>, options: impl AsRef<str>) -> Self {
#[cfg(any(test, feature = "serde"))]
pub(crate) fn from_strings(
pattern: impl AsRef<str>,
options: impl AsRef<str>,
) -> crate::error::Result<Self> {
let mut chars: Vec<_> = options.as_ref().chars().collect();
chars.sort_unstable();
let options: String = chars.into_iter().collect();
Self {
pattern: pattern.as_ref().to_string(),
options,
}
Ok(Self {
pattern: pattern.as_ref().to_string().try_into()?,
options: options.try_into()?,
})
}
}

Expand Down
8 changes: 4 additions & 4 deletions src/de/raw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1306,15 +1306,15 @@ impl<'de> serde::de::Deserializer<'de> for &mut RegexAccess<'de> {
RegexDeserializationStage::Pattern => {
self.stage = RegexDeserializationStage::Options;
match &self.re {
BsonCow::Borrowed(re) => visitor.visit_borrowed_str(re.pattern),
BsonCow::Owned(re) => visitor.visit_str(&re.pattern),
BsonCow::Borrowed(re) => visitor.visit_borrowed_str(re.pattern.as_str()),
BsonCow::Owned(re) => visitor.visit_str(re.pattern.as_str()),
}
}
RegexDeserializationStage::Options => {
self.stage = RegexDeserializationStage::Done;
match &self.re {
BsonCow::Borrowed(re) => visitor.visit_borrowed_str(re.options),
BsonCow::Owned(re) => visitor.visit_str(&re.options),
BsonCow::Borrowed(re) => visitor.visit_borrowed_str(re.options.as_str()),
BsonCow::Owned(re) => visitor.visit_str(re.options.as_str()),
}
}
RegexDeserializationStage::Done => {
Expand Down
5 changes: 4 additions & 1 deletion src/de/serde.rs
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,10 @@ impl<'de> Visitor<'de> for BsonVisitor {

"$regularExpression" => {
let re = visitor.next_value::<extjson::models::RegexBody>()?;
return Ok(Bson::RegularExpression(Regex::new(re.pattern, re.options)));
return Ok(Bson::RegularExpression(
Regex::from_strings(re.pattern, re.options)
.map_err(serde::de::Error::custom)?,
));
}

"$dbPointer" => {
Expand Down
2 changes: 1 addition & 1 deletion src/extjson/de.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ impl TryFrom<serde_json::Map<String, serde_json::Value>> for Bson {

if obj.contains_key("$regularExpression") {
let regex: models::Regex = serde_json::from_value(obj.into())?;
return Ok(regex.parse().into());
return Ok(regex.parse()?.into());
}

if obj.contains_key("$numberInt") {
Expand Down
4 changes: 2 additions & 2 deletions src/extjson/models.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,8 @@ pub(crate) struct RegexBody {
}

impl Regex {
pub(crate) fn parse(self) -> crate::Regex {
crate::Regex::new(self.body.pattern, self.body.options)
pub(crate) fn parse(self) -> crate::error::Result<crate::Regex> {
crate::Regex::from_strings(self.body.pattern, self.body.options)
}
}

Expand Down
25 changes: 18 additions & 7 deletions src/macros.rs
Original file line number Diff line number Diff line change
Expand Up @@ -240,12 +240,12 @@ macro_rules! rawbson {

// Finished with trailing comma.
(@array [$($elems:expr,)*]) => {
$crate::RawArrayBuf::from_iter(vec![$($elems,)*]).expect("invalid bson value")
$crate::RawArrayBuf::from_iter(vec![$($elems,)*])
};

// Finished without trailing comma.
(@array [$($elems:expr),*]) => {
$crate::RawArrayBuf::from_iter(vec![$($elems),*]).expect("invalid bson value")
$crate::RawArrayBuf::from_iter(vec![$($elems),*])
};

// Next element is `null`.
Expand Down Expand Up @@ -291,15 +291,26 @@ macro_rules! rawbson {
// Finished.
(@object $object:ident () () ()) => {};

// Insert the current entry followed by trailing comma.
(@object $object:ident [$($key:tt)+] ($value:expr) , $($rest:tt)*) => {
$object.append(($($key)+), $value).expect("invalid bson value");
// Insert the current entry with followed by trailing comma, with a key literal.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tweaked the behavior of rawdoc! a bit here - now if the key is a literal it'll be implicitly wrapped in cstr! so it gets compile-time validated, otherwise it'll be assumed to be an expression that evaluates to a valid key and passed on to append (as before). The main difference here is that now this macro can no longer panic :)

(@object $object:ident [$key:literal] ($value:expr) , $($rest:tt)*) => {{
$object.append($crate::raw::cstr!($key), $value);
$crate::rawbson!(@object $object () ($($rest)*) ($($rest)*));
}};

// Insert the current entry with followed by trailing comma, with a key expression.
(@object $object:ident [$($key:tt)+] ($value:expr) , $($rest:tt)*) => {{
$object.append($($key)+, $value);
$crate::rawbson!(@object $object () ($($rest)*) ($($rest)*));
}};

// Insert the last entry without trailing comma, with a key literal.
(@object $object:ident [$key:literal] ($value:expr)) => {
$object.append($crate::raw::cstr!($key), $value);
};

// Insert the last entry without trailing comma.
// Insert the last entry without trailing comma, with a key expression.
(@object $object:ident [$($key:tt)+] ($value:expr)) => {
$object.append(($($key)+), $value).expect("invalid bson value");
$object.append($($key)+, $value);
};

// Next value is `null`.
Expand Down
14 changes: 2 additions & 12 deletions src/raw.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ mod array;
mod array_buf;
mod bson;
mod bson_ref;
mod cstr;
mod document;
mod document_buf;
mod iter;
Expand All @@ -142,6 +143,7 @@ pub use self::{
RawJavaScriptCodeWithScopeRef,
RawRegexRef,
},
cstr::{assert_valid_cstr, cstr, validate_cstr, CStr, CString, IsValidCStr},
document::RawDocument,
document_buf::{BindRawBsonRef, BindValue, RawDocumentBuf},
iter::{RawElement, RawIter},
Expand Down Expand Up @@ -316,15 +318,3 @@ pub(crate) fn write_string(buf: &mut Vec<u8>, s: &str) {
buf.extend(s.as_bytes());
buf.push(0);
}

pub(crate) fn write_cstring(buf: &mut Vec<u8>, s: &str) -> Result<()> {
if s.contains('\0') {
return Err(Error::malformed_bytes(format!(
"cstring with interior null: {:?}",
s
)));
}
buf.extend(s.as_bytes());
buf.push(0);
Ok(())
}
Loading