From 931fec7d29803154499a64335aff748bb5917476 Mon Sep 17 00:00:00 2001 From: Will Stott Date: Thu, 17 Nov 2022 22:22:38 +0000 Subject: [PATCH 1/6] Ignore no files in an unbundled bundle, avoid regex for jsbundle code --- src/ram_bundle.rs | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/ram_bundle.rs b/src/ram_bundle.rs index 0d3e1aaf..234d4447 100644 --- a/src/ram_bundle.rs +++ b/src/ram_bundle.rs @@ -1,5 +1,4 @@ //! RAM bundle operations -use regex::Regex; use scroll::Pread; use std::borrow::Cow; use std::collections::BTreeMap; @@ -192,6 +191,16 @@ impl<'a> RamBundle<'a> { } } +/// Filename must be made of ascii-only digits and the .js extension +/// Anything else errors with `Error::InvalidRamBundleIndex` +fn js_filename_to_index_strict(filename: &str) -> Result { + match filename.rsplit_once(".js") { + Some((basename, "")) => basename + .parse::() + .or(Err(Error::InvalidRamBundleIndex)), + _ => Err(Error::InvalidRamBundleIndex), + } +} /// Represents a file RAM bundle /// /// This RAM bundle type is mostly used on Android. @@ -217,7 +226,6 @@ impl UnbundleRamBundle { let mut max_module_id = 0; let mut modules: BTreeMap> = Default::default(); - let module_regex = Regex::new(r"^(\d+)\.js$").unwrap(); let js_modules_dir = bundle_dir.join(JS_MODULES_DIR_NAME); for entry in js_modules_dir.read_dir()? { @@ -229,15 +237,10 @@ impl UnbundleRamBundle { let path = entry.path(); let filename_os = path.file_name().unwrap(); let filename: &str = &filename_os.to_string_lossy(); - let module_id = match module_regex.captures(filename) { - Some(captures) => { - let module_string = captures.get(1).unwrap().as_str(); - module_string - .parse::() - .or(Err(Error::InvalidRamBundleIndex))? - } - None => continue, - }; + if filename == "UNBUNDLE" { + continue; + } + let module_id = js_filename_to_index_strict(filename)?; if module_id > max_module_id { max_module_id = module_id; } From 52c2d2054c5dfb323b6f120f443e25a091674fdc Mon Sep 17 00:00:00 2001 From: Will Stott Date: Thu, 17 Nov 2022 21:56:06 +0000 Subject: [PATCH 2/6] Hand-roll JS identifier utils - based on swc's parser --- Cargo.toml | 1 + src/js_identifiers.rs | 121 ++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + src/sourceview.rs | 2 +- src/utils.rs | 30 ----------- 5 files changed, 124 insertions(+), 31 deletions(-) create mode 100644 src/js_identifiers.rs diff --git a/Cargo.toml b/Cargo.toml index 374f5d0b..c5d3ebec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ serde_json = "1.0.48" base64 = "0.13.0" regex = "1.3.4" lazy_static = "1.4.0" +unicode-id = "0.3" if_chain = "1.0.0" scroll = { version = "0.10.1", features = ["derive"], optional = true } diff --git a/src/js_identifiers.rs b/src/js_identifiers.rs new file mode 100644 index 00000000..e125b2dd --- /dev/null +++ b/src/js_identifiers.rs @@ -0,0 +1,121 @@ +use lazy_static::lazy_static; +use regex::Regex; +use unicode_id::UnicodeID; + +lazy_static! { + static ref ANCHORED_IDENT_RE: Regex = Regex::new( + r#"(?x) + ^ + \s* + ([\d\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}$_] + [\d\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}\p{Mn}\p{Mc}\p{Nd}\p{Pc}$_]*) + "# + ) + .unwrap(); +} + +fn is_valid_javascript_identifier_original(s: &str) -> bool { + // check explicitly we do not have a dot in this identifier so that + // we do not match on foo.bar + s.trim() == s && !s.contains('.') && ANCHORED_IDENT_RE.is_match(s) +} + +fn get_javascript_token_original(source_line: &str) -> Option<&str> { + if let Some(m) = ANCHORED_IDENT_RE.captures(source_line) { + let rng = m.get(1).unwrap(); + Some(&source_line[rng.start()..rng.end()]) + } else { + None + } +} + +/// Returns true if `c` is a valid character for an identifier start. +fn is_valid_start(c: char) -> bool { + c == '$' || c == '_' || c.is_ascii_alphabetic() || { + if c.is_ascii() { + false + } else { + UnicodeID::is_id_start(c) + } + } +} + +/// Returns true if `c` is a valid character for an identifier part after +/// start. +fn is_valid_continue(c: char) -> bool { + c == '$' || c == '_' || c == '\u{200c}' || c == '\u{200d}' || c.is_ascii_alphanumeric() || { + if c.is_ascii() { + false + } else { + UnicodeID::is_id_continue(c) + } + } +} + +fn strip_to_js_token(s: &str) -> Option<&str> { + let mut iter = s.char_indices(); + // Is the first character a valid starting character + match iter.next() { + Some((_, c)) => { + if !is_valid_start(c) { + return None; + } + } + None => { + return None; + } + }; + // Slice up to the last valid continuation character + let mut end_idx = 0; + for (i, c) in iter { + if is_valid_continue(c) { + end_idx = i; + } else { + break; + } + } + return Some(&s[..=end_idx]); +} + +pub fn is_valid_javascript_identifier(s: &str) -> bool { + // check explicitly we do not have a dot in this identifier so that + // we do not match on foo.bar + !s.contains('.') && strip_to_js_token(s).map_or(0, |t| t.len()) == s.len() +} + +pub fn get_javascript_token(source_line: &str) -> Option<&str> { + match source_line.split_whitespace().next() { + Some(s) => strip_to_js_token(s), + None => None, + } +} + +#[test] +fn test_is_valid_javascript_identifier() { + assert_eq!(is_valid_javascript_identifier_original("foo 123"), true); + // assert_eq!(is_valid_javascript_identifier("foo 123"), true); + assert_eq!(is_valid_javascript_identifier_original("foo_$123"), true); + assert_eq!(is_valid_javascript_identifier("foo_$123"), true); + assert_eq!(is_valid_javascript_identifier_original(" foo"), false); + assert_eq!(is_valid_javascript_identifier(" foo"), false); + assert_eq!(is_valid_javascript_identifier_original("foo "), false); + assert_eq!(is_valid_javascript_identifier("foo "), false); + assert_eq!(is_valid_javascript_identifier_original("[123]"), false); + assert_eq!(is_valid_javascript_identifier("[123]"), false); + assert_eq!(is_valid_javascript_identifier_original("foo.bar"), false); + assert_eq!(is_valid_javascript_identifier("foo.bar"), false); + // Should these pass? + assert_eq!(is_valid_javascript_identifier_original("foo [bar]"), true); + // assert_eq!(is_valid_javascript_identifier("foo [bar]"), true); + assert_eq!(is_valid_javascript_identifier_original("foo[bar]"), true); + // assert_eq!(is_valid_javascript_identifier("foo[bar]"), true); + + assert_eq!(get_javascript_token_original("foo "), Some("foo")); + assert_eq!(get_javascript_token("foo "), Some("foo")); + assert_eq!(get_javascript_token_original("f _hi"), Some("f")); + assert_eq!(get_javascript_token("f _hi"), Some("f")); + assert_eq!(get_javascript_token_original("foo.bar"), Some("foo")); + assert_eq!(get_javascript_token("foo.bar"), Some("foo")); + assert_eq!(get_javascript_token_original("[foo,bar]"), None); + assert_eq!(get_javascript_token("[foo,bar]"), None); +} diff --git a/src/lib.rs b/src/lib.rs index 03f5521c..ab37229c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -69,6 +69,7 @@ mod detector; mod encoder; mod errors; mod hermes; +mod js_identifiers; mod jsontypes; mod sourceview; mod types; diff --git a/src/sourceview.rs b/src/sourceview.rs index d18c50b0..946c0d92 100644 --- a/src/sourceview.rs +++ b/src/sourceview.rs @@ -8,8 +8,8 @@ use if_chain::if_chain; use crate::detector::{locate_sourcemap_reference_slice, SourceMapRef}; use crate::errors::Result; +use crate::js_identifiers::{get_javascript_token, is_valid_javascript_identifier}; use crate::types::{idx_from_token, sourcemap_from_token, Token}; -use crate::utils::{get_javascript_token, is_valid_javascript_identifier}; /// An iterator that iterates over tokens in reverse. pub struct RevTokenIter<'view, 'viewbase, 'map> diff --git a/src/utils.rs b/src/utils.rs index 100ff7df..8bd74327 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,36 +1,6 @@ use std::borrow::Cow; use std::iter::repeat; -use lazy_static::lazy_static; -use regex::Regex; - -lazy_static! { - static ref ANCHORED_IDENT_RE: Regex = Regex::new( - r#"(?x) - ^ - \s* - ([\d\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}$_] - [\d\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}\p{Mn}\p{Mc}\p{Nd}\p{Pc}$_]*) - "# - ) - .unwrap(); -} - -pub fn is_valid_javascript_identifier(s: &str) -> bool { - // check explicitly we do not have a dot in this identifier so that - // we do not match on foo.bar - s.trim() == s && !s.contains('.') && ANCHORED_IDENT_RE.is_match(s) -} - -pub fn get_javascript_token(source_line: &str) -> Option<&str> { - if let Some(m) = ANCHORED_IDENT_RE.captures(source_line) { - let rng = m.get(1).unwrap(); - Some(&source_line[rng.start()..rng.end()]) - } else { - None - } -} - fn split_path(path: &str) -> Vec<&str> { let mut last_idx = 0; let mut rv = vec![]; From 860193c316d51578ae2d043f74ee0bc7c74903e1 Mon Sep 17 00:00:00 2001 From: Will Stott Date: Thu, 17 Nov 2022 22:33:04 +0000 Subject: [PATCH 3/6] Remove regex and lazy_static dependencies --- Cargo.toml | 2 -- src/js_identifiers.rs | 41 ----------------------------------------- 2 files changed, 43 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c5d3ebec..1b289d1d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,8 +27,6 @@ url = "2.1.1" serde = { version = "1.0.104", features = ["derive"] } serde_json = "1.0.48" base64 = "0.13.0" -regex = "1.3.4" -lazy_static = "1.4.0" unicode-id = "0.3" if_chain = "1.0.0" scroll = { version = "0.10.1", features = ["derive"], optional = true } diff --git a/src/js_identifiers.rs b/src/js_identifiers.rs index e125b2dd..21987c1f 100644 --- a/src/js_identifiers.rs +++ b/src/js_identifiers.rs @@ -1,34 +1,5 @@ -use lazy_static::lazy_static; -use regex::Regex; use unicode_id::UnicodeID; -lazy_static! { - static ref ANCHORED_IDENT_RE: Regex = Regex::new( - r#"(?x) - ^ - \s* - ([\d\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}$_] - [\d\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}\p{Mn}\p{Mc}\p{Nd}\p{Pc}$_]*) - "# - ) - .unwrap(); -} - -fn is_valid_javascript_identifier_original(s: &str) -> bool { - // check explicitly we do not have a dot in this identifier so that - // we do not match on foo.bar - s.trim() == s && !s.contains('.') && ANCHORED_IDENT_RE.is_match(s) -} - -fn get_javascript_token_original(source_line: &str) -> Option<&str> { - if let Some(m) = ANCHORED_IDENT_RE.captures(source_line) { - let rng = m.get(1).unwrap(); - Some(&source_line[rng.start()..rng.end()]) - } else { - None - } -} - /// Returns true if `c` is a valid character for an identifier start. fn is_valid_start(c: char) -> bool { c == '$' || c == '_' || c.is_ascii_alphabetic() || { @@ -92,30 +63,18 @@ pub fn get_javascript_token(source_line: &str) -> Option<&str> { #[test] fn test_is_valid_javascript_identifier() { - assert_eq!(is_valid_javascript_identifier_original("foo 123"), true); // assert_eq!(is_valid_javascript_identifier("foo 123"), true); - assert_eq!(is_valid_javascript_identifier_original("foo_$123"), true); assert_eq!(is_valid_javascript_identifier("foo_$123"), true); - assert_eq!(is_valid_javascript_identifier_original(" foo"), false); assert_eq!(is_valid_javascript_identifier(" foo"), false); - assert_eq!(is_valid_javascript_identifier_original("foo "), false); assert_eq!(is_valid_javascript_identifier("foo "), false); - assert_eq!(is_valid_javascript_identifier_original("[123]"), false); assert_eq!(is_valid_javascript_identifier("[123]"), false); - assert_eq!(is_valid_javascript_identifier_original("foo.bar"), false); assert_eq!(is_valid_javascript_identifier("foo.bar"), false); // Should these pass? - assert_eq!(is_valid_javascript_identifier_original("foo [bar]"), true); // assert_eq!(is_valid_javascript_identifier("foo [bar]"), true); - assert_eq!(is_valid_javascript_identifier_original("foo[bar]"), true); // assert_eq!(is_valid_javascript_identifier("foo[bar]"), true); - assert_eq!(get_javascript_token_original("foo "), Some("foo")); assert_eq!(get_javascript_token("foo "), Some("foo")); - assert_eq!(get_javascript_token_original("f _hi"), Some("f")); assert_eq!(get_javascript_token("f _hi"), Some("f")); - assert_eq!(get_javascript_token_original("foo.bar"), Some("foo")); assert_eq!(get_javascript_token("foo.bar"), Some("foo")); - assert_eq!(get_javascript_token_original("[foo,bar]"), None); assert_eq!(get_javascript_token("[foo,bar]"), None); } From 758e1db17b3663a81cbf222d9666d388a14f0fc2 Mon Sep 17 00:00:00 2001 From: Will Stott Date: Thu, 17 Nov 2022 23:26:53 +0000 Subject: [PATCH 4/6] A little tidying up --- src/js_identifiers.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/js_identifiers.rs b/src/js_identifiers.rs index 21987c1f..5e6b78c9 100644 --- a/src/js_identifiers.rs +++ b/src/js_identifiers.rs @@ -23,7 +23,7 @@ fn is_valid_continue(c: char) -> bool { } } -fn strip_to_js_token(s: &str) -> Option<&str> { +fn strip_identifier(s: &str) -> Option<&str> { let mut iter = s.char_indices(); // Is the first character a valid starting character match iter.next() { @@ -49,14 +49,13 @@ fn strip_to_js_token(s: &str) -> Option<&str> { } pub fn is_valid_javascript_identifier(s: &str) -> bool { - // check explicitly we do not have a dot in this identifier so that - // we do not match on foo.bar - !s.contains('.') && strip_to_js_token(s).map_or(0, |t| t.len()) == s.len() + // check stripping does not reduce the length of the token + strip_identifier(s).map_or(0, |t| t.len()) == s.len() } pub fn get_javascript_token(source_line: &str) -> Option<&str> { match source_line.split_whitespace().next() { - Some(s) => strip_to_js_token(s), + Some(s) => strip_identifier(s), None => None, } } From ab4fefcd1706b9de021a8c44f51c691d7897635c Mon Sep 17 00:00:00 2001 From: Will Stott Date: Thu, 17 Nov 2022 23:31:52 +0000 Subject: [PATCH 5/6] Clarify changes to the un-bundling code --- src/ram_bundle.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ram_bundle.rs b/src/ram_bundle.rs index 234d4447..115e1e64 100644 --- a/src/ram_bundle.rs +++ b/src/ram_bundle.rs @@ -144,7 +144,8 @@ impl<'a> RamBundle<'a> { /// /// The provided path should point to a javascript file, that serves /// as an entry point (startup code) for the app. The modules are stored in js-modules/ - /// directory, next to the entry point. + /// directory, next to the entry point. The js-modules/ directory must ONLY contain + /// files with integer names and the ".js" file suffix, along with the UNBUNDLE magic file. pub fn parse_unbundle_from_path(bundle_path: &Path) -> Result { Ok(RamBundle { repr: RamBundleImpl::Unbundle(UnbundleRamBundle::parse(bundle_path)?), From dd78ef9fddbbe62b0c5bc8ebb65587674e04d70d Mon Sep 17 00:00:00 2001 From: Will Stott Date: Sat, 19 Nov 2022 16:42:24 +0000 Subject: [PATCH 6/6] Address comments from Clippy and Humans --- src/js_identifiers.rs | 27 ++++++++++++++++----------- src/ram_bundle.rs | 6 +++--- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/js_identifiers.rs b/src/js_identifiers.rs index 5e6b78c9..93caeeef 100644 --- a/src/js_identifiers.rs +++ b/src/js_identifiers.rs @@ -11,9 +11,12 @@ fn is_valid_start(c: char) -> bool { } } -/// Returns true if `c` is a valid character for an identifier part after -/// start. +/// Returns true if `c` is a valid character for an identifier part after start. fn is_valid_continue(c: char) -> bool { + // As specified by the ECMA-262 spec, U+200C (ZERO WIDTH NON-JOINER) and U+200D + // (ZERO WIDTH JOINER) are format-control characters that are used to make necessary + // distinctions when forming words or phrases in certain languages. They are however + // not considered by UnicodeID to be universally valid identifier characters. c == '$' || c == '_' || c == '\u{200c}' || c == '\u{200d}' || c.is_ascii_alphanumeric() || { if c.is_ascii() { false @@ -45,7 +48,7 @@ fn strip_identifier(s: &str) -> Option<&str> { break; } } - return Some(&s[..=end_idx]); + Some(&s[..=end_idx]) } pub fn is_valid_javascript_identifier(s: &str) -> bool { @@ -53,6 +56,8 @@ pub fn is_valid_javascript_identifier(s: &str) -> bool { strip_identifier(s).map_or(0, |t| t.len()) == s.len() } +/// Finds the first valid identifier in the JS Source string given, provided +/// the string begins with the identifier or whitespace. pub fn get_javascript_token(source_line: &str) -> Option<&str> { match source_line.split_whitespace().next() { Some(s) => strip_identifier(s), @@ -62,15 +67,15 @@ pub fn get_javascript_token(source_line: &str) -> Option<&str> { #[test] fn test_is_valid_javascript_identifier() { - // assert_eq!(is_valid_javascript_identifier("foo 123"), true); - assert_eq!(is_valid_javascript_identifier("foo_$123"), true); - assert_eq!(is_valid_javascript_identifier(" foo"), false); - assert_eq!(is_valid_javascript_identifier("foo "), false); - assert_eq!(is_valid_javascript_identifier("[123]"), false); - assert_eq!(is_valid_javascript_identifier("foo.bar"), false); + // assert_eq!(is_valid_javascript_identifier("foo 123")); + assert!(is_valid_javascript_identifier("foo_$123")); + assert!(!is_valid_javascript_identifier(" foo")); + assert!(!is_valid_javascript_identifier("foo ")); + assert!(!is_valid_javascript_identifier("[123]")); + assert!(!is_valid_javascript_identifier("foo.bar")); // Should these pass? - // assert_eq!(is_valid_javascript_identifier("foo [bar]"), true); - // assert_eq!(is_valid_javascript_identifier("foo[bar]"), true); + // assert!(is_valid_javascript_identifier("foo [bar]")); + // assert!(is_valid_javascript_identifier("foo[bar]")); assert_eq!(get_javascript_token("foo "), Some("foo")); assert_eq!(get_javascript_token("f _hi"), Some("f")); diff --git a/src/ram_bundle.rs b/src/ram_bundle.rs index 115e1e64..23f44017 100644 --- a/src/ram_bundle.rs +++ b/src/ram_bundle.rs @@ -195,11 +195,11 @@ impl<'a> RamBundle<'a> { /// Filename must be made of ascii-only digits and the .js extension /// Anything else errors with `Error::InvalidRamBundleIndex` fn js_filename_to_index_strict(filename: &str) -> Result { - match filename.rsplit_once(".js") { - Some((basename, "")) => basename + match filename.strip_suffix(".js") { + Some(basename) => basename .parse::() .or(Err(Error::InvalidRamBundleIndex)), - _ => Err(Error::InvalidRamBundleIndex), + None => Err(Error::InvalidRamBundleIndex), } } /// Represents a file RAM bundle