Remove regex dependency for faster runtime and compile times (#55)

willstott101 · web-flow · commit 5187edf627d7 · 2022-11-21T12:05:43.000+01:00
diff --git a/Cargo.toml b/Cargo.toml
@@ -27,8 +27,7 @@ url = "2.1.1"
 serde = { version = "1.0.104", features = ["derive"] }
 serde_json = "1.0.48"
 base64 = "0.13.0"
-regex = "1.3.4"
-lazy_static = "1.4.0"
+unicode-id  = "0.3"
 if_chain = "1.0.0"
 scroll = { version = "0.10.1", features = ["derive"], optional = true }
 
diff --git a/src/js_identifiers.rs b/src/js_identifiers.rs
@@ -0,0 +1,84 @@
+use unicode_id::UnicodeID;
+
+/// Reports whether `c` may begin a JavaScript identifier: `$`, `_`, an ASCII
+/// letter, or a non-ASCII character that `UnicodeID::is_id_start` accepts.
+fn is_valid_start(c: char) -> bool {
+    match c {
+        '$' | '_' => true,
+        // ASCII is decided here, so `UnicodeID` is only consulted for non-ASCII input.
+        _ if c.is_ascii() => c.is_ascii_alphabetic(),
+        _ => UnicodeID::is_id_start(c),
+    }
+}
+
+/// Reports whether `c` may appear in a JavaScript identifier after its first
+/// character.
+fn is_valid_continue(c: char) -> bool {
+    match c {
+        '$' | '_' => true,
+        // ECMA-262 admits the format-control characters U+200C (ZERO WIDTH NON-JOINER)
+        // and U+200D (ZERO WIDTH JOINER) in identifier parts; they are matched explicitly
+        // because UnicodeID does not consider them universally valid identifier characters.
+        '\u{200c}' | '\u{200d}' => true,
+        // ASCII is decided here, so `UnicodeID` is only consulted for non-ASCII input.
+        _ if c.is_ascii() => c.is_ascii_alphanumeric(),
+        _ => UnicodeID::is_id_continue(c),
+    }
+}
+
+/// Returns the longest prefix of `s` that forms a JavaScript identifier, or
+/// `None` when `s` is empty or its first character cannot start an identifier.
+///
+/// The prefix is cut on a character boundary, so identifiers containing
+/// multi-byte characters (e.g. `"é"` or `"aé"`) are returned whole instead of
+/// panicking on a slice that lands inside a UTF-8 sequence.
+fn strip_identifier(s: &str) -> Option<&str> {
+    let mut iter = s.char_indices();
+    // `end` is the exclusive byte offset just past the last accepted character;
+    // it starts after the (possibly multi-byte) first character.
+    let mut end = match iter.next() {
+        Some((_, c)) if is_valid_start(c) => c.len_utf8(),
+        _ => return None,
+    };
+    // Extend the prefix over every consecutive valid continuation character.
+    for (i, c) in iter {
+        if !is_valid_continue(c) {
+            break;
+        }
+        // `i` is where `c` begins; adding its encoded width gives where it ends.
+        end = i + c.len_utf8();
+    }
+    Some(&s[..end])
+}
+
+/// Returns true iff the whole of `s` is a single identifier; `""` is rejected.
+pub fn is_valid_javascript_identifier(s: &str) -> bool {
+    strip_identifier(s).map_or(false, |t| t.len() == s.len())
+}
+
+/// Extracts the identifier at the start of `source_line`, skipping leading
+/// whitespace.
+///
+/// Yields `None` when the line is empty or all whitespace, or when its first
+/// whitespace-delimited word does not begin with a valid identifier.
+pub fn get_javascript_token(source_line: &str) -> Option<&str> {
+    source_line.split_whitespace().next().and_then(strip_identifier)
+}
+
+#[test]
+fn test_is_valid_javascript_identifier() {
+    assert!(is_valid_javascript_identifier("foo_$123"));
+    assert!(!is_valid_javascript_identifier(" foo"));
+    assert!(!is_valid_javascript_identifier("foo "));
+    assert!(!is_valid_javascript_identifier("foo 123"));
+    assert!(!is_valid_javascript_identifier("[123]"));
+    assert!(!is_valid_javascript_identifier("foo.bar"));
+    // Member access is an expression, not a single identifier, so these are rejected.
+    assert!(!is_valid_javascript_identifier("foo [bar]"));
+    assert!(!is_valid_javascript_identifier("foo[bar]"));
+
+    assert_eq!(get_javascript_token("foo "), Some("foo"));
+    assert_eq!(get_javascript_token("f _hi"), Some("f"));
+    assert_eq!(get_javascript_token("foo.bar"), Some("foo"));
+    assert_eq!(get_javascript_token("[foo,bar]"), None);
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -69,6 +69,7 @@ mod detector;
 mod encoder;
 mod errors;
 mod hermes;
+mod js_identifiers;
 mod jsontypes;
 mod sourceview;
 mod types;
diff --git a/src/ram_bundle.rs b/src/ram_bundle.rs
@@ -1,5 +1,4 @@
 //! RAM bundle operations
-use regex::Regex;
 use scroll::Pread;
 use std::borrow::Cow;
 use std::collections::BTreeMap;
@@ -145,7 +144,8 @@ impl<'a> RamBundle<'a> {
     ///
     /// The provided path should point to a javascript file, that serves
     /// as an entry point (startup code) for the app. The modules are stored in js-modules/
-    /// directory, next to the entry point.
+    /// directory, next to the entry point. The js-modules/ directory must ONLY contain
+    /// files with integer names and the ".js" file suffix, along with the UNBUNDLE magic file.
     pub fn parse_unbundle_from_path(bundle_path: &Path) -> Result<Self> {
         Ok(RamBundle {
             repr: RamBundleImpl::Unbundle(UnbundleRamBundle::parse(bundle_path)?),
@@ -192,6 +192,16 @@ impl<'a> RamBundle<'a> {
     }
 }
 
+/// Parses the module index out of a `<digits>.js` file name. The name must be ASCII
+/// digits followed by `.js`; anything else (including the leading `+` that
+/// `usize::from_str` would tolerate) errors with `Error::InvalidRamBundleIndex`.
+fn js_filename_to_index_strict(filename: &str) -> Result<usize> {
+    filename
+        .strip_suffix(".js")
+        .filter(|stem| stem.bytes().all(|b| b.is_ascii_digit()))
+        .and_then(|stem| stem.parse::<usize>().ok())
+        .ok_or(Error::InvalidRamBundleIndex)
+}
 /// Represents a file RAM bundle
 ///
 /// This RAM bundle type is mostly used on Android.
@@ -217,7 +227,6 @@ impl UnbundleRamBundle {
         let mut max_module_id = 0;
         let mut modules: BTreeMap<usize, Vec<u8>> = Default::default();
 
-        let module_regex = Regex::new(r"^(\d+)\.js$").unwrap();
         let js_modules_dir = bundle_dir.join(JS_MODULES_DIR_NAME);
 
         for entry in js_modules_dir.read_dir()? {
@@ -229,15 +238,10 @@ impl UnbundleRamBundle {
             let path = entry.path();
             let filename_os = path.file_name().unwrap();
             let filename: &str = &filename_os.to_string_lossy();
-            let module_id = match module_regex.captures(filename) {
-                Some(captures) => {
-                    let module_string = captures.get(1).unwrap().as_str();
-                    module_string
-                        .parse::<usize>()
-                        .or(Err(Error::InvalidRamBundleIndex))?
-                }
-                None => continue,
-            };
+            if filename == "UNBUNDLE" {
+                continue;
+            }
+            let module_id = js_filename_to_index_strict(filename)?;
             if module_id > max_module_id {
                 max_module_id = module_id;
             }
diff --git a/src/sourceview.rs b/src/sourceview.rs
@@ -8,8 +8,8 @@ use if_chain::if_chain;
 
 use crate::detector::{locate_sourcemap_reference_slice, SourceMapRef};
 use crate::errors::Result;
+use crate::js_identifiers::{get_javascript_token, is_valid_javascript_identifier};
 use crate::types::{idx_from_token, sourcemap_from_token, Token};
-use crate::utils::{get_javascript_token, is_valid_javascript_identifier};
 
 /// An iterator that iterates over tokens in reverse.
 pub struct RevTokenIter<'view, 'viewbase, 'map>
diff --git a/src/utils.rs b/src/utils.rs
@@ -1,36 +1,6 @@
 use std::borrow::Cow;
 use std::iter::repeat;
 
-use lazy_static::lazy_static;
-use regex::Regex;
-
-lazy_static! {
-    static ref ANCHORED_IDENT_RE: Regex = Regex::new(
-        r#"(?x)
-            ^
-            \s*
-            ([\d\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}$_]
-            [\d\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}\p{Mn}\p{Mc}\p{Nd}\p{Pc}$_]*)
-        "#
-    )
-    .unwrap();
-}
-
-pub fn is_valid_javascript_identifier(s: &str) -> bool {
-    // check explicitly we do not have a dot in this identifier so that
-    // we do not match on foo.bar
-    s.trim() == s && !s.contains('.') && ANCHORED_IDENT_RE.is_match(s)
-}
-
-pub fn get_javascript_token(source_line: &str) -> Option<&str> {
-    if let Some(m) = ANCHORED_IDENT_RE.captures(source_line) {
-        let rng = m.get(1).unwrap();
-        Some(&source_line[rng.start()..rng.end()])
-    } else {
-        None
-    }
-}
-
 fn split_path(path: &str) -> Vec<&str> {
     let mut last_idx = 0;
     let mut rv = vec![];