Skip to content

Commit 5187edf

Browse files
authored
Remove regex dependency for faster runtime, and compile (#55)
1 parent a12a209 commit 5187edf

File tree

6 files changed

+103
-45
lines changed

6 files changed

+103
-45
lines changed

Cargo.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,7 @@ url = "2.1.1"
2727
serde = { version = "1.0.104", features = ["derive"] }
2828
serde_json = "1.0.48"
2929
base64 = "0.13.0"
30-
regex = "1.3.4"
31-
lazy_static = "1.4.0"
30+
unicode-id = "0.3"
3231
if_chain = "1.0.0"
3332
scroll = { version = "0.10.1", features = ["derive"], optional = true }
3433

src/js_identifiers.rs

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
use unicode_id::UnicodeID;
2+
3+
/// Returns true if `c` is a valid character for an identifier start.
4+
fn is_valid_start(c: char) -> bool {
5+
c == '$' || c == '_' || c.is_ascii_alphabetic() || {
6+
if c.is_ascii() {
7+
false
8+
} else {
9+
UnicodeID::is_id_start(c)
10+
}
11+
}
12+
}
13+
14+
/// Returns true if `c` is a valid character for an identifier part after start.
15+
fn is_valid_continue(c: char) -> bool {
16+
// As specified by the ECMA-262 spec, U+200C (ZERO WIDTH NON-JOINER) and U+200D
17+
// (ZERO WIDTH JOINER) are format-control characters that are used to make necessary
18+
// distinctions when forming words or phrases in certain languages. They are however
19+
// not considered by UnicodeID to be universally valid identifier characters.
20+
c == '$' || c == '_' || c == '\u{200c}' || c == '\u{200d}' || c.is_ascii_alphanumeric() || {
21+
if c.is_ascii() {
22+
false
23+
} else {
24+
UnicodeID::is_id_continue(c)
25+
}
26+
}
27+
}
28+
29+
fn strip_identifier(s: &str) -> Option<&str> {
30+
let mut iter = s.char_indices();
31+
// Is the first character a valid starting character
32+
match iter.next() {
33+
Some((_, c)) => {
34+
if !is_valid_start(c) {
35+
return None;
36+
}
37+
}
38+
None => {
39+
return None;
40+
}
41+
};
42+
// Slice up to the last valid continuation character
43+
let mut end_idx = 0;
44+
for (i, c) in iter {
45+
if is_valid_continue(c) {
46+
end_idx = i;
47+
} else {
48+
break;
49+
}
50+
}
51+
Some(&s[..=end_idx])
52+
}
53+
54+
pub fn is_valid_javascript_identifier(s: &str) -> bool {
55+
// check stripping does not reduce the length of the token
56+
strip_identifier(s).map_or(0, |t| t.len()) == s.len()
57+
}
58+
59+
/// Finds the first valid identifier in the JS Source string given, provided
60+
/// the string begins with the identifier or whitespace.
61+
pub fn get_javascript_token(source_line: &str) -> Option<&str> {
62+
match source_line.split_whitespace().next() {
63+
Some(s) => strip_identifier(s),
64+
None => None,
65+
}
66+
}
67+
68+
#[test]
fn test_is_valid_javascript_identifier() {
    // Whole-string validation: the input must be exactly one identifier.
    let accepted = ["foo_$123"];
    for candidate in accepted.iter() {
        assert!(is_valid_javascript_identifier(candidate));
    }

    // Surrounding whitespace, brackets and member access are all rejected.
    let rejected = [" foo", "foo ", "[123]", "foo.bar"];
    for candidate in rejected.iter() {
        assert!(!is_valid_javascript_identifier(candidate));
    }

    // Open question: should "foo [bar]" and "foo[bar]" be accepted?
    // They are not asserted here, and neither is "foo 123".

    // Token extraction: the leading identifier of the first word, if any.
    let extractions = [
        ("foo ", Some("foo")),
        ("f _hi", Some("f")),
        ("foo.bar", Some("foo")),
        ("[foo,bar]", None),
    ];
    for (line, expected) in extractions.iter() {
        assert_eq!(get_javascript_token(line), *expected);
    }
}

src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ mod detector;
6969
mod encoder;
7070
mod errors;
7171
mod hermes;
72+
mod js_identifiers;
7273
mod jsontypes;
7374
mod sourceview;
7475
mod types;

src/ram_bundle.rs

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
//! RAM bundle operations
2-
use regex::Regex;
32
use scroll::Pread;
43
use std::borrow::Cow;
54
use std::collections::BTreeMap;
@@ -145,7 +144,8 @@ impl<'a> RamBundle<'a> {
145144
///
146145
/// The provided path should point to a javascript file, that serves
147146
/// as an entry point (startup code) for the app. The modules are stored in js-modules/
148-
/// directory, next to the entry point.
147+
/// directory, next to the entry point. The js-modules/ directory must ONLY contain
148+
/// files with integer names and the ".js" file suffix, along with the UNBUNDLE magic file.
149149
pub fn parse_unbundle_from_path(bundle_path: &Path) -> Result<Self> {
150150
Ok(RamBundle {
151151
repr: RamBundleImpl::Unbundle(UnbundleRamBundle::parse(bundle_path)?),
@@ -192,6 +192,16 @@ impl<'a> RamBundle<'a> {
192192
}
193193
}
194194

195+
/// Filename must be made of ascii-only digits and the .js extension
196+
/// Anything else errors with `Error::InvalidRamBundleIndex`
197+
fn js_filename_to_index_strict(filename: &str) -> Result<usize> {
198+
match filename.strip_suffix(".js") {
199+
Some(basename) => basename
200+
.parse::<usize>()
201+
.or(Err(Error::InvalidRamBundleIndex)),
202+
None => Err(Error::InvalidRamBundleIndex),
203+
}
204+
}
195205
/// Represents a file RAM bundle
196206
///
197207
/// This RAM bundle type is mostly used on Android.
@@ -217,7 +227,6 @@ impl UnbundleRamBundle {
217227
let mut max_module_id = 0;
218228
let mut modules: BTreeMap<usize, Vec<u8>> = Default::default();
219229

220-
let module_regex = Regex::new(r"^(\d+)\.js$").unwrap();
221230
let js_modules_dir = bundle_dir.join(JS_MODULES_DIR_NAME);
222231

223232
for entry in js_modules_dir.read_dir()? {
@@ -229,15 +238,10 @@ impl UnbundleRamBundle {
229238
let path = entry.path();
230239
let filename_os = path.file_name().unwrap();
231240
let filename: &str = &filename_os.to_string_lossy();
232-
let module_id = match module_regex.captures(filename) {
233-
Some(captures) => {
234-
let module_string = captures.get(1).unwrap().as_str();
235-
module_string
236-
.parse::<usize>()
237-
.or(Err(Error::InvalidRamBundleIndex))?
238-
}
239-
None => continue,
240-
};
241+
if filename == "UNBUNDLE" {
242+
continue;
243+
}
244+
let module_id = js_filename_to_index_strict(filename)?;
241245
if module_id > max_module_id {
242246
max_module_id = module_id;
243247
}

src/sourceview.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ use if_chain::if_chain;
88

99
use crate::detector::{locate_sourcemap_reference_slice, SourceMapRef};
1010
use crate::errors::Result;
11+
use crate::js_identifiers::{get_javascript_token, is_valid_javascript_identifier};
1112
use crate::types::{idx_from_token, sourcemap_from_token, Token};
12-
use crate::utils::{get_javascript_token, is_valid_javascript_identifier};
1313

1414
/// An iterator that iterates over tokens in reverse.
1515
pub struct RevTokenIter<'view, 'viewbase, 'map>

src/utils.rs

Lines changed: 0 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,6 @@
11
use std::borrow::Cow;
22
use std::iter::repeat;
33

4-
use lazy_static::lazy_static;
5-
use regex::Regex;
6-
7-
lazy_static! {
8-
static ref ANCHORED_IDENT_RE: Regex = Regex::new(
9-
r#"(?x)
10-
^
11-
\s*
12-
([\d\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}$_]
13-
[\d\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}\p{Mn}\p{Mc}\p{Nd}\p{Pc}$_]*)
14-
"#
15-
)
16-
.unwrap();
17-
}
18-
19-
pub fn is_valid_javascript_identifier(s: &str) -> bool {
20-
// check explicitly we do not have a dot in this identifier so that
21-
// we do not match on foo.bar
22-
s.trim() == s && !s.contains('.') && ANCHORED_IDENT_RE.is_match(s)
23-
}
24-
25-
pub fn get_javascript_token(source_line: &str) -> Option<&str> {
26-
if let Some(m) = ANCHORED_IDENT_RE.captures(source_line) {
27-
let rng = m.get(1).unwrap();
28-
Some(&source_line[rng.start()..rng.end()])
29-
} else {
30-
None
31-
}
32-
}
33-
344
fn split_path(path: &str) -> Vec<&str> {
355
let mut last_idx = 0;
366
let mut rv = vec![];

0 commit comments

Comments
 (0)