|
| 1 | +use unicode_id::UnicodeID; |
| 2 | + |
| 3 | +/// Returns true if `c` is a valid character for an identifier start. |
| 4 | +fn is_valid_start(c: char) -> bool { |
| 5 | + c == '$' || c == '_' || c.is_ascii_alphabetic() || { |
| 6 | + if c.is_ascii() { |
| 7 | + false |
| 8 | + } else { |
| 9 | + UnicodeID::is_id_start(c) |
| 10 | + } |
| 11 | + } |
| 12 | +} |
| 13 | + |
| 14 | +/// Returns true if `c` is a valid character for an identifier part after start. |
| 15 | +fn is_valid_continue(c: char) -> bool { |
| 16 | + // As specified by the ECMA-262 spec, U+200C (ZERO WIDTH NON-JOINER) and U+200D |
| 17 | + // (ZERO WIDTH JOINER) are format-control characters that are used to make necessary |
| 18 | + // distinctions when forming words or phrases in certain languages. They are however |
| 19 | + // not considered by UnicodeID to be universally valid identifier characters. |
| 20 | + c == '$' || c == '_' || c == '\u{200c}' || c == '\u{200d}' || c.is_ascii_alphanumeric() || { |
| 21 | + if c.is_ascii() { |
| 22 | + false |
| 23 | + } else { |
| 24 | + UnicodeID::is_id_continue(c) |
| 25 | + } |
| 26 | + } |
| 27 | +} |
| 28 | + |
| 29 | +fn strip_identifier(s: &str) -> Option<&str> { |
| 30 | + let mut iter = s.char_indices(); |
| 31 | + // Is the first character a valid starting character |
| 32 | + match iter.next() { |
| 33 | + Some((_, c)) => { |
| 34 | + if !is_valid_start(c) { |
| 35 | + return None; |
| 36 | + } |
| 37 | + } |
| 38 | + None => { |
| 39 | + return None; |
| 40 | + } |
| 41 | + }; |
| 42 | + // Slice up to the last valid continuation character |
| 43 | + let mut end_idx = 0; |
| 44 | + for (i, c) in iter { |
| 45 | + if is_valid_continue(c) { |
| 46 | + end_idx = i; |
| 47 | + } else { |
| 48 | + break; |
| 49 | + } |
| 50 | + } |
| 51 | + Some(&s[..=end_idx]) |
| 52 | +} |
| 53 | + |
| 54 | +pub fn is_valid_javascript_identifier(s: &str) -> bool { |
| 55 | + // check stripping does not reduce the length of the token |
| 56 | + strip_identifier(s).map_or(0, |t| t.len()) == s.len() |
| 57 | +} |
| 58 | + |
| 59 | +/// Finds the first valid identifier in the JS Source string given, provided |
| 60 | +/// the string begins with the identifier or whitespace. |
| 61 | +pub fn get_javascript_token(source_line: &str) -> Option<&str> { |
| 62 | + match source_line.split_whitespace().next() { |
| 63 | + Some(s) => strip_identifier(s), |
| 64 | + None => None, |
| 65 | + } |
| 66 | +} |
| 67 | + |
#[test]
fn test_is_valid_javascript_identifier() {
    // Whole-string validation: one accepted sample, then inputs that must be
    // rejected (leading/trailing space, brackets, member access).
    assert!(is_valid_javascript_identifier("foo_$123"));
    let rejected = [" foo", "foo ", "[123]", "foo.bar"];
    for &input in rejected.iter() {
        assert!(!is_valid_javascript_identifier(input));
    }
    // Open question kept from the original author: should "foo 123",
    // "foo [bar]" and "foo[bar]" be treated as valid? They are deliberately
    // left unasserted here.

    // Token extraction: (source line, expected leading identifier).
    let token_cases = [
        ("foo ", Some("foo")),
        ("f _hi", Some("f")),
        ("foo.bar", Some("foo")),
        ("[foo,bar]", None),
    ];
    for &(line, expected) in token_cases.iter() {
        assert_eq!(get_javascript_token(line), expected);
    }
}
0 commit comments