From e76e74cb55e8bf637238b951e1cd5afe1d4dfc6f Mon Sep 17 00:00:00 2001 From: Pavel Irzhauski Date: Thu, 12 Jun 2025 18:36:22 +0000 Subject: [PATCH 1/4] optimize tokenizer using NEON for aarch64 similarly to using SSE2 for x86/x86_64 --- html5ever/src/tokenizer/mod.rs | 203 +++++++++++++++++++++++++-------- 1 file changed, 153 insertions(+), 50 deletions(-) diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs index 471e2e7c..98d2b61e 100644 --- a/html5ever/src/tokenizer/mod.rs +++ b/html5ever/src/tokenizer/mod.rs @@ -706,11 +706,11 @@ impl Tokenizer { states::Data => loop { let set = small_char_set!('\r' '\0' '&' '<' '\n'); - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))] let set_result = if !(self.opts.exact_errors || self.reconsume.get() || self.ignore_lf.get()) - && is_x86_feature_detected!("sse2") + && Self::is_supported_simd_feature_detected() { let front_buffer = input.peek_front_chunk_mut(); let Some(mut front_buffer) = front_buffer else { @@ -729,8 +729,8 @@ impl Tokenizer { self.pop_except_from(input, set) } else { // SAFETY: - // This CPU is guaranteed to support SSE2 due to the is_x86_feature_detected check above - let result = unsafe { self.data_state_sse2_fast_path(&mut front_buffer) }; + // This CPU is guaranteed to support SIMD due to the is_supported_simd_feature_detected check above + let result = unsafe { self.data_state_simd_fast_path(&mut front_buffer) }; if front_buffer.is_empty() { drop(front_buffer); @@ -743,7 +743,7 @@ impl Tokenizer { self.pop_except_from(input, set) }; - #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))] let set_result = self.pop_except_from(input, set); let Some(set_result) = set_result else { @@ -1885,18 +1885,82 @@ impl Tokenizer { } } - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - 
#[target_feature(enable = "sse2")] + /// Checks for supported SIMD feature, which is now either SSE2 for x86/x86_64 or NEON for aarch64 + fn is_supported_simd_feature_detected() -> bool { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { is_x86_feature_detected!("sse2") } + + #[cfg(target_arch = "aarch64")] + { std::arch::is_aarch64_feature_detected!("neon") } + + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))] + false + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"))] /// Implements the [data state] with SIMD instructions. + /// Calls SSE2- or NEON-specific function for chunks and processes any remaining bytes. /// /// The algorithm implemented is the naive SIMD approach described [here]. /// /// ### SAFETY: - /// Calling this function on a CPU that does not support SSE2 causes undefined behaviour. + /// Calling this function on a CPU that supports neither SSE2 nor NEON causes undefined behaviour. 
/// /// [data state]: https://html.spec.whatwg.org/#data-state /// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/ - unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril) -> Option { + unsafe fn data_state_simd_fast_path(&self, input: &mut StrTendril) -> Option { + let mut i = 0; + let mut n_newlines = 0; + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + self.data_state_sse2_fast_path(input, &mut i, &mut n_newlines); + + #[cfg(target_arch = "aarch64")] + self.data_state_neon_fast_path(input, &mut i, &mut n_newlines); + + // Process any remaining bytes (less than STRIDE) + while let Some(c) = input.as_bytes().get(i) { + if matches!(*c, b'<' | b'&' | b'\r' | b'\0') { + break; + } + if *c == b'\n' { + n_newlines += 1; + } + + i += 1; + } + + let set_result = if i == 0 { + let first_char = input.pop_front_char().unwrap(); + debug_assert!(matches!(first_char, '<' | '&' | '\r' | '\0')); + + // FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case. + // Still, it would be nice to not have to do that. + // The same is true for the unwrap call. 
+ let preprocessed_char = self + .get_preprocessed_char(first_char, &BufferQueue::default()) + .unwrap(); + SetResult::FromSet(preprocessed_char) + } else { + debug_assert!( + input.len() >= i, + "Trying to remove {:?} bytes from a tendril that is only {:?} bytes long", + i, + input.len() + ); + let consumed_chunk = input.unsafe_subtendril(0, i as u32); + input.unsafe_pop_front(i as u32); + SetResult::NotFromSet(consumed_chunk) + }; + + self.current_line.set(self.current_line.get() + n_newlines); + + Some(set_result) + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[target_feature(enable = "sse2")] + unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril, i: &mut usize, n_newlines: &mut u64) { #[cfg(target_arch = "x86")] use std::arch::x86::{ __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, @@ -1920,11 +1984,11 @@ impl Tokenizer { let start = raw_bytes.as_ptr(); const STRIDE: usize = 16; - let mut i = 0; - let mut n_newlines = 0; - while i + STRIDE <= raw_bytes.len() { + *i = 0; + *n_newlines = 0; + while *i + STRIDE <= raw_bytes.len() { // Load a 16 byte chunk from the input - let data = _mm_loadu_si128(start.add(i) as *const __m128i); + let data = _mm_loadu_si128(start.add(*i) as *const __m128i); // Compare the chunk against each mask let quotes = _mm_cmpeq_epi8(data, quote_mask); @@ -1950,54 +2014,93 @@ impl Tokenizer { bitmask.leading_zeros() as usize }; - n_newlines += (newline_mask & ((1 << position) - 1)).count_ones() as u64; - i += position; + *n_newlines += (newline_mask & ((1 << position) - 1)).count_ones() as u64; + *i += position; break; } else { - n_newlines += newline_mask.count_ones() as u64; + *n_newlines += newline_mask.count_ones() as u64; } - i += STRIDE; + *i += STRIDE; } + } - // Process any remaining bytes (less than STRIDE) - while let Some(c) = raw_bytes.get(i) { - if matches!(*c, b'<' | b'&' | b'\r' | b'\0') { - break; - } - if *c == b'\n' { - n_newlines += 1; - } + #[cfg(target_arch = 
"aarch64")] + #[target_feature(enable = "neon")] + /// Implements the [data state] with NEON SIMD instructions for AArch64. + /// + /// The algorithm implemented is the naive SIMD approach described [here]. + /// + /// ### SAFETY: + /// Calling this function on a CPU that does not support NEON causes undefined behaviour. + /// + /// [data state]: https://html.spec.whatwg.org/#data-state + /// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/ + unsafe fn data_state_neon_fast_path(&self, input: &mut StrTendril, i: &mut usize, n_newlines: &mut u64) { + use std::arch::aarch64::{ + vceqq_u8, vdupq_n_u8, vld1q_u8, vmaxvq_u8, vorrq_u8, + }; - i += 1; - } + debug_assert!(!input.is_empty()); - let set_result = if i == 0 { - let first_char = input.pop_front_char().unwrap(); - debug_assert!(matches!(first_char, '<' | '&' | '\r' | '\0')); + let quote_mask = vdupq_n_u8(b'<'); + let escape_mask = vdupq_n_u8(b'&'); + let carriage_return_mask = vdupq_n_u8(b'\r'); + let zero_mask = vdupq_n_u8(b'\0'); + let newline_mask = vdupq_n_u8(b'\n'); - // FIXME: Passing a bogus input queue is only relevant when c is \n, which can never happen in this case. - // Still, it would be nice to not have to do that. - // The same is true for the unwrap call. 
- let preprocessed_char = self - .get_preprocessed_char(first_char, &BufferQueue::default()) - .unwrap(); - SetResult::FromSet(preprocessed_char) - } else { - debug_assert!( - input.len() >= i, - "Trying to remove {:?} bytes from a tendril that is only {:?} bytes long", - i, - input.len() - ); - let consumed_chunk = input.unsafe_subtendril(0, i as u32); - input.unsafe_pop_front(i as u32); - SetResult::NotFromSet(consumed_chunk) - }; + let raw_bytes: &[u8] = input.as_bytes(); + let start = raw_bytes.as_ptr(); - self.current_line.set(self.current_line.get() + n_newlines); + const STRIDE: usize = 16; + *i = 0; + *n_newlines = 0; + while *i + STRIDE <= raw_bytes.len() { + // Load a 16 byte chunk from the input + let data = vld1q_u8(start.add(*i)); - Some(set_result) + // Compare the chunk against each mask + let quotes = vceqq_u8(data, quote_mask); + let escapes = vceqq_u8(data, escape_mask); + let carriage_returns = vceqq_u8(data, carriage_return_mask); + let zeros = vceqq_u8(data, zero_mask); + let newlines = vceqq_u8(data, newline_mask); + + // Combine all test results and create a bitmask from them. + // Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise. 
+ let test_result = vorrq_u8( + vorrq_u8(quotes, zeros), + vorrq_u8(escapes, carriage_returns), + ); + let bitmask = vmaxvq_u8(test_result); + let newline_mask = vmaxvq_u8(newlines); + if bitmask != 0 { + // We have reached one of the characters that cause the state machine to transition + let chunk_bytes = std::slice::from_raw_parts(start.add(*i), STRIDE); + let position = chunk_bytes + .iter() + .position(|&b| matches!(b, b'<' | b'&' | b'\r' | b'\0')) + .unwrap(); + + *n_newlines += chunk_bytes[..position] + .iter() + .filter(|&&b| b == b'\n') + .count() as u64; + + *i += position; + break; + } else { + if newline_mask != 0 { + let chunk_bytes = std::slice::from_raw_parts(start.add(*i), STRIDE); + *n_newlines += chunk_bytes + .iter() + .filter(|&&b| b == b'\n') + .count() as u64; + } + } + + *i += STRIDE; + } } } From 6d481ed46c1de7b4cc089554b29358c19366361b Mon Sep 17 00:00:00 2001 From: Pavel Irzhauski Date: Fri, 13 Jun 2025 17:44:18 +0000 Subject: [PATCH 2/4] refine NEON optimization --- html5ever/src/tokenizer/mod.rs | 64 +++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs index 98d2b61e..76813869 100644 --- a/html5ever/src/tokenizer/mod.rs +++ b/html5ever/src/tokenizer/mod.rs @@ -1885,7 +1885,7 @@ impl Tokenizer { } } - /// Checks for supported SIMD feature, which is now either SSE2 for x86/x86_64 or NEON for aarch64 + /// Checks for supported SIMD feature, which is now either SSE2 for x86/x86_64 or NEON for aarch64. 
fn is_supported_simd_feature_detected() -> bool { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { is_x86_feature_detected!("sse2") } @@ -1909,14 +1909,11 @@ impl Tokenizer { /// [data state]: https://html.spec.whatwg.org/#data-state /// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/ unsafe fn data_state_simd_fast_path(&self, input: &mut StrTendril) -> Option { - let mut i = 0; - let mut n_newlines = 0; - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - self.data_state_sse2_fast_path(input, &mut i, &mut n_newlines); + let (mut i, mut n_newlines) = self.data_state_sse2_fast_path(input); #[cfg(target_arch = "aarch64")] - self.data_state_neon_fast_path(input, &mut i, &mut n_newlines); + let (mut i, mut n_newlines) = self.data_state_neon_fast_path(input); // Process any remaining bytes (less than STRIDE) while let Some(c) = input.as_bytes().get(i) { @@ -1960,7 +1957,14 @@ impl Tokenizer { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "sse2")] - unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril, i: &mut usize, n_newlines: &mut u64) { + /// Implements the [data state] with SSE2 instructions for x86/x86_64. + /// Returns a pair of the number of bytes processed and the number of newlines found. + /// + /// ### SAFETY: + /// Calling this function on a CPU that does not support SSE2 causes undefined behaviour. 
+ /// + /// [data state]: https://html.spec.whatwg.org/#data-state + unsafe fn data_state_sse2_fast_path(&self, input: &mut StrTendril) -> (usize, u64) { #[cfg(target_arch = "x86")] use std::arch::x86::{ __m128i, _mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128, @@ -1984,11 +1988,11 @@ impl Tokenizer { let start = raw_bytes.as_ptr(); const STRIDE: usize = 16; - *i = 0; - *n_newlines = 0; - while *i + STRIDE <= raw_bytes.len() { + let mut i = 0; + let mut n_newlines = 0; + while i + STRIDE <= raw_bytes.len() { // Load a 16 byte chunk from the input - let data = _mm_loadu_si128(start.add(*i) as *const __m128i); + let data = _mm_loadu_si128(start.add(i) as *const __m128i); // Compare the chunk against each mask let quotes = _mm_cmpeq_epi8(data, quote_mask); @@ -2014,29 +2018,29 @@ impl Tokenizer { bitmask.leading_zeros() as usize }; - *n_newlines += (newline_mask & ((1 << position) - 1)).count_ones() as u64; - *i += position; + n_newlines += (newline_mask & ((1 << position) - 1)).count_ones() as u64; + i += position; break; } else { - *n_newlines += newline_mask.count_ones() as u64; + n_newlines += newline_mask.count_ones() as u64; } - *i += STRIDE; + i += STRIDE; } + + (i, n_newlines) } #[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] /// Implements the [data state] with NEON SIMD instructions for AArch64. - /// - /// The algorithm implemented is the naive SIMD approach described [here]. + /// Returns a pair of the number of bytes processed and the number of newlines found. /// /// ### SAFETY: /// Calling this function on a CPU that does not support NEON causes undefined behaviour. 
/// /// [data state]: https://html.spec.whatwg.org/#data-state - /// [here]: https://lemire.me/blog/2024/06/08/scan-html-faster-with-simd-instructions-chrome-edition/ - unsafe fn data_state_neon_fast_path(&self, input: &mut StrTendril, i: &mut usize, n_newlines: &mut u64) { + unsafe fn data_state_neon_fast_path(&self, input: &mut StrTendril) -> (usize, u64) { use std::arch::aarch64::{ vceqq_u8, vdupq_n_u8, vld1q_u8, vmaxvq_u8, vorrq_u8, }; @@ -2053,11 +2057,11 @@ impl Tokenizer { let start = raw_bytes.as_ptr(); const STRIDE: usize = 16; - *i = 0; - *n_newlines = 0; - while *i + STRIDE <= raw_bytes.len() { + let mut i = 0; + let mut n_newlines = 0; + while i + STRIDE <= raw_bytes.len() { // Load a 16 byte chunk from the input - let data = vld1q_u8(start.add(*i)); + let data = vld1q_u8(start.add(i)); // Compare the chunk against each mask let quotes = vceqq_u8(data, quote_mask); @@ -2076,31 +2080,33 @@ impl Tokenizer { let newline_mask = vmaxvq_u8(newlines); if bitmask != 0 { // We have reached one of the characters that cause the state machine to transition - let chunk_bytes = std::slice::from_raw_parts(start.add(*i), STRIDE); + let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE); let position = chunk_bytes .iter() .position(|&b| matches!(b, b'<' | b'&' | b'\r' | b'\0')) .unwrap(); - *n_newlines += chunk_bytes[..position] + n_newlines += chunk_bytes[..position] .iter() .filter(|&&b| b == b'\n') .count() as u64; - *i += position; + i += position; break; } else { if newline_mask != 0 { - let chunk_bytes = std::slice::from_raw_parts(start.add(*i), STRIDE); - *n_newlines += chunk_bytes + let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE); + n_newlines += chunk_bytes .iter() .filter(|&&b| b == b'\n') .count() as u64; } } - *i += STRIDE; + i += STRIDE; } + + (i, n_newlines) } } From 9cdd12e3f90e5aab4eb269ad21b563ce7be634b2 Mon Sep 17 00:00:00 2001 From: Pavel Irzhauski Date: Fri, 13 Jun 2025 18:10:17 +0000 Subject: [PATCH 3/4] meet format 
check requirements --- html5ever/src/tokenizer/mod.rs | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs index 76813869..ad749b1d 100644 --- a/html5ever/src/tokenizer/mod.rs +++ b/html5ever/src/tokenizer/mod.rs @@ -743,7 +743,11 @@ impl Tokenizer { self.pop_except_from(input, set) }; - #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(not(any( + target_arch = "x86", + target_arch = "x86_64", + target_arch = "aarch64" + )))] let set_result = self.pop_except_from(input, set); let Some(set_result) = set_result else { @@ -1888,10 +1892,14 @@ impl Tokenizer { /// Checks for supported SIMD feature, which is now either SSE2 for x86/x86_64 or NEON for aarch64. fn is_supported_simd_feature_detected() -> bool { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { is_x86_feature_detected!("sse2") } + { + is_x86_feature_detected!("sse2") + } #[cfg(target_arch = "aarch64")] - { std::arch::is_aarch64_feature_detected!("neon") } + { + std::arch::is_aarch64_feature_detected!("neon") + } #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))] false @@ -2041,9 +2049,7 @@ impl Tokenizer { /// /// [data state]: https://html.spec.whatwg.org/#data-state unsafe fn data_state_neon_fast_path(&self, input: &mut StrTendril) -> (usize, u64) { - use std::arch::aarch64::{ - vceqq_u8, vdupq_n_u8, vld1q_u8, vmaxvq_u8, vorrq_u8, - }; + use std::arch::aarch64::{vceqq_u8, vdupq_n_u8, vld1q_u8, vmaxvq_u8, vorrq_u8}; debug_assert!(!input.is_empty()); @@ -2072,10 +2078,8 @@ impl Tokenizer { // Combine all test results and create a bitmask from them. // Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise. 
- let test_result = vorrq_u8( - vorrq_u8(quotes, zeros), - vorrq_u8(escapes, carriage_returns), - ); + let test_result = + vorrq_u8(vorrq_u8(quotes, zeros), vorrq_u8(escapes, carriage_returns)); let bitmask = vmaxvq_u8(test_result); let newline_mask = vmaxvq_u8(newlines); if bitmask != 0 { @@ -2096,10 +2100,7 @@ impl Tokenizer { } else { if newline_mask != 0 { let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE); - n_newlines += chunk_bytes - .iter() - .filter(|&&b| b == b'\n') - .count() as u64; + n_newlines += chunk_bytes.iter().filter(|&&b| b == b'\n').count() as u64; } } From b1a53169fd3fd3d20c78aaaba7ca57923705e8c4 Mon Sep 17 00:00:00 2001 From: Pavel Irzhauski Date: Fri, 13 Jun 2025 18:10:17 +0000 Subject: [PATCH 4/4] meet format check requirements Signed-off-by: Pavel Irzhauski --- html5ever/src/tokenizer/mod.rs | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs index 76813869..ad749b1d 100644 --- a/html5ever/src/tokenizer/mod.rs +++ b/html5ever/src/tokenizer/mod.rs @@ -743,7 +743,11 @@ impl Tokenizer { self.pop_except_from(input, set) }; - #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(not(any( + target_arch = "x86", + target_arch = "x86_64", + target_arch = "aarch64" + )))] let set_result = self.pop_except_from(input, set); let Some(set_result) = set_result else { @@ -1888,10 +1892,14 @@ impl Tokenizer { /// Checks for supported SIMD feature, which is now either SSE2 for x86/x86_64 or NEON for aarch64. 
fn is_supported_simd_feature_detected() -> bool { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { is_x86_feature_detected!("sse2") } + { + is_x86_feature_detected!("sse2") + } #[cfg(target_arch = "aarch64")] - { std::arch::is_aarch64_feature_detected!("neon") } + { + std::arch::is_aarch64_feature_detected!("neon") + } #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))] false @@ -2041,9 +2049,7 @@ impl Tokenizer { /// /// [data state]: https://html.spec.whatwg.org/#data-state unsafe fn data_state_neon_fast_path(&self, input: &mut StrTendril) -> (usize, u64) { - use std::arch::aarch64::{ - vceqq_u8, vdupq_n_u8, vld1q_u8, vmaxvq_u8, vorrq_u8, - }; + use std::arch::aarch64::{vceqq_u8, vdupq_n_u8, vld1q_u8, vmaxvq_u8, vorrq_u8}; debug_assert!(!input.is_empty()); @@ -2072,10 +2078,8 @@ impl Tokenizer { // Combine all test results and create a bitmask from them. // Each bit in the mask will be 1 if the character at the bit position is in the set and 0 otherwise. - let test_result = vorrq_u8( - vorrq_u8(quotes, zeros), - vorrq_u8(escapes, carriage_returns), - ); + let test_result = + vorrq_u8(vorrq_u8(quotes, zeros), vorrq_u8(escapes, carriage_returns)); let bitmask = vmaxvq_u8(test_result); let newline_mask = vmaxvq_u8(newlines); if bitmask != 0 { @@ -2096,10 +2100,7 @@ impl Tokenizer { } else { if newline_mask != 0 { let chunk_bytes = std::slice::from_raw_parts(start.add(i), STRIDE); - n_newlines += chunk_bytes - .iter() - .filter(|&&b| b == b'\n') - .count() as u64; + n_newlines += chunk_bytes.iter().filter(|&&b| b == b'\n').count() as u64; } }