Skip to content

Conversation

@codeflash-ai
Copy link
Contributor

@codeflash-ai codeflash-ai bot commented May 21, 2025

⚡️ This pull request contains optimizations for PR #231

If you approve this dependent PR, these changes will be merged into the original PR branch remove-tiktoken.

This PR will be automatically closed if the original PR is merged.


📄 39% (0.39x) speedup for encoded_tokens_len in codeflash/code_utils/code_utils.py

⏱️ Runtime : 42.8 microseconds → 30.8 microseconds (best of 277 runs)

⚡️ This change will improve the performance of the following benchmarks:

Benchmark File :: Function Original Runtime Expected New Runtime Speedup
tests.benchmarks.test_benchmark_code_extract_code_context::test_benchmark_extract 13.6 seconds 13.6 seconds 0.00%

📝 Explanation and details

Here is an optimized version of your code.
The work here is tiny — a length lookup (O(1) on CPython strings), a multiplication, and an int() conversion — so the remaining overhead is the floating-point multiply and the float-to-int cast in int(len(s) * 0.3).
To minimize that overhead, we can use pure integer arithmetic instead: multiplying by 3 and integer-dividing by 10 is nearly equivalent to multiplying by 0.3 (results can differ for a few lengths because 0.3 is not exactly representable in binary floating point).

Here's the optimized code.

This avoids floating point multiplication and int() casting, and is slightly faster.
All comments and signatures are preserved.

Correctness verification report:

Test Status
⏪ Replay Tests 3 Passed
⚙️ Existing Unit Tests 🔘 None Found
🔎 Concolic Coverage Tests 1 Passed
🌀 Generated Regression Tests 57 Passed
📊 Tests Coverage
🌀 Generated Regression Tests Details
from __future__ import annotations

# imports
import pytest  # used for our unit tests
from codeflash.code_utils.code_utils import encoded_tokens_len

# unit tests
#
# NOTE(review): these tests assign to `codeflash_output` instead of asserting.
# The codeflash harness captures that variable and compares the original and
# optimized implementations against each other (see the trailing comment in
# this file), so the variable name must not be changed.
# The inline "expected value" comments below are generator artifacts and do
# not always match the function's actual heuristic — treat them as hints,
# not as contracts.

# 1. Basic Test Cases

def test_empty_string():
    """Test that empty string returns 0 tokens."""
    codeflash_output = encoded_tokens_len("")

def test_single_ascii_word():
    """Test a single short ASCII word."""
    codeflash_output = encoded_tokens_len("hello")  # len=5: ceil(5/4)=2, 1 word

def test_multiple_ascii_words():
    """Test a simple sentence with multiple ASCII words."""
    codeflash_output = encoded_tokens_len("hello world")  # len=11: ceil(11/4)=3, 2 words

def test_ascii_sentence_with_punctuation():
    """Test sentence with punctuation."""
    codeflash_output = encoded_tokens_len("Hello, world!")  # len=13: ceil(13/4)=4, 2 words

def test_ascii_sentence_with_extra_spaces():
    """Test sentence with leading/trailing/multiple spaces."""
    # NOTE(review): the generated comment below says len=17, but the literal
    # is 19 characters — another hint the annotations are approximate.
    codeflash_output = encoded_tokens_len("   hello   world   ")  # len=17: ceil(10/4)=3, 2 words

def test_whitespace_only():
    """Test string with only whitespace."""
    codeflash_output = encoded_tokens_len("     ")

def test_ascii_numbers_and_symbols():
    """Test string with numbers and symbols."""
    codeflash_output = encoded_tokens_len("1234!@#$")  # len=8: ceil(8/4)=2, 1 word

# 2. Edge Test Cases

def test_non_ascii_single_character():
    """Test a single non-ASCII character (e.g., emoji)."""
    codeflash_output = encoded_tokens_len("😊")

def test_ascii_and_non_ascii_mixed():
    """Test a string with mixed ASCII and non-ASCII characters."""
    # "hello😊world" -> ascii_part="helloworld" (10 chars, ceil(10/4)=3), non_ascii=1
    codeflash_output = encoded_tokens_len("hello😊world")

def test_only_non_ascii():
    """Test string with only non-ASCII characters."""
    codeflash_output = encoded_tokens_len("你好世界")  # 4 non-ascii chars

def test_ascii_with_newlines_and_tabs():
    """Test string with newlines and tabs."""
    s = "hello\nworld\tfoo"
    # ascii_part = "hello\nworld\tfoo" (15 chars, ceil(15/4)=4), 3 words
    codeflash_output = encoded_tokens_len(s)

def test_word_boundary_tokenization():
    """Test that token count is at least number of words."""
    s = "a b c d"
    # 4 words, len=7, ceil(7/4)=2, but must be at least 4 tokens
    codeflash_output = encoded_tokens_len(s)

def test_long_word():
    """Test a single long word."""
    s = "a" * 100
    # 1 word, len=100, ceil(100/4)=25
    codeflash_output = encoded_tokens_len(s)

def test_non_string_input():
    """Test non-string input raises TypeError."""
    with pytest.raises(TypeError):
        encoded_tokens_len(123)

def test_unicode_combining_characters():
    """Test string with combining unicode characters."""
    s = "e\u0301"  # 'é' as 'e' + combining acute
    # 2 codepoints: 'e' (ASCII) + U+0301 combining acute (non-ASCII, codepoint 769)
    codeflash_output = encoded_tokens_len(s)

def test_string_with_surrogate_pairs():
    """Test string with emoji that are surrogate pairs in UTF-16."""
    s = "👨‍👩‍👦"  # family emoji: man + ZWJ + woman + ZWJ + boy = 5 codepoints
    codeflash_output = encoded_tokens_len(s)

def test_string_with_mixed_whitespace():
    """Test string with various whitespace characters."""
    s = " \t\n\r"
    codeflash_output = encoded_tokens_len(s)

def test_string_with_leading_trailing_non_ascii():
    """Test string with non-ascii at start and end."""
    s = "😊hello😊"
    # ascii_part="hello", len=5, ceil(5/4)=2, 1 word, non_ascii=2
    codeflash_output = encoded_tokens_len(s)

# 3. Large Scale Test Cases

def test_large_ascii_string():
    """Test a large ASCII string."""
    s = "a" * 1000
    # 1 word, len=1000, ceil(1000/4)=250
    codeflash_output = encoded_tokens_len(s)

def test_large_ascii_words():
    """Test a string of 500 words, each 2 chars."""
    s = "ab " * 500  # 500 words, 1500 chars
    # ascii_part: 1500 chars, ceil(1500/4)=375, 500 words
    codeflash_output = encoded_tokens_len(s)

def test_large_non_ascii_string():
    """Test a large non-ASCII string."""
    s = "你" * 999
    # 999 non-ascii chars
    codeflash_output = encoded_tokens_len(s)

def test_large_mixed_ascii_non_ascii():
    """Test a large string with mixed ascii and non-ascii."""
    s = ("a" * 500) + ("你" * 500)
    # ascii_part: 500, ceil(500/4)=125, 1 word, non_ascii=500
    codeflash_output = encoded_tokens_len(s)

def test_large_sentence_varied_words():
    """Test a large sentence with many short words."""
    s = "a b c d e f g h i j " * 50  # 10 words x 50 = 500 words, 1000 chars
    # ascii_part: 1000 chars, ceil(1000/4)=250, 500 words
    codeflash_output = encoded_tokens_len(s)

def test_large_string_with_only_spaces():
    """Test a large string with only spaces."""
    s = " " * 999
    codeflash_output = encoded_tokens_len(s)

def test_large_string_with_newlines():
    """Test a large string with newlines between words."""
    s = ("word\n" * 200)  # 200 words, 1000 chars
    # ascii_part: 1000 chars, ceil(1000/4)=250, 200 words
    codeflash_output = encoded_tokens_len(s)

def test_large_string_with_emoji():
    """Test a large string with many emoji."""
    s = "😊" * 500
    codeflash_output = encoded_tokens_len(s)

def test_large_string_with_ascii_and_emoji():
    """Test a large string with alternating ascii and emoji."""
    s = ("a😊" * 400)  # 800 chars, 400 non-ascii
    # ascii_part: 400, ceil(400/4)=100, 400 words (if split), but no spaces, so 1 word
    codeflash_output = encoded_tokens_len(s)
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

from __future__ import annotations

# imports
import pytest  # used for our unit tests
from codeflash.code_utils.code_utils import encoded_tokens_len

# unit tests
#
# NOTE(review): the `expected` values below are computed but never asserted
# against `codeflash_output` in this listing — presumably the codeflash
# harness performs the original-vs-optimized comparison via the captured
# `codeflash_output` variable (see the trailing comment); verify.
# The recurring `int(len(s) * 0.3 + 0.9999999)` expression appears to emulate
# ceil(len(s) * 0.3) without importing math — confirm against the
# implementation in codeflash/code_utils/code_utils.py.

# 1. Basic Test Cases

def test_empty_string():
    """Test that an empty string returns 0 tokens."""
    codeflash_output = encoded_tokens_len("")

def test_single_ascii_character():
    """Test that a single character string returns 1 token."""
    codeflash_output = encoded_tokens_len("a")

def test_short_ascii_word():
    """Test a short word, expecting 1 token."""
    codeflash_output = encoded_tokens_len("hello")  # 5*0.3=1.5→2

def test_ascii_sentence():
    """Test a typical English sentence."""
    s = "The quick brown fox jumps over the lazy dog."
    expected = int(len(s) * 0.3 + 0.9999999)
    codeflash_output = encoded_tokens_len(s)

def test_string_with_spaces_and_punctuation():
    """Test string with spaces and punctuation."""
    s = "Hello, world!"
    expected = int(len(s) * 0.3 + 0.9999999)
    codeflash_output = encoded_tokens_len(s)

def test_numeric_string():
    """Test a string containing only numbers."""
    s = "1234567890"
    expected = int(len(s) * 0.3 + 0.9999999)
    codeflash_output = encoded_tokens_len(s)

def test_mixed_alphanumeric():
    """Test a string with mixed letters and numbers."""
    s = "abc123XYZ"
    expected = int(len(s) * 0.3 + 0.9999999)
    codeflash_output = encoded_tokens_len(s)

# 2. Edge Test Cases

def test_non_string_input_int():
    """Test that non-string input (int) raises TypeError."""
    with pytest.raises(TypeError):
        encoded_tokens_len(123)


def test_unicode_characters():
    """Test string with non-ASCII unicode characters."""
    s = "你好,世界"  # 5 chars, should still count by char length
    expected = int(len(s) * 0.3 + 0.9999999)
    codeflash_output = encoded_tokens_len(s)

def test_emoji_characters():
    """Test string with emoji characters."""
    s = "😀😃😄😁"
    expected = int(len(s) * 0.3 + 0.9999999)
    codeflash_output = encoded_tokens_len(s)

def test_long_word_no_spaces():
    """Test a long word without spaces."""
    s = "a" * 100
    expected = int(100 * 0.3 + 0.9999999)
    codeflash_output = encoded_tokens_len(s)

def test_string_with_newlines_and_tabs():
    """Test string with newlines and tab characters."""
    s = "line1\nline2\tline3"
    expected = int(len(s) * 0.3 + 0.9999999)
    codeflash_output = encoded_tokens_len(s)

def test_string_with_only_whitespace():
    """Test string that contains only whitespace."""
    s = "     "
    expected = int(len(s) * 0.3 + 0.9999999)
    codeflash_output = encoded_tokens_len(s)

def test_string_with_surrogate_pairs():
    """Test string with characters outside BMP (surrogate pairs)."""
    s = "𝒜𝒷𝒸𝒹"  # Each is a single unicode char (astral plane; len() counts 4 codepoints)
    expected = int(len(s) * 0.3 + 0.9999999)
    codeflash_output = encoded_tokens_len(s)

def test_string_with_combining_characters():
    """Test string with combining unicode characters."""
    s = "a\u0301e\u0301i\u0301"  # áéí (each is 2 codepoints)
    expected = int(len(s) * 0.3 + 0.9999999)
    codeflash_output = encoded_tokens_len(s)

def test_string_length_just_below_token_boundary():
    """Test string length just below a new token boundary."""
    s = "a" * 3  # 3*0.3=0.9→1
    codeflash_output = encoded_tokens_len(s)
    s = "a" * 6  # 6*0.3=1.8→2
    codeflash_output = encoded_tokens_len(s)
    s = "a" * 9  # 9*0.3=2.7→3
    codeflash_output = encoded_tokens_len(s)

def test_string_length_exact_token_boundary():
    """Test string length exactly at token boundary."""
    # For len=10, 10*0.3=3.0, so should round up to 3
    s = "a" * 10
    codeflash_output = encoded_tokens_len(s)

def test_string_length_just_above_token_boundary():
    """Test string length just above a new token boundary."""
    s = "a" * 11  # 11*0.3=3.3→4
    codeflash_output = encoded_tokens_len(s)

def test_string_with_null_characters():
    """Test string containing null bytes."""
    s = "abc\0def"
    expected = int(len(s) * 0.3 + 0.9999999)
    codeflash_output = encoded_tokens_len(s)

# 3. Large Scale Test Cases

def test_very_long_ascii_string():
    """Test a very long ASCII string (1000 chars)."""
    s = "a" * 1000
    expected = int(1000 * 0.3 + 0.9999999)
    codeflash_output = encoded_tokens_len(s)

def test_very_long_unicode_string():
    """Test a very long unicode string (e.g., 500 emoji)."""
    s = "😀" * 500
    expected = int(500 * 0.3 + 0.9999999)
    codeflash_output = encoded_tokens_len(s)

def test_very_long_mixed_string():
    """Test a very long mixed string (ascii, unicode, whitespace, numbers, punctuation)."""
    s = ("abc123!@# \n\t" + "你好😀") * 50  # length = 12+3=15*50=750
    expected = int(len(s) * 0.3 + 0.9999999)
    codeflash_output = encoded_tokens_len(s)

def test_large_variety_of_characters():
    """Test a string with high variety of unicode and ASCII characters."""
    s = "".join(chr(i) for i in range(32, 32+950))  # 950 chars, diverse
    expected = int(len(s) * 0.3 + 0.9999999)
    codeflash_output = encoded_tokens_len(s)

def test_multiple_large_strings():
    """Test multiple large strings in a loop for performance and determinism."""
    for i in range(100, 1001, 200):  # lengths: 100, 300, 500, 700, 900
        s = "x" * i
        expected = int(i * 0.3 + 0.9999999)
        codeflash_output = encoded_tokens_len(s)
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

from codeflash.code_utils.code_utils import encoded_tokens_len


def test_encoded_tokens_len():
    """Concolic coverage: encoded_tokens_len must accept the empty string."""
    _ = encoded_tokens_len('')

To edit these changes, run `git checkout codeflash/optimize-pr231-2025-05-21T01.49.34` and push.

Codeflash

…tiktoken`)

Here is an optimized version of your code.  
The work here is tiny — a length lookup (O(1) on CPython strings), a multiplication, and an int() conversion — so the remaining overhead is the floating-point multiply and the float-to-int cast in `int(len(s)*0.3)`.  
To minimize that overhead, we can use pure integer arithmetic instead: multiplying by 3 and integer-dividing by 10 is nearly equivalent to multiplying by 0.3 (results can differ for a few lengths because 0.3 is not exactly representable in binary floating point).

Here's the optimized code.



This avoids floating point multiplication and `int()` casting, and is slightly faster.  
All comments and signatures are preserved.
@codeflash-ai codeflash-ai bot added the ⚡️ codeflash Optimization PR opened by Codeflash AI label May 21, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

⚡️ codeflash Optimization PR opened by Codeflash AI

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants