Commit 997be0f

Merge pull request #224 from codeflash-ai/remove-tiktoken
Remove tiktoken from the codebase
2 parents: 3232228 + e54a6cd

3 files changed: 10 additions, 9 deletions

codeflash/code_utils/code_utils.py

Lines changed: 4 additions & 0 deletions

@@ -10,6 +10,10 @@
 
 from codeflash.cli_cmds.console import logger
 
+def encoded_tokens_len(s: str) -> int:
+    """Return the approximate length of the string encoded as tokens.
+    An approximation of BPE encoding (https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)."""
+    return len(s) // 2
 
 def get_qualified_name(module_name: str, full_qualified_name: str) -> str:
     if not full_qualified_name:
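
The commit swaps exact BPE token counts for a two-characters-per-token heuristic, trading precision for one fewer dependency. As a quick sanity check, a throwaway script along these lines (hypothetical, not part of the commit; it assumes tiktoken is still installed and uses an arbitrary sample string) compares the heuristic against the exact counts it replaces:

# Hypothetical sanity check, not part of this commit: compare the len(s) // 2
# heuristic against the exact tiktoken counts it replaces.
import tiktoken  # requires: pip install tiktoken


def encoded_tokens_len(s: str) -> int:
    # The commit's approximation: roughly one BPE token per two characters.
    return len(s) // 2


sample = "def get_qualified_name(module_name: str, full_qualified_name: str) -> str:"
tokenizer = tiktoken.encoding_for_model("gpt-4o")
exact = len(tokenizer.encode(sample))
approx = encoded_tokens_len(sample)
print(f"tiktoken: {exact} tokens, heuristic: {approx} tokens")

For typical English prose the usual rule of thumb is closer to four characters per token, so halving the character count tends to overestimate; the limit checks in the next file therefore err on the conservative side.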

codeflash/context/code_context_extractor.py

Lines changed: 6 additions & 8 deletions

@@ -7,13 +7,12 @@
 
 import jedi
 import libcst as cst
-import tiktoken
 from jedi.api.classes import Name
 from libcst import CSTNode
 
 from codeflash.cli_cmds.console import logger
 from codeflash.code_utils.code_extractor import add_needed_imports_from_module, find_preexisting_objects
-from codeflash.code_utils.code_utils import get_qualified_name, path_belongs_to_site_packages
+from codeflash.code_utils.code_utils import get_qualified_name, path_belongs_to_site_packages, encoded_tokens_len
 from codeflash.context.unused_definition_remover import remove_unused_definitions_by_function_names
 from codeflash.discovery.functions_to_optimize import FunctionToOptimize
 from codeflash.models.models import (
@@ -73,8 +72,7 @@ def get_code_optimization_context(
     )
 
     # Handle token limits
-    tokenizer = tiktoken.encoding_for_model("gpt-4o")
-    final_read_writable_tokens = len(tokenizer.encode(final_read_writable_code))
+    final_read_writable_tokens = encoded_tokens_len(final_read_writable_code)
     if final_read_writable_tokens > optim_token_limit:
         raise ValueError("Read-writable code has exceeded token limit, cannot proceed")
 
@@ -87,7 +85,7 @@
     )
     read_only_context_code = read_only_code_markdown.markdown
 
-    read_only_code_markdown_tokens = len(tokenizer.encode(read_only_context_code))
+    read_only_code_markdown_tokens = encoded_tokens_len(read_only_context_code)
     total_tokens = final_read_writable_tokens + read_only_code_markdown_tokens
     if total_tokens > optim_token_limit:
         logger.debug("Code context has exceeded token limit, removing docstrings from read-only code")
@@ -96,7 +94,7 @@
         helpers_of_fto_dict, helpers_of_helpers_dict, project_root_path, remove_docstrings=True
     )
     read_only_context_code = read_only_code_no_docstring_markdown.markdown
-    read_only_code_no_docstring_markdown_tokens = len(tokenizer.encode(read_only_context_code))
+    read_only_code_no_docstring_markdown_tokens = encoded_tokens_len(read_only_context_code)
     total_tokens = final_read_writable_tokens + read_only_code_no_docstring_markdown_tokens
     if total_tokens > optim_token_limit:
         logger.debug("Code context has exceeded token limit, removing read-only code")
@@ -111,7 +109,7 @@
         code_context_type=CodeContextType.TESTGEN,
     )
     testgen_context_code = testgen_code_markdown.code
-    testgen_context_code_tokens = len(tokenizer.encode(testgen_context_code))
+    testgen_context_code_tokens = encoded_tokens_len(testgen_context_code)
     if testgen_context_code_tokens > testgen_token_limit:
         testgen_code_markdown = extract_code_string_context_from_files(
             helpers_of_fto_dict,
@@ -121,7 +119,7 @@
         code_context_type=CodeContextType.TESTGEN,
    )
     testgen_context_code = testgen_code_markdown.code
-    testgen_context_code_tokens = len(tokenizer.encode(testgen_context_code))
+    testgen_context_code_tokens = encoded_tokens_len(testgen_context_code)
     if testgen_context_code_tokens > testgen_token_limit:
         raise ValueError("Testgen code context has exceeded token limit, cannot proceed")
 
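
Taken together, the hunks above form a fallback ladder: count tokens for the richest context first, and if the budget is exceeded, rebuild a smaller context (docstrings stripped, then read-only code dropped) before giving up. A condensed, self-contained sketch of that pattern follows; the candidate strings and the 40-token budget are invented for illustration, and the real code rebuilds each candidate with extract_code_string_context_from_files rather than choosing from a prebuilt list:

def encoded_tokens_len(s: str) -> int:
    # Same heuristic as the commit: roughly one BPE token per two characters.
    return len(s) // 2


def fit_context(candidates: list[str], token_limit: int) -> str:
    """Return the first candidate whose approximate token count fits the budget."""
    for candidate in candidates:
        if encoded_tokens_len(candidate) <= token_limit:
            return candidate
    raise ValueError("Code context has exceeded token limit, cannot proceed")


full_context = "read-writable code plus read-only helpers with docstrings " * 4
no_docstrings = "read-writable code plus read-only helpers, docstrings stripped"
minimal_context = "read-writable code only"

# Picks no_docstrings: the full context overshoots the 40-token budget.
print(fit_context([full_context, no_docstrings, minimal_context], token_limit=40))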

pyproject.toml

Lines changed: 0 additions & 1 deletion

@@ -73,7 +73,6 @@ pytest = ">=7.0.0,!=8.3.4"
 gitpython = ">=3.1.31"
 libcst = ">=1.0.1"
 jedi = ">=0.19.1"
-tiktoken = ">=0.7.0"
 timeout-decorator = ">=0.5.0"
 pytest-timeout = ">=2.1.0"
 tomlkit = ">=0.11.7"
