
Commit ec4afc4

Merge branch 'main' into add-info-to-codeflash-all-docs
2 parents: 7112c5e + 2ba89e6

6 files changed (+34 / -27 lines)

codeflash/api/aiservice.py

Lines changed: 7 additions & 8 deletions
@@ -1,10 +1,9 @@
 from __future__ import annotations

-import time
-
 import json
 import os
 import platform
+import time
 from typing import TYPE_CHECKING, Any

 import requests
@@ -122,7 +121,7 @@ def optimize_python_code(
         logger.info(f"Generated {len(optimizations_json)} candidates.")
         console.rule()
         end_time = time.perf_counter()
-        logger.debug(f"Optimization took {end_time - start_time:.2f} seconds.")
+        logger.debug(f"Generating optimizations took {end_time - start_time:.2f} seconds.")
         return [
             OptimizedCandidate(
                 source_code=opt["source_code"],
@@ -177,7 +176,7 @@ def optimize_python_code_line_profiler(

         logger.info("Generating optimized candidates…")
         console.rule()
-        if line_profiler_results=="":
+        if line_profiler_results == "":
             logger.info("No LineProfiler results were provided, Skipping optimization.")
             console.rule()
             return []
@@ -209,7 +208,6 @@ def optimize_python_code_line_profiler(
             console.rule()
             return []

-
     def log_results(
         self,
         function_trace_id: str,
@@ -272,9 +270,10 @@ def generate_regression_tests(
         - Dict[str, str] | None: The generated regression tests and instrumented tests, or None if an error occurred.

         """
-        assert test_framework in ["pytest", "unittest"], (
-            f"Invalid test framework, got {test_framework} but expected 'pytest' or 'unittest'"
-        )
+        assert test_framework in [
+            "pytest",
+            "unittest",
+        ], f"Invalid test framework, got {test_framework} but expected 'pytest' or 'unittest'"
         payload = {
             "source_code_being_tested": source_code_being_tested,
             "function_to_optimize": function_to_optimize,

codeflash/cli_cmds/cmd_init.py

Lines changed: 7 additions & 6 deletions
@@ -239,7 +239,7 @@ def collect_setup_info() -> SetupInfo:
         else:
             apologize_and_exit()
     else:
-        tests_root = Path(curdir) / Path(cast(str, tests_root_answer))
+        tests_root = Path(curdir) / Path(cast("str", tests_root_answer))
         tests_root = tests_root.relative_to(curdir)
         ph("cli-tests-root-provided")

@@ -302,7 +302,7 @@ def collect_setup_info() -> SetupInfo:
     elif benchmarks_answer == no_benchmarks_option:
         benchmarks_root = None
     else:
-        benchmarks_root = tests_root / Path(cast(str, benchmarks_answer))
+        benchmarks_root = tests_root / Path(cast("str", benchmarks_answer))

     # TODO: Implement other benchmark framework options
     # if benchmarks_root:
@@ -354,9 +354,9 @@ def collect_setup_info() -> SetupInfo:
         module_root=str(module_root),
         tests_root=str(tests_root),
         benchmarks_root=str(benchmarks_root) if benchmarks_root else None,
-        test_framework=cast(str, test_framework),
+        test_framework=cast("str", test_framework),
         ignore_paths=ignore_paths,
-        formatter=cast(str, formatter),
+        formatter=cast("str", formatter),
         git_remote=str(git_remote),
     )

@@ -466,7 +466,7 @@ def check_for_toml_or_setup_file() -> str | None:
         click.echo("⏩️ Skipping pyproject.toml creation.")
         apologize_and_exit()
     click.echo()
-    return cast(str, project_name)
+    return cast("str", project_name)


 def install_github_actions(override_formatter_check: bool = False) -> None:
@@ -852,7 +852,8 @@ def enter_api_key_and_save_to_rc() -> None:


 def create_bubble_sort_file_and_test(args: Namespace) -> tuple[str, str]:
-    bubble_sort_content = """def sorter(arr):
+    bubble_sort_content = """from typing import Union, List
+def sorter(arr: Union[List[int],List[float]]) -> Union[List[int],List[float]]:
     for i in range(len(arr)):
         for j in range(len(arr) - 1):
             if arr[j] > arr[j + 1]:
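A note on the cast changes above: typing.cast ignores its first argument at runtime and simply returns the value, so cast("str", x) behaves exactly like cast(str, x) while keeping the type expression out of runtime evaluation (the quoted form is what lint rules such as ruff's TC006 ask for). A minimal check, with made-up variable names:

from typing import cast

value: object = "tests"
a = cast(str, value)    # type object is passed in, then ignored at runtime
b = cast("str", value)  # quoted form: same runtime result; type checkers read it as a forward reference
assert a == b == "tests"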

codeflash/code_utils/code_utils.py

Lines changed: 4 additions & 0 deletions
@@ -10,6 +10,10 @@

 from codeflash.cli_cmds.console import logger

+def encoded_tokens_len(s: str) -> int:
+    '''Function for returning the approximate length of the encoded tokens
+    It's an approximation of BPE encoding (https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)'''
+    return int(len(s)*0.25)

 def get_qualified_name(module_name: str, full_qualified_name: str) -> str:
     if not full_qualified_name:
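The new helper trades tiktoken's exact count for a cheap heuristic: roughly one BPE token per four characters of text, which is all the token-limit guards downstream need. A small usage sketch (the snippet below is illustrative):

from codeflash.code_utils.code_utils import encoded_tokens_len

snippet = "def add(a, b):\n    return a + b\n"
print(encoded_tokens_len(snippet))  # int(32 * 0.25) == 8 for this 32-character string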

codeflash/context/code_context_extractor.py

Lines changed: 6 additions & 8 deletions
@@ -7,13 +7,12 @@

 import jedi
 import libcst as cst
-import tiktoken
 from jedi.api.classes import Name
 from libcst import CSTNode

 from codeflash.cli_cmds.console import logger
 from codeflash.code_utils.code_extractor import add_needed_imports_from_module, find_preexisting_objects
-from codeflash.code_utils.code_utils import get_qualified_name, path_belongs_to_site_packages
+from codeflash.code_utils.code_utils import get_qualified_name, path_belongs_to_site_packages, encoded_tokens_len
 from codeflash.context.unused_definition_remover import remove_unused_definitions_by_function_names
 from codeflash.discovery.functions_to_optimize import FunctionToOptimize
 from codeflash.models.models import (
@@ -73,8 +72,7 @@ def get_code_optimization_context(
     )

     # Handle token limits
-    tokenizer = tiktoken.encoding_for_model("gpt-4o")
-    final_read_writable_tokens = len(tokenizer.encode(final_read_writable_code))
+    final_read_writable_tokens = encoded_tokens_len(final_read_writable_code)
     if final_read_writable_tokens > optim_token_limit:
         raise ValueError("Read-writable code has exceeded token limit, cannot proceed")

@@ -87,7 +85,7 @@ def get_code_optimization_context(
     )
     read_only_context_code = read_only_code_markdown.markdown

-    read_only_code_markdown_tokens = len(tokenizer.encode(read_only_context_code))
+    read_only_code_markdown_tokens = encoded_tokens_len(read_only_context_code)
     total_tokens = final_read_writable_tokens + read_only_code_markdown_tokens
     if total_tokens > optim_token_limit:
         logger.debug("Code context has exceeded token limit, removing docstrings from read-only code")
@@ -96,7 +94,7 @@ def get_code_optimization_context(
             helpers_of_fto_dict, helpers_of_helpers_dict, project_root_path, remove_docstrings=True
         )
         read_only_context_code = read_only_code_no_docstring_markdown.markdown
-        read_only_code_no_docstring_markdown_tokens = len(tokenizer.encode(read_only_context_code))
+        read_only_code_no_docstring_markdown_tokens = encoded_tokens_len(read_only_context_code)
         total_tokens = final_read_writable_tokens + read_only_code_no_docstring_markdown_tokens
         if total_tokens > optim_token_limit:
             logger.debug("Code context has exceeded token limit, removing read-only code")
@@ -111,7 +109,7 @@ def get_code_optimization_context(
         code_context_type=CodeContextType.TESTGEN,
     )
     testgen_context_code = testgen_code_markdown.code
-    testgen_context_code_tokens = len(tokenizer.encode(testgen_context_code))
+    testgen_context_code_tokens = encoded_tokens_len(testgen_context_code)
     if testgen_context_code_tokens > testgen_token_limit:
         testgen_code_markdown = extract_code_string_context_from_files(
             helpers_of_fto_dict,
@@ -121,7 +119,7 @@ def get_code_optimization_context(
             code_context_type=CodeContextType.TESTGEN,
         )
         testgen_context_code = testgen_code_markdown.code
-        testgen_context_code_tokens = len(tokenizer.encode(testgen_context_code))
+        testgen_context_code_tokens = encoded_tokens_len(testgen_context_code)
         if testgen_context_code_tokens > testgen_token_limit:
             raise ValueError("Testgen code context has exceeded token limit, cannot proceed")

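Together these hunks keep the existing token-budget cascade and only swap the counter from tiktoken to the approximation. A hedged, self-contained paraphrase of that cascade follows; the function name and parameters are illustrative, and the real code re-extracts the read-only context rather than taking pre-built strings:

from codeflash.code_utils.code_utils import encoded_tokens_len

def fit_context_to_budget(read_writable_code: str, read_only_with_docs: str,
                          read_only_no_docs: str, optim_token_limit: int) -> str:
    # Mirrors the trimming order in get_code_optimization_context with the approximate counter.
    read_writable_tokens = encoded_tokens_len(read_writable_code)
    if read_writable_tokens > optim_token_limit:
        raise ValueError("Read-writable code has exceeded token limit, cannot proceed")
    # Try with docstrings, then without, then drop the read-only context entirely.
    for read_only in (read_only_with_docs, read_only_no_docs, ""):
        if read_writable_tokens + encoded_tokens_len(read_only) <= optim_token_limit:
            return read_only
    return ""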

codeflash/optimization/function_optimizer.py

Lines changed: 10 additions & 4 deletions
@@ -3,7 +3,6 @@
 import ast
 import concurrent.futures
 import os
-import shutil
 import subprocess
 import time
 import uuid
@@ -393,7 +392,7 @@ def determine_best_candidate(
         try:
             candidate_index = 0
             original_len = len(candidates)
-            while candidates:
+            while True:
                 done = True if future_line_profile_results is None else future_line_profile_results.done()
                 if done and (future_line_profile_results is not None):
                     line_profile_results = future_line_profile_results.result()
@@ -403,8 +402,14 @@ def determine_best_candidate(
                         f"Added results from line profiler to candidates, total candidates now: {original_len}"
                     )
                     future_line_profile_results = None
+                try:
+                    candidate = candidates.popleft()
+                except IndexError:
+                    if done:
+                        break
+                    time.sleep(0.1)
+                    continue
                 candidate_index += 1
-                candidate = candidates.popleft()
                 get_run_tmp_file(Path(f"test_return_values_{candidate_index}.bin")).unlink(missing_ok=True)
                 get_run_tmp_file(Path(f"test_return_values_{candidate_index}.sqlite")).unlink(missing_ok=True)
                 logger.info(f"Optimization candidate {candidate_index}/{original_len}:")
@@ -512,7 +517,8 @@ def determine_best_candidate(
                     self.write_code_and_helpers(
                         self.function_to_optimize_source_code, original_helper_code, self.function_to_optimize.file_path
                     )
-
+                if done and not candidates:
+                    break
         except KeyboardInterrupt as e:
             self.write_code_and_helpers(
                 self.function_to_optimize_source_code, original_helper_code, self.function_to_optimize.file_path
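The loop rewrite above changes determine_best_candidate from stopping as soon as the deque is momentarily empty (while candidates:) to polling until the line-profiler future has finished and the queue is drained: an empty popleft now sleeps and retries while the future is still running, and the loop breaks only once done is true and nothing is left. A self-contained sketch of that consume-while-producing pattern; the names here (drain_with_late_arrivals, evaluate) are illustrative, not Codeflash APIs:

from __future__ import annotations

import time
from collections import deque
from concurrent.futures import Future, ThreadPoolExecutor

def evaluate(candidate: str) -> None:
    print(f"evaluating {candidate}")

def drain_with_late_arrivals(candidates: deque[str], future: Future | None) -> None:
    """Consume candidates until the producer future is finished and the queue is empty."""
    while True:
        done = True if future is None else future.done()
        if done and future is not None:
            candidates.extend(future.result())  # late arrivals, e.g. line-profiler based candidates
            future = None
        try:
            candidate = candidates.popleft()
        except IndexError:
            if done:
                break             # queue drained and nothing more is coming
            time.sleep(0.1)       # producer still running: wait, then poll again
            continue
        evaluate(candidate)
        if done and not candidates:
            break

if __name__ == "__main__":
    with ThreadPoolExecutor() as pool:
        late = pool.submit(lambda: ["lp_candidate_1", "lp_candidate_2"])
        drain_with_late_arrivals(deque(["candidate_1"]), late)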

pyproject.toml

Lines changed: 0 additions & 1 deletion
@@ -73,7 +73,6 @@ pytest = ">=7.0.0,!=8.3.4"
 gitpython = ">=3.1.31"
 libcst = ">=1.0.1"
 jedi = ">=0.19.1"
-tiktoken = ">=0.7.0"
 timeout-decorator = ">=0.5.0"
 pytest-timeout = ">=2.1.0"
 tomlkit = ">=0.11.7"
