From 45cbb2341e9da718752b78ba5da7d1c107031b77 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma <codelion@users.noreply.github.com>
Date: Wed, 20 Nov 2024 15:24:13 +0800
Subject: [PATCH 1/7] Update optillm.py

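Add the plugin approaches to the allowed values of --approach, and call
load_plugins() before parse_args() so the plugins are registered by the
time the argument choices are built.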
---
 optillm.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/optillm.py b/optillm.py
index b7f0d35a..d78353e5 100644
--- a/optillm.py
+++ b/optillm.py
@@ -522,7 +522,7 @@ def parse_args():
     # Define arguments and their corresponding environment variables
     args_env = [
         ("--optillm-api-key", "OPTILLM_API_KEY", str, "", "Optional API key for client authentication to optillm"),
-        ("--approach", "OPTILLM_APPROACH", str, "auto", "Inference approach to use", known_approaches),
+        ("--approach", "OPTILLM_APPROACH", str, "auto", "Inference approach to use", known_approaches + list(plugin_approaches.keys())),
         ("--mcts-simulations", "OPTILLM_SIMULATIONS", int, 2, "Number of MCTS simulations"),
         ("--mcts-exploration", "OPTILLM_EXPLORATION", float, 0.2, "Exploration weight for MCTS"),
         ("--mcts-depth", "OPTILLM_DEPTH", int, 1, "Simulation depth for MCTS"),
@@ -571,10 +571,10 @@ def parse_args():
 
 def main():
     global server_config
-    args = parse_args()
-
     # Call this function at the start of main()
     load_plugins()
+    args = parse_args()
+
     # Update server_config with all argument values
     server_config.update(vars(args))
 

From f18b9210debe54a9904e61bd0ea9305b2895b344 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma <codelion@users.noreply.github.com>
Date: Thu, 21 Nov 2024 12:11:34 +0800
Subject: [PATCH 2/7] add new chain of code approach

---
 optillm.py                    |   4 +
 optillm/plugins/coc_plugin.py | 162 ++++++++++++++++++++++++++++++++++
 2 files changed, 166 insertions(+)
 create mode 100644 optillm/plugins/coc_plugin.py

diff --git a/optillm.py b/optillm.py
index d78353e5..c8b90795 100644
--- a/optillm.py
+++ b/optillm.py
@@ -395,6 +395,10 @@ def proxy():
     model = data.get('model', server_config['model'])
 
     optillm_approach = data.get('optillm_approach', server_config['approach'])
+    logger.debug(data)
+    server_config['mcts_depth'] = data.get('mcts_depth', server_config['mcts_depth'])
+    server_config['mcts_exploration' ] = data.get('mcts_exploration', server_config['mcts_exploration'])
+    server_config['mcts_simulations'] = data.get('mcts_simulations', server_config['mcts_simulations'])
 
     system_prompt, initial_query, message_optillm_approach = parse_conversation(messages)
 
diff --git a/optillm/plugins/coc_plugin.py b/optillm/plugins/coc_plugin.py
new file mode 100644
index 00000000..003ddc62
--- /dev/null
+++ b/optillm/plugins/coc_plugin.py
@@ -0,0 +1,162 @@
+import re
+import logging
+from typing import Tuple, Dict, Any, List
+import ast
+import traceback
+
+logger = logging.getLogger(__name__)
+
+# Plugin identifier
+SLUG = "coc"
+
+# Prompts
+CHAIN_OF_CODE_PROMPT = '''
+You are an AI assistant that uses Chain of Code (CoC) approach to solve problems. Follow these steps:
+
+1. Write Python code that breaks down the problem into clear steps
+2. Each step should either be:
+   - Executable Python code that performs computations
+   - Pseudocode that you will simulate with natural language understanding
+3. Track program state after each line execution
+4. Return the final answer within the <output> tags
+
+Format your response using:
+```python
+[Your code here]
+```
+
+And track state after each line with:
+delta_state: {...}
+
+Finally provide output as:
+<output>
+[Your final answer]
+</output>
+'''
+
+STATE_SIMULATION_PROMPT = '''You are simulating the execution of Python code. 
+Given the current program state and a line of code, return ONLY a Python dictionary representing the new state variables.
+Do not include any other text, code blocks, or formatting - just the Python dict.
+
+For example:
+state = {'x': 5}
+code = "y = x + 3"
+You should return:
+{'y': 8}
+'''
+
+def extract_code_blocks(text: str) -> List[str]:
+    """Extract Python code blocks from text."""
+    pattern = r'```python\s*(.*?)\s*```'
+    matches = re.findall(pattern, text, re.DOTALL)
+    return [m.strip() for m in matches]
+
+def extract_output(text: str) -> str:
+    """Extract content from output tags."""
+    pattern = r'<output>(.*?)</output>'
+    match = re.search(pattern, text, re.DOTALL)
+    return match.group(1).strip() if match else text.strip()
+
+def extract_state_updates(text: str) -> List[Dict[str, Any]]:
+    """Extract state updates from delta_state markers."""
+    pattern = r'delta_state:\s*({.*?})'
+    matches = re.findall(pattern, text, re.DOTALL)
+    states = []
+    for m in matches:
+        try:
+            # Clean up the state string before evaluation
+            cleaned = re.sub(r'```python\s*|\s*```', '', m)
+            state = ast.literal_eval(cleaned)
+            states.append(state)
+        except:
+            logger.warning(f"Could not parse state update: {m}")
+    return states
+
+def clean_state_response(response: str) -> str:
+    """Clean up LM state response to get just the dictionary."""
+    # Remove any code blocks
+    response = re.sub(r'```python\s*|\s*```', '', response)
+    # Remove any natural language before or after the dict
+    response = re.sub(r'^[^{]*', '', response)
+    response = re.sub(r'[^}]*$', '', response)
+    return response.strip()
+
+def execute_line(line: str, state: Dict[str, Any], client, model: str) -> Tuple[Any, Dict[str, Any]]:
+    """Execute a single line of code, either with Python or LM simulation."""
+    try:
+        # Try executing with Python
+        # Create a copy of state for local execution
+        local_state = state.copy()
+        exec(line, globals(), local_state)
+        # Extract any new/modified variables
+        new_state = {k:v for k,v in local_state.items() 
+                    if k not in state or state[k] != v}
+        return None, new_state
+    except Exception as e:
+        # If Python execution fails, simulate with LM
+        context = f"Current program state: {state}\nExecute line: {line}"
+        response = client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": STATE_SIMULATION_PROMPT},
+                {"role": "user", "content": context}
+            ],
+            temperature=0.2
+        )
+        try:
+            # Clean and parse LM response
+            cleaned_response = clean_state_response(response.choices[0].message.content)
+            new_state = ast.literal_eval(cleaned_response)
+            return response.usage.completion_tokens, new_state
+        except Exception as e:
+            logger.error(f"Could not parse LM state response: {response.choices[0].message.content}")
+            logger.error(f"Error: {str(e)}")
+            logger.error(f"Cleaned response: {cleaned_response}")
+            return response.usage.completion_tokens, {}
+
+def run(system_prompt: str, initial_query: str, client, model: str) -> Tuple[str, int]:
+    """Main Chain of Code execution function."""
+    # Generate initial code solution
+    messages = [
+        {"role": "system", "content": system_prompt + "\n" + CHAIN_OF_CODE_PROMPT},
+        {"role": "user", "content": initial_query}
+    ]
+    
+    response = client.chat.completions.create(
+        model=model,
+        messages=messages,
+        temperature=0.7
+    )
+    initial_response = response.choices[0].message.content
+    total_tokens = response.usage.completion_tokens
+
+    # Extract code blocks
+    code_blocks = extract_code_blocks(initial_response)
+    if not code_blocks:
+        logger.warning("No code blocks found in response")
+        return initial_response, total_tokens
+
+    # Execute code blocks line by line
+    final_state = {}
+    code = code_blocks[0]  # Take first code block
+    
+    # Split into lines and filter empty lines
+    lines = [line.strip() for line in code.split('\n') if line.strip()]
+    
+    for line in lines:
+        if not line or line.startswith('#'):
+            continue
+            
+        tokens, new_state = execute_line(line, final_state, client, model)
+        if tokens:
+            total_tokens += tokens
+        final_state.update(new_state)
+        logger.debug(f"Executed line: {line}")
+        logger.debug(f"New state: {new_state}")
+
+    # Extract output tags from the initial response, or use answer from state
+    final_answer = extract_output(initial_response)
+    if not final_answer and 'answer' in final_state:
+        final_answer = str(final_state['answer'])
+
+    return final_answer, total_tokens
\ No newline at end of file

From 1d4bad4518d0711f629d4e631ce5ee7013d6e5bf Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma <codelion@users.noreply.github.com>
Date: Thu, 21 Nov 2024 16:07:35 +0800
Subject: [PATCH 3/7] Update coc_plugin.py

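Fix the CoC plugin: restrict imports to an allow-list of modules and add
a more lenient state-dict parser that falls back to manual parsing when
ast.literal_eval fails.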
---
 optillm/plugins/coc_plugin.py | 81 +++++++++++++++++++++++++++++------
 1 file changed, 68 insertions(+), 13 deletions(-)

diff --git a/optillm/plugins/coc_plugin.py b/optillm/plugins/coc_plugin.py
index 003ddc62..a3df0c50 100644
--- a/optillm/plugins/coc_plugin.py
+++ b/optillm/plugins/coc_plugin.py
@@ -3,12 +3,19 @@
 from typing import Tuple, Dict, Any, List
 import ast
 import traceback
+import math
+import importlib
 
 logger = logging.getLogger(__name__)
 
 # Plugin identifier
 SLUG = "coc"
 
+# List of allowed modules for execution
+ALLOWED_MODULES = {
+    'math': math,
+}
+
 # Prompts
 CHAIN_OF_CODE_PROMPT = '''
 You are an AI assistant that uses Chain of Code (CoC) approach to solve problems. Follow these steps:
@@ -36,13 +43,18 @@
 
 STATE_SIMULATION_PROMPT = '''You are simulating the execution of Python code. 
 Given the current program state and a line of code, return ONLY a Python dictionary representing the new state variables.
-Do not include any other text, code blocks, or formatting - just the Python dict.
+For module imports and references, return string representations.
+
+Return ONLY primitive types (numbers, strings, lists, dicts) - no module references or complex objects.
 
 For example:
 state = {'x': 5}
 code = "y = x + 3"
-You should return:
-{'y': 8}
+Return: {'y': 8}
+
+For modules:
+code = "import math"
+Return: {'math': 'module:math'}
 '''
 
 def extract_code_blocks(text: str) -> List[str]:
@@ -64,9 +76,8 @@ def extract_state_updates(text: str) -> List[Dict[str, Any]]:
     states = []
     for m in matches:
         try:
-            # Clean up the state string before evaluation
             cleaned = re.sub(r'```python\s*|\s*```', '', m)
-            state = ast.literal_eval(cleaned)
+            state = parse_state_dict(cleaned)
             states.append(state)
         except:
             logger.warning(f"Could not parse state update: {m}")
@@ -81,17 +92,67 @@ def clean_state_response(response: str) -> str:
     response = re.sub(r'[^}]*$', '', response)
     return response.strip()
 
+def parse_state_dict(state_str: str) -> Dict[str, Any]:
+    """Safely parse state dictionary, handling module references."""
+    try:
+        # First try direct evaluation
+        return ast.literal_eval(state_str)
+    except:
+        # If that fails, try to parse manually
+        state_dict = {}
+        try:
+            # Use a custom safe eval that handles module references
+            # Remove brackets and split by commas
+            items = state_str.strip('{}').split(',')
+            for item in items:
+                if ':' not in item:
+                    continue
+                key, value = item.split(':', 1)
+                key = key.strip().strip("'").strip('"')
+                value = value.strip()
+                
+                # Handle module references
+                if 'module' in value:
+                    module_name = value.split("'")[1] if "'" in value else value.split(':')[1].strip()
+                    if module_name in ALLOWED_MODULES:
+                        state_dict[key] = ALLOWED_MODULES[module_name]
+                # Handle normal values
+                else:
+                    try:
+                        state_dict[key] = ast.literal_eval(value)
+                    except:
+                        state_dict[key] = value.strip("'").strip('"')
+            return state_dict
+        except Exception as e:
+            logger.error(f"Failed to parse state dict: {state_str}")
+            logger.error(f"Error: {str(e)}")
+            return {}
+
 def execute_line(line: str, state: Dict[str, Any], client, model: str) -> Tuple[Any, Dict[str, Any]]:
     """Execute a single line of code, either with Python or LM simulation."""
     try:
+        # Handle imports specially
+        if line.startswith('import '):
+            module_name = line.split()[1]
+            if module_name in ALLOWED_MODULES:
+                return None, {module_name: ALLOWED_MODULES[module_name]}
+            else:
+                logger.warning(f"Skipping import of unauthorized module: {module_name}")
+                return None, {}
+
         # Try executing with Python
-        # Create a copy of state for local execution
         local_state = state.copy()
+        # Add allowed modules to local state
+        for mod_name, mod in ALLOWED_MODULES.items():
+            if mod_name in state:
+                local_state[mod_name] = mod
+                
         exec(line, globals(), local_state)
         # Extract any new/modified variables
         new_state = {k:v for k,v in local_state.items() 
                     if k not in state or state[k] != v}
         return None, new_state
+        
     except Exception as e:
         # If Python execution fails, simulate with LM
         context = f"Current program state: {state}\nExecute line: {line}"
@@ -104,9 +165,8 @@ def execute_line(line: str, state: Dict[str, Any], client, model: str) -> Tuple[
             temperature=0.2
         )
         try:
-            # Clean and parse LM response
             cleaned_response = clean_state_response(response.choices[0].message.content)
-            new_state = ast.literal_eval(cleaned_response)
+            new_state = parse_state_dict(cleaned_response)
             return response.usage.completion_tokens, new_state
         except Exception as e:
             logger.error(f"Could not parse LM state response: {response.choices[0].message.content}")
@@ -116,7 +176,6 @@ def execute_line(line: str, state: Dict[str, Any], client, model: str) -> Tuple[
 
 def run(system_prompt: str, initial_query: str, client, model: str) -> Tuple[str, int]:
     """Main Chain of Code execution function."""
-    # Generate initial code solution
     messages = [
         {"role": "system", "content": system_prompt + "\n" + CHAIN_OF_CODE_PROMPT},
         {"role": "user", "content": initial_query}
@@ -130,17 +189,14 @@ def run(system_prompt: str, initial_query: str, client, model: str) -> Tuple[str
     initial_response = response.choices[0].message.content
     total_tokens = response.usage.completion_tokens
 
-    # Extract code blocks
     code_blocks = extract_code_blocks(initial_response)
     if not code_blocks:
         logger.warning("No code blocks found in response")
         return initial_response, total_tokens
 
-    # Execute code blocks line by line
     final_state = {}
     code = code_blocks[0]  # Take first code block
     
-    # Split into lines and filter empty lines
     lines = [line.strip() for line in code.split('\n') if line.strip()]
     
     for line in lines:
@@ -154,7 +210,6 @@ def run(system_prompt: str, initial_query: str, client, model: str) -> Tuple[str
         logger.debug(f"Executed line: {line}")
         logger.debug(f"New state: {new_state}")
 
-    # Extract output tags from the initial response, or use answer from state
     final_answer = extract_output(initial_response)
     if not final_answer and 'answer' in final_state:
         final_answer = str(final_state['answer'])

From 2107637826b6d6509dd8956c62579240d60cd361 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma <codelion@users.noreply.github.com>
Date: Sun, 24 Nov 2024 11:47:33 +0800
Subject: [PATCH 4/7] Update coc_plugin.py

---
 optillm/plugins/coc_plugin.py | 203 ++++++++++++----------------------
 1 file changed, 71 insertions(+), 132 deletions(-)

diff --git a/optillm/plugins/coc_plugin.py b/optillm/plugins/coc_plugin.py
index a3df0c50..581baf98 100644
--- a/optillm/plugins/coc_plugin.py
+++ b/optillm/plugins/coc_plugin.py
@@ -5,6 +5,7 @@
 import traceback
 import math
 import importlib
+import json
 
 logger = logging.getLogger(__name__)
 
@@ -24,163 +25,105 @@
 2. Each step should either be:
    - Executable Python code that performs computations
    - Pseudocode that you will simulate with natural language understanding
-3. Track program state after each line execution
+3. Track final result in an 'answer' variable
 4. Return the final answer within the <output> tags
 
 Format your response using:
 ```python
-[Your code here]
+[Your complete Python program here]
 ```
 
-And track state after each line with:
-delta_state: {...}
-
 Finally provide output as:
 <output>
 [Your final answer]
 </output>
 '''
 
-STATE_SIMULATION_PROMPT = '''You are simulating the execution of Python code. 
-Given the current program state and a line of code, return ONLY a Python dictionary representing the new state variables.
-For module imports and references, return string representations.
-
-Return ONLY primitive types (numbers, strings, lists, dicts) - no module references or complex objects.
+STATE_SIMULATION_PROMPT = '''You are simulating the execution of a Python program.
+Given the code below, simulate its execution and return the final value that would be in the 'answer' variable.
+Return ONLY the final value, no explanations or additional text.
 
-For example:
-state = {'x': 5}
-code = "y = x + 3"
-Return: {'y': 8}
-
-For modules:
-code = "import math"
-Return: {'math': 'module:math'}
+Code to simulate:
+{code}
 '''
 
 def extract_code_blocks(text: str) -> List[str]:
     """Extract Python code blocks from text."""
     pattern = r'```python\s*(.*?)\s*```'
     matches = re.findall(pattern, text, re.DOTALL)
-    return [m.strip() for m in matches]
+    blocks = [m.strip() for m in matches]
+    logger.info(f"Extracted {len(blocks)} code blocks")
+    for i, block in enumerate(blocks):
+        logger.info(f"Code block {i+1}:\n{block}")
+    return blocks
 
 def extract_output(text: str) -> str:
     """Extract content from output tags."""
     pattern = r'<output>(.*?)</output>'
     match = re.search(pattern, text, re.DOTALL)
-    return match.group(1).strip() if match else text.strip()
-
-def extract_state_updates(text: str) -> List[Dict[str, Any]]:
-    """Extract state updates from delta_state markers."""
-    pattern = r'delta_state:\s*({.*?})'
-    matches = re.findall(pattern, text, re.DOTALL)
-    states = []
-    for m in matches:
-        try:
-            cleaned = re.sub(r'```python\s*|\s*```', '', m)
-            state = parse_state_dict(cleaned)
-            states.append(state)
-        except:
-            logger.warning(f"Could not parse state update: {m}")
-    return states
-
-def clean_state_response(response: str) -> str:
-    """Clean up LM state response to get just the dictionary."""
-    # Remove any code blocks
-    response = re.sub(r'```python\s*|\s*```', '', response)
-    # Remove any natural language before or after the dict
-    response = re.sub(r'^[^{]*', '', response)
-    response = re.sub(r'[^}]*$', '', response)
-    return response.strip()
-
-def parse_state_dict(state_str: str) -> Dict[str, Any]:
-    """Safely parse state dictionary, handling module references."""
-    try:
-        # First try direct evaluation
-        return ast.literal_eval(state_str)
-    except:
-        # If that fails, try to parse manually
-        state_dict = {}
-        try:
-            # Use a custom safe eval that handles module references
-            # Remove brackets and split by commas
-            items = state_str.strip('{}').split(',')
-            for item in items:
-                if ':' not in item:
-                    continue
-                key, value = item.split(':', 1)
-                key = key.strip().strip("'").strip('"')
-                value = value.strip()
-                
-                # Handle module references
-                if 'module' in value:
-                    module_name = value.split("'")[1] if "'" in value else value.split(':')[1].strip()
-                    if module_name in ALLOWED_MODULES:
-                        state_dict[key] = ALLOWED_MODULES[module_name]
-                # Handle normal values
-                else:
-                    try:
-                        state_dict[key] = ast.literal_eval(value)
-                    except:
-                        state_dict[key] = value.strip("'").strip('"')
-            return state_dict
-        except Exception as e:
-            logger.error(f"Failed to parse state dict: {state_str}")
-            logger.error(f"Error: {str(e)}")
-            return {}
-
-def execute_line(line: str, state: Dict[str, Any], client, model: str) -> Tuple[Any, Dict[str, Any]]:
-    """Execute a single line of code, either with Python or LM simulation."""
+    result = match.group(1).strip() if match else text.strip()
+    logger.info(f"Extracted output: {result}")
+    return result
+
+def execute_code(code: str, client, model: str) -> Tuple[Any, int]:
+    """Execute full code block either with Python or LM simulation."""
+    logger.info("Attempting to execute complete code block")
+    logger.info(f"Code:\n{code}")
+    
+    # Add imports
+    execution_env = {}
+    for mod_name, mod in ALLOWED_MODULES.items():
+        execution_env[mod_name] = mod
+    
     try:
-        # Handle imports specially
-        if line.startswith('import '):
-            module_name = line.split()[1]
-            if module_name in ALLOWED_MODULES:
-                return None, {module_name: ALLOWED_MODULES[module_name]}
-            else:
-                logger.warning(f"Skipping import of unauthorized module: {module_name}")
-                return None, {}
-
-        # Try executing with Python
-        local_state = state.copy()
-        # Add allowed modules to local state
-        for mod_name, mod in ALLOWED_MODULES.items():
-            if mod_name in state:
-                local_state[mod_name] = mod
-                
-        exec(line, globals(), local_state)
-        # Extract any new/modified variables
-        new_state = {k:v for k,v in local_state.items() 
-                    if k not in state or state[k] != v}
-        return None, new_state
+        # Try executing the complete code block with Python
+        logger.info("Attempting Python execution")
+        exec(code, execution_env)
+        answer = execution_env.get('answer')
+        logger.info(f"Python execution successful. Answer: {answer}")
+        return answer, 0
         
     except Exception as e:
+        logger.info(f"Python execution failed: {str(e)}")
+        logger.info("Falling back to LM simulation")
+        
         # If Python execution fails, simulate with LM
-        context = f"Current program state: {state}\nExecute line: {line}"
         response = client.chat.completions.create(
             model=model,
             messages=[
-                {"role": "system", "content": STATE_SIMULATION_PROMPT},
-                {"role": "user", "content": context}
+                {"role": "system", "content": STATE_SIMULATION_PROMPT.format(code=code)},
+                {"role": "user", "content": "Simulate this code and return the final value of 'answer'."}
             ],
             temperature=0.2
         )
+        
         try:
-            cleaned_response = clean_state_response(response.choices[0].message.content)
-            new_state = parse_state_dict(cleaned_response)
-            return response.usage.completion_tokens, new_state
+            answer = response.choices[0].message.content.strip()
+            logger.info(f"LM simulation successful. Answer: {answer}")
+            
+            # Try to convert to number if possible
+            try:
+                answer = ast.literal_eval(answer)
+            except:
+                pass
+                
+            return answer, response.usage.completion_tokens
+            
         except Exception as e:
-            logger.error(f"Could not parse LM state response: {response.choices[0].message.content}")
-            logger.error(f"Error: {str(e)}")
-            logger.error(f"Cleaned response: {cleaned_response}")
-            return response.usage.completion_tokens, {}
+            logger.error(f"Could not parse LM simulation response: {str(e)}")
+            return None, response.usage.completion_tokens
 
 def run(system_prompt: str, initial_query: str, client, model: str) -> Tuple[str, int]:
     """Main Chain of Code execution function."""
+    logger.info("Starting Chain of Code execution")
+    logger.info(f"Query: {initial_query}")
+    
     messages = [
         {"role": "system", "content": system_prompt + "\n" + CHAIN_OF_CODE_PROMPT},
         {"role": "user", "content": initial_query}
     ]
     
+    logger.info("Generating code solution")
     response = client.chat.completions.create(
         model=model,
         messages=messages,
@@ -188,30 +131,26 @@ def run(system_prompt: str, initial_query: str, client, model: str) -> Tuple[str
     )
     initial_response = response.choices[0].message.content
     total_tokens = response.usage.completion_tokens
+    
+    logger.info("Initial response from LM:")
+    logger.info(initial_response)
 
     code_blocks = extract_code_blocks(initial_response)
     if not code_blocks:
         logger.warning("No code blocks found in response")
         return initial_response, total_tokens
 
-    final_state = {}
-    code = code_blocks[0]  # Take first code block
-    
-    lines = [line.strip() for line in code.split('\n') if line.strip()]
-    
-    for line in lines:
-        if not line or line.startswith('#'):
-            continue
+    # Execute the complete code block
+    code = code_blocks[0]
+    answer, execution_tokens = execute_code(code, client, model)
+    total_tokens += execution_tokens
+
+    # If we got an answer from code execution, use it
+    if answer is not None:
+        final_answer = str(answer)
+    else:
+        # Fall back to output tags if code execution failed
+        final_answer = extract_output(initial_response)
             
-        tokens, new_state = execute_line(line, final_state, client, model)
-        if tokens:
-            total_tokens += tokens
-        final_state.update(new_state)
-        logger.debug(f"Executed line: {line}")
-        logger.debug(f"New state: {new_state}")
-
-    final_answer = extract_output(initial_response)
-    if not final_answer and 'answer' in final_state:
-        final_answer = str(final_state['answer'])
-
+    logger.info(f"Chain of Code execution completed. Final answer: {final_answer}")
     return final_answer, total_tokens
\ No newline at end of file

From 7276138307571a7f81335cdd2d05754dd3ca758b Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma <codelion@users.noreply.github.com>
Date: Sun, 24 Nov 2024 17:47:58 +0800
Subject: [PATCH 5/7] Update coc_plugin.py

---
 optillm/plugins/coc_plugin.py | 265 +++++++++++++++++++++++-----------
 1 file changed, 182 insertions(+), 83 deletions(-)

diff --git a/optillm/plugins/coc_plugin.py b/optillm/plugins/coc_plugin.py
index 581baf98..b575d2f0 100644
--- a/optillm/plugins/coc_plugin.py
+++ b/optillm/plugins/coc_plugin.py
@@ -12,39 +12,70 @@
 # Plugin identifier
 SLUG = "coc"
 
+# Maximum attempts to fix code
+MAX_FIX_ATTEMPTS = 3
+
 # List of allowed modules for execution
 ALLOWED_MODULES = {
     'math': math,
 }
 
-# Prompts
+# Initial code generation prompt
 CHAIN_OF_CODE_PROMPT = '''
-You are an AI assistant that uses Chain of Code (CoC) approach to solve problems. Follow these steps:
-
-1. Write Python code that breaks down the problem into clear steps
-2. Each step should either be:
-   - Executable Python code that performs computations
-   - Pseudocode that you will simulate with natural language understanding
-3. Track final result in an 'answer' variable
-4. Return the final answer within the <output> tags
+Write Python code to solve this problem. The code should:
+1. Break down the problem into clear computational steps
+2. Use standard Python features and math operations
+3. Store the final result in a variable named 'answer'
+4. Include error handling where appropriate
+5. Be complete and executable
 
 Format your response using:
 ```python
 [Your complete Python program here]
 ```
+'''
+
+# Code fix prompt
+CODE_FIX_PROMPT = '''
+The following Python code failed to execute. Fix the code to make it work.
+Original code:
+```python
+{code}
+```
 
-Finally provide output as:
-<output>
-[Your final answer]
-</output>
+Error encountered:
+{error}
+
+Please provide a complete, fixed version of the code that:
+1. Addresses the error message
+2. Maintains the same logic and approach
+3. Stores the final result in 'answer'
+4. Is complete and executable
+
+Return only the fixed code in a code block:
+```python
+[Your fixed code here]
+```
 '''
 
-STATE_SIMULATION_PROMPT = '''You are simulating the execution of a Python program.
-Given the code below, simulate its execution and return the final value that would be in the 'answer' variable.
-Return ONLY the final value, no explanations or additional text.
+# Simulation prompt
+SIMULATION_PROMPT = '''
+The following Python code could not be executed after several attempts. 
+Please simulate its execution and determine the final value that would be in the 'answer' variable.
 
 Code to simulate:
+```python
 {code}
+```
+
+Last error encountered:
+{error}
+
+Important:
+1. Follow the logic of the code exactly
+2. Perform all calculations carefully
+3. Return ONLY the final numeric or string value, no explanations
+4. If the code contains semantic functions (like text analysis), use your judgment to simulate them
 '''
 
 def extract_code_blocks(text: str) -> List[str]:
@@ -57,100 +88,168 @@ def extract_code_blocks(text: str) -> List[str]:
         logger.info(f"Code block {i+1}:\n{block}")
     return blocks
 
-def extract_output(text: str) -> str:
-    """Extract content from output tags."""
-    pattern = r'<output>(.*?)</output>'
-    match = re.search(pattern, text, re.DOTALL)
-    result = match.group(1).strip() if match else text.strip()
-    logger.info(f"Extracted output: {result}")
-    return result
-
-def execute_code(code: str, client, model: str) -> Tuple[Any, int]:
-    """Execute full code block either with Python or LM simulation."""
-    logger.info("Attempting to execute complete code block")
+def sanitize_code(code: str) -> str:
+    """Prepare code for execution by adding necessary imports and safety checks."""
+    # Add standard imports
+    imports = "\n".join(f"import {mod}" for mod in ALLOWED_MODULES)
+    
+    # Add safety wrapper
+    wrapper = f"""
+{imports}
+
+def safe_execute():
+    {code.replace('\n', '\n    ')}
+    return answer if 'answer' in locals() else None
+
+result = safe_execute()
+answer = result
+"""
+    return wrapper
+
+def execute_code(code: str) -> Tuple[Any, str]:
+    """Attempt to execute the code and return result or error."""
+    logger.info("Attempting to execute code")
     logger.info(f"Code:\n{code}")
     
-    # Add imports
     execution_env = {}
-    for mod_name, mod in ALLOWED_MODULES.items():
-        execution_env[mod_name] = mod
-    
     try:
-        # Try executing the complete code block with Python
-        logger.info("Attempting Python execution")
-        exec(code, execution_env)
+        sanitized_code = sanitize_code(code)
+        exec(sanitized_code, execution_env)
         answer = execution_env.get('answer')
-        logger.info(f"Python execution successful. Answer: {answer}")
-        return answer, 0
         
+        if answer is not None:
+            logger.info(f"Execution successful. Answer: {answer}")
+            return answer, None
+        else:
+            error = "Code executed but did not produce an answer"
+            logger.warning(error)
+            return None, error
+            
     except Exception as e:
-        logger.info(f"Python execution failed: {str(e)}")
-        logger.info("Falling back to LM simulation")
-        
-        # If Python execution fails, simulate with LM
-        response = client.chat.completions.create(
-            model=model,
-            messages=[
-                {"role": "system", "content": STATE_SIMULATION_PROMPT.format(code=code)},
-                {"role": "user", "content": "Simulate this code and return the final value of 'answer'."}
-            ],
-            temperature=0.2
-        )
-        
+        error = f"{type(e).__name__}: {str(e)}\n{traceback.format_exc()}"
+        logger.error(f"Execution failed: {error}")
+        return None, error
+
+def generate_fixed_code(original_code: str, error: str, client, model: str) -> Tuple[str, int]:
+    """Ask LLM to fix the broken code."""
+    logger.info("Requesting code fix from LLM")
+    logger.info(f"Original error: {error}")
+    
+    response = client.chat.completions.create(
+        model=model,
+        messages=[
+            {"role": "system", "content": CODE_FIX_PROMPT.format(
+                code=original_code, error=error)},
+            {"role": "user", "content": "Fix the code to make it work."}
+        ],
+        temperature=0.2
+    )
+    
+    fixed_code = response.choices[0].message.content
+    code_blocks = extract_code_blocks(fixed_code)
+    
+    if code_blocks:
+        logger.info("Received fixed code from LLM")
+        return code_blocks[0], response.usage.completion_tokens
+    else:
+        logger.warning("No code block found in LLM response")
+        return None, response.usage.completion_tokens
+
+def simulate_execution(code: str, error: str, client, model: str) -> Tuple[Any, int]:
+    """Ask LLM to simulate code execution."""
+    logger.info("Attempting code simulation with LLM")
+    
+    response = client.chat.completions.create(
+        model=model,
+        messages=[
+            {"role": "system", "content": SIMULATION_PROMPT.format(
+                code=code, error=error)},
+            {"role": "user", "content": "Simulate this code and return the final answer value."}
+        ],
+        temperature=0.2
+    )
+    
+    try:
+        result = response.choices[0].message.content.strip()
+        # Try to convert to appropriate type
         try:
-            answer = response.choices[0].message.content.strip()
-            logger.info(f"LM simulation successful. Answer: {answer}")
-            
-            # Try to convert to number if possible
-            try:
-                answer = ast.literal_eval(answer)
-            except:
-                pass
-                
-            return answer, response.usage.completion_tokens
-            
-        except Exception as e:
-            logger.error(f"Could not parse LM simulation response: {str(e)}")
-            return None, response.usage.completion_tokens
+            answer = ast.literal_eval(result)
+        except:
+            answer = result
+        logger.info(f"Simulation successful. Result: {answer}")
+        return answer, response.usage.completion_tokens
+    except Exception as e:
+        logger.error(f"Failed to parse simulation result: {str(e)}")
+        return None, response.usage.completion_tokens
 
 def run(system_prompt: str, initial_query: str, client, model: str) -> Tuple[str, int]:
     """Main Chain of Code execution function."""
     logger.info("Starting Chain of Code execution")
     logger.info(f"Query: {initial_query}")
     
+    # Initial code generation
     messages = [
         {"role": "system", "content": system_prompt + "\n" + CHAIN_OF_CODE_PROMPT},
         {"role": "user", "content": initial_query}
     ]
     
-    logger.info("Generating code solution")
     response = client.chat.completions.create(
         model=model,
         messages=messages,
         temperature=0.7
     )
-    initial_response = response.choices[0].message.content
     total_tokens = response.usage.completion_tokens
     
-    logger.info("Initial response from LM:")
-    logger.info(initial_response)
-
-    code_blocks = extract_code_blocks(initial_response)
+    # Extract initial code
+    code_blocks = extract_code_blocks(response.choices[0].message.content)
     if not code_blocks:
         logger.warning("No code blocks found in response")
-        return initial_response, total_tokens
-
-    # Execute the complete code block
-    code = code_blocks[0]
-    answer, execution_tokens = execute_code(code, client, model)
-    total_tokens += execution_tokens
+        return response.choices[0].message.content, total_tokens
 
-    # If we got an answer from code execution, use it
-    if answer is not None:
-        final_answer = str(answer)
-    else:
-        # Fall back to output tags if code execution failed
-        final_answer = extract_output(initial_response)
+    current_code = code_blocks[0]
+    fix_attempts = 0
+    last_error = None
+    
+    # Strategy 1: Direct execution and fix attempts
+    while fix_attempts < MAX_FIX_ATTEMPTS:
+        fix_attempts += 1
+        logger.info(f"Execution attempt {fix_attempts}/{MAX_FIX_ATTEMPTS}")
+        
+        # Try to execute current code
+        answer, error = execute_code(current_code)
+        
+        # If successful, return the answer
+        if error is None:
+            logger.info(f"Successful execution on attempt {fix_attempts}")
+            return str(answer), total_tokens
             
-    logger.info(f"Chain of Code execution completed. Final answer: {final_answer}")
-    return final_answer, total_tokens
\ No newline at end of file
+        last_error = error
+        
+        # If we hit max attempts, break to try simulation
+        if fix_attempts >= MAX_FIX_ATTEMPTS:
+            logger.warning(f"Failed after {fix_attempts} fix attempts")
+            break
+            
+        # Otherwise, try to get fixed code from LLM
+        logger.info(f"Requesting code fix, attempt {fix_attempts}")
+        fixed_code, fix_tokens = generate_fixed_code(current_code, error, client, model)
+        total_tokens += fix_tokens
+        
+        if fixed_code:
+            current_code = fixed_code
+        else:
+            logger.error("Failed to get fixed code from LLM")
+            break
+    
+    # Strategy 2: If all execution attempts failed, try simulation
+    logger.info("All execution attempts failed, trying simulation")
+    simulated_answer, sim_tokens = simulate_execution(current_code, last_error, client, model)
+    total_tokens += sim_tokens
+    
+    if simulated_answer is not None:
+        logger.info("Successfully got answer from simulation")
+        return str(simulated_answer), total_tokens
+    
+    # If we get here, everything failed
+    logger.warning("All strategies failed")
+    return f"Error: Could not solve problem after all attempts. Last error: {last_error}", total_tokens
\ No newline at end of file

From 3d4f97094373c023b68af1cde7ddcaf74dc3cb92 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma <codelion@users.noreply.github.com>
Date: Sun, 24 Nov 2024 22:58:29 +0800
Subject: [PATCH 6/7] Update coc_plugin.py

fix visualization error
---
 optillm/plugins/coc_plugin.py | 50 +++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 17 deletions(-)

diff --git a/optillm/plugins/coc_plugin.py b/optillm/plugins/coc_plugin.py
index b575d2f0..62ef5ce6 100644
--- a/optillm/plugins/coc_plugin.py
+++ b/optillm/plugins/coc_plugin.py
@@ -18,6 +18,7 @@
 # List of allowed modules for execution
 ALLOWED_MODULES = {
     'math': math,
+    'numpy': 'numpy',  # String indicates module should be imported in execution context
 }
 
 # Initial code generation prompt
@@ -60,22 +61,21 @@
 
 # Simulation prompt
 SIMULATION_PROMPT = '''
-The following Python code could not be executed after several attempts. 
-Please simulate its execution and determine the final value that would be in the 'answer' variable.
+The following Python code could not be executed directly. Analyze the code and determine what the answer would be.
+Pay special attention to:
+1. The core computational logic, ignoring any visualization or display code
+2. The key mathematical operations that determine the final answer
+3. Any logic that affects the 'answer' variable
 
-Code to simulate:
+Code to analyze:
 ```python
 {code}
 ```
 
-Last error encountered:
+Runtime error encountered:
 {error}
 
-Important:
-1. Follow the logic of the code exactly
-2. Perform all calculations carefully
-3. Return ONLY the final numeric or string value, no explanations
-4. If the code contains semantic functions (like text analysis), use your judgment to simulate them
+Return ONLY the final value that would be in the 'answer' variable. Return just the value, no explanations.
 '''
 
 def extract_code_blocks(text: str) -> List[str]:
@@ -93,12 +93,25 @@ def sanitize_code(code: str) -> str:
     # Add standard imports
     imports = "\n".join(f"import {mod}" for mod in ALLOWED_MODULES)
     
+    # Remove or modify problematic visualization code
+    lines = code.split('\n')
+    safe_lines = []
+    for line in lines:
+        # Skip matplotlib-related imports and plotting commands
+        if any(x in line.lower() for x in ['matplotlib', 'plt.', '.plot(', '.show(', 'figure', 'subplot']):
+            continue
+        # Keep the line if it's not visualization-related
+        safe_lines.append(line)
+    
+    safe_code = '\n'.join(safe_lines)
+    
     # Add safety wrapper
     wrapper = f"""
 {imports}
 
 def safe_execute():
-    {code.replace('\n', '\n    ')}
+    import numpy as np  # Always allow numpy
+    {safe_code.replace('\n', '\n    ')}
     return answer if 'answer' in locals() else None
 
 result = safe_execute()
@@ -111,22 +124,25 @@ def execute_code(code: str) -> Tuple[Any, str]:
     logger.info("Attempting to execute code")
     logger.info(f"Code:\n{code}")
     
-    execution_env = {}
     try:
-        sanitized_code = sanitize_code(code)
-        exec(sanitized_code, execution_env)
-        answer = execution_env.get('answer')
+        # Create a clean environment
+        execution_env = {}
+        
+        # Execute the code as-is
+        exec(code, execution_env)
         
-        if answer is not None:
+        # Look for answer variable
+        if 'answer' in execution_env:
+            answer = execution_env['answer']
             logger.info(f"Execution successful. Answer: {answer}")
             return answer, None
         else:
-            error = "Code executed but did not produce an answer"
+            error = "Code executed but did not produce an answer variable"
             logger.warning(error)
             return None, error
             
     except Exception as e:
-        error = f"{type(e).__name__}: {str(e)}\n{traceback.format_exc()}"
+        error = str(e)
         logger.error(f"Execution failed: {error}")
         return None, error
 

From 90bef2e7c618c24c72f7abd6be793d62e41b4404 Mon Sep 17 00:00:00 2001
From: Asankhaya Sharma <codelion@users.noreply.github.com>
Date: Mon, 25 Nov 2024 12:34:01 +0800
Subject: [PATCH 7/7] Update README.md

---
 README.md | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/README.md b/README.md
index c7354e99..818036d5 100644
--- a/README.md
+++ b/README.md
@@ -218,6 +218,7 @@ response = client.chat.completions.create(
 | Plugin                  | Slug               | Description                                                                                    |
 | ----------------------- | ------------------ | ---------------------------------------------------------------------------------------------- |
 | Router                  | `router`           | Uses the [optillm-bert-uncased](https://huggingface.co/codelion/optillm-bert-uncased) model to route requests to different approaches based on the user prompt |
+| Chain-of-Code           | `coc`              | Implements a chain of code approach that combines CoT with code execution and LLM-based code simulation |
 | Memory                  | `memory`           | Implements a short term memory layer, enables you to use unbounded context length with any LLM |
 | Privacy                 | `privacy`          | Anonymize PII data in request and deanonymize it back to original value in response            |
 | Read URLs               | `readurls`         | Reads all URLs found in the request, fetches the content at the URL and adds it to the context |
@@ -290,6 +291,20 @@ Authorization: Bearer your_secret_api_key
 ```
 ## SOTA results on benchmarks with optillm
 
+### coc-claude-3-5-sonnet-20241022 on AIME 2024 pass@1 (Nov 2024)
+
+| Model | Score |
+|-------|-----:|
+| o1-mini | 56.67 |
+| coc-claude-3-5-sonnet-20241022 | 46.67 |
+| coc-gemini/gemini-exp-1121 | 46.67 |
+| o1-preview | 40.00 |
+| f1-preview | 40.00 |
+| gemini-exp-1114 | 36.67 |
+| claude-3-5-sonnet-20241022 | 20.00 |
+| gemini-1.5-pro-002 | 20.00 |
+| gemini-1.5-flash-002 | 16.67 |
+
 ### readurls&memory-gpt-4o-mini on Google FRAMES Benchmark (Oct 2024)
 | Model | Accuracy | 
 | ----- | -------- |
@@ -324,6 +339,7 @@ called patchflows. We saw huge performance gains across all the supported patchf
 
 ## References
 
+- [Chain of Code: Reasoning with a Language Model-Augmented Code Emulator](https://arxiv.org/abs/2312.04474) - [Implementation](https://github.com/codelion/optillm/blob/main/optillm/plugins/coc_plugin.py)
 - [Entropy Based Sampling and Parallel CoT Decoding](https://github.com/xjdr-alt/entropix) - [Implementation](https://github.com/codelion/optillm/blob/main/optillm/entropy_decoding.py)
 - [Fact, Fetch, and Reason: A Unified Evaluation of Retrieval-Augmented Generation](https://arxiv.org/abs/2409.12941) - [Evaluation script](https://github.com/codelion/optillm/blob/main/scripts/eval_frames_benchmark.py)
 - [Writing in the Margins: Better Inference Pattern for Long Context Retrieval](https://www.arxiv.org/abs/2408.14906) - [Inspired the implementation of the memory plugin](https://github.com/codelion/optillm/blob/main/optillm/plugins/memory_plugin.py)