From ecf14c9c54b84bbf9a67c2564521bbe80f8cf688 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Wed, 25 Sep 2024 07:57:15 -0700 Subject: [PATCH 1/9] Create wim.py --- optillm/wim.py | 107 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 optillm/wim.py diff --git a/optillm/wim.py b/optillm/wim.py new file mode 100644 index 00000000..69c5a84c --- /dev/null +++ b/optillm/wim.py @@ -0,0 +1,107 @@ +from collections import deque +import tiktoken + +class WiMInfiniteContextAPI: + def __init__(self, system_prompt, client, model, max_context_tokens=8192, max_margins=50, chunk_size=2000): + self.model = model + self.max_context_tokens = max_context_tokens + self.max_margins = max_margins + self.chunk_size = chunk_size + self.context_buffer = deque() + self.margins = deque(maxlen=max_margins) + self.tokenizer = tiktoken.encoding_for_model(model) + self.system_message = system_prompt + self.client = client + + def count_tokens(self, text): + return len(self.tokenizer.encode(text)) + + def trim_context_buffer(self): + while self.count_tokens("".join(self.context_buffer)) > self.max_context_tokens: + self.context_buffer.popleft() + + def generate_margin(self, chunk, query): + messages = [ + {"role": "system", "content": self.system_message}, + {"role": "user", "content": f""" +'''text +{chunk} +''' +Copy over all context relevant to the query: {query} +Provide the answer in the format: #. +Here are rules: +- If you don't know how to answer the query - start your answer with NO# +- If the text is not related to the query - start your answer with NO# +- If you can extract relevant information - start your answer with YES# +- If the text does not mention the person by name - start your answer with NO# +Example answers: +- YES#Western philosophy originated in Ancient Greece in the 6th century BCE with the pre-Socratics. +- NO#No relevant context. +"""} + ] + response = self.client.ChatCompletion.create( + model=self.model, + messages=messages + ) + return response.choices[0].message['content'] + + def classify_margin(self, margin): + return margin.startswith("YES#") + + def process_chunk(self, chunk, query): + self.context_buffer.append(chunk) + self.trim_context_buffer() + margin = self.generate_margin(chunk, query) + if self.classify_margin(margin): + self.margins.append(margin.split("#", 1)[1]) + + def process_stream(self, text_stream, query): + for chunk in text_stream: + self.process_chunk(chunk, query) + + def generate_final_answer(self, query): + context = "".join(self.context_buffer) + margins = "\n".join(self.margins) + messages = [ + {"role": "system", "content": self.system_message}, + {"role": "user", "content": f""" +'''text +{context} +''' +I asked my assistant to read and analyse the above content page by page to help you complete this task. These are margin notes left on each page: +'''text +{margins} +''' +Read again the note(s) and the provided content, take a deep breath and answer the query. 
+{self.instruction} +{query} +"""} + ] + response = self.client.ChatCompletion.create( + model=self.model, + messages=messages + ) + return response.choices[0].message['content'] + + def run(self, text_stream, query): + self.process_stream(text_stream, query) + return self.generate_final_answer(query) + + @property + def instruction(self): + return "Answer the following question based on the provided context and margin notes:" + +# Usage +def text_stream_generator(text, chunk_size): + for i in range(0, len(text), chunk_size): + yield text[i:i+chunk_size] + +api_key = "your-api-key-here" +wim = WiMInfiniteContextAPI(api_key) + +text = "Very long text..." # Your infinite context here +query = "What is the main topic?" + +text_stream = text_stream_generator(text, wim.chunk_size) +final_answer = wim.run(text_stream, query) +print(final_answer) From 6f9032c3d5e6ba82e00e46861750dd96e006d878 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Wed, 25 Sep 2024 15:15:00 -0700 Subject: [PATCH 2/9] init --- optillm.py | 6 +++++- optillm/wim.py | 55 ++++++++++++++++++++++++++++++------------------ requirements.txt | 1 + 3 files changed, 41 insertions(+), 21 deletions(-) diff --git a/optillm.py b/optillm.py index 241bd0a6..d54ab98a 100644 --- a/optillm.py +++ b/optillm.py @@ -18,6 +18,7 @@ from optillm.plansearch import plansearch from optillm.leap import leap from optillm.reread import re2_approach +from optillm.wim import WiMInfiniteContextAPI # Setup logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') @@ -72,7 +73,7 @@ # List of known approaches known_approaches = ["mcts", "bon", "moa", "rto", "z3", "self_consistency", "pvg", "rstar", - "cot_reflection", "plansearch", "leap", "re2"] + "cot_reflection", "plansearch", "leap", "re2", "wim"] # Optional API key configuration to secure the proxy @app.before_request @@ -155,6 +156,9 @@ def proxy(): final_response, completion_tokens = leap(system_prompt, initial_query, client, model) elif approach == 're2': final_response, completion_tokens = re2_approach(system_prompt, initial_query, client, model, n=n) + elif approach == "wim": + wim_context = WiMInfiniteContextAPI(system_prompt, client, model, max_context_tokens=64000, max_margins=10, chunk_size=16000) + final_response, completion_tokens = wim_context.process_query(initial_query) else: raise ValueError(f"Unknown approach: {approach}") except Exception as e: diff --git a/optillm/wim.py b/optillm/wim.py index 69c5a84c..56596a08 100644 --- a/optillm/wim.py +++ b/optillm/wim.py @@ -1,17 +1,22 @@ from collections import deque import tiktoken +import re class WiMInfiniteContextAPI: - def __init__(self, system_prompt, client, model, max_context_tokens=8192, max_margins=50, chunk_size=2000): + def __init__(self, system_prompt, client, model, max_context_tokens=64000, max_margins=10, chunk_size=16000): self.model = model self.max_context_tokens = max_context_tokens self.max_margins = max_margins self.chunk_size = chunk_size self.context_buffer = deque() self.margins = deque(maxlen=max_margins) - self.tokenizer = tiktoken.encoding_for_model(model) + try: + self.tokenizer = tiktoken.encoding_for_model(model) + except: + self.tokenizer = tiktoken.get_encoding("o200k_base") self.system_message = system_prompt self.client = client + self.win_completion_tokens = 0 def count_tokens(self, text): return len(self.tokenizer.encode(text)) @@ -39,14 +44,27 @@ def generate_margin(self, chunk, query): - NO#No relevant context. 
"""} ] - response = self.client.ChatCompletion.create( + response = self.client.chat.completions.create( model=self.model, - messages=messages + messages=messages, + max_tokens = 512 ) - return response.choices[0].message['content'] + self.win_completion_tokens += response.usage.completion_tokens + return response.choices[0].message.content def classify_margin(self, margin): return margin.startswith("YES#") + + def extract_query(self, text): + # Split the text into sentences + sentences = re.split(r'(?<=[.!?])\s+', text) + + # Check if the last sentence starts with "Query:" + if sentences[-1].startswith("Query:"): + return sentences[-1][6:].strip(), "".join(sentences[:-1]) + + # If not, assume the last sentence is the query + return sentences[-1].strip(), "".join(sentences[:-1]) def process_chunk(self, chunk, query): self.context_buffer.append(chunk) @@ -77,11 +95,12 @@ def generate_final_answer(self, query): {query} """} ] - response = self.client.ChatCompletion.create( + response = self.client.chat.completions.create( model=self.model, messages=messages ) - return response.choices[0].message['content'] + self.win_completion_tokens += response.usage.completion_tokens + return response.choices[0].message.content def run(self, text_stream, query): self.process_stream(text_stream, query) @@ -91,17 +110,13 @@ def run(self, text_stream, query): def instruction(self): return "Answer the following question based on the provided context and margin notes:" -# Usage -def text_stream_generator(text, chunk_size): - for i in range(0, len(text), chunk_size): - yield text[i:i+chunk_size] - -api_key = "your-api-key-here" -wim = WiMInfiniteContextAPI(api_key) - -text = "Very long text..." # Your infinite context here -query = "What is the main topic?" + # Usage + def text_stream_generator(self, text): + for i in range(0, len(text), self.chunk_size): + yield text[i:i+self.chunk_size] -text_stream = text_stream_generator(text, wim.chunk_size) -final_answer = wim.run(text_stream, query) -print(final_answer) + def process_query(self, initial_query): + query, context = self.extract_query(initial_query) + text_stream = self.text_stream_generator(context) + final_answer = self.run(text_stream, query) + return final_answer, self.win_completion_tokens \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index f771d90d..a6e431ec 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ flask torch transformers azure.identity +tiktoken From b4f6defc0912c5ed3891ce866c514d88b368546d Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Mon, 30 Sep 2024 02:00:24 -0700 Subject: [PATCH 3/9] add plugins system --- optillm.py | 27 ++++++-- optillm/plugins/memory_plugin.py | 103 +++++++++++++++++++++++++++++++ requirements.txt | 1 + test_cases.json | 5 ++ 4 files changed, 132 insertions(+), 4 deletions(-) create mode 100644 optillm/plugins/memory_plugin.py diff --git a/optillm.py b/optillm.py index 31a7d878..4d95aefd 100644 --- a/optillm.py +++ b/optillm.py @@ -6,6 +6,8 @@ from openai import AzureOpenAI, OpenAI from flask import Response import json +import importlib +import glob # Import approach modules from optillm.mcts import chat_with_mcts @@ -20,7 +22,6 @@ from optillm.plansearch import plansearch from optillm.leap import leap from optillm.reread import re2_approach -from optillm.wim import WiMInfiniteContextAPI # Setup logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') @@ -77,6 +78,23 @@ known_approaches = ["mcts", "bon", "moa", 
"rto", "z3", "self_consistency", "pvg", "rstar", "cot_reflection", "plansearch", "leap", "re2", "wim"] +plugin_approaches = {} + +def load_plugins(): + plugin_dir = os.path.join(os.path.dirname(__file__), 'plugins') + plugin_files = glob.glob(os.path.join(plugin_dir, '*.py')) + + for plugin_file in plugin_files: + module_name = os.path.basename(plugin_file)[:-3] # Remove .py extension + spec = importlib.util.spec_from_file_location(module_name, plugin_file) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + if hasattr(module, 'SLUG') and hasattr(module, 'run'): + plugin_approaches[module.SLUG] = module.run + logger.info(f"Loaded plugin: {module.SLUG}") + + def generate_streaming_response(final_response, model): # Yield the final response if isinstance(final_response, list): @@ -191,9 +209,8 @@ def proxy(): final_response, completion_tokens = leap(system_prompt, initial_query, client, model) elif approach == 're2': final_response, completion_tokens = re2_approach(system_prompt, initial_query, client, model, n=n) - elif approach == "wim": - wim_context = WiMInfiniteContextAPI(system_prompt, client, model, max_context_tokens=64000, max_margins=10, chunk_size=16000) - final_response, completion_tokens = wim_context.process_query(initial_query) + elif approach in plugin_approaches: + final_response, completion_tokens = plugin_approaches[approach](system_prompt, initial_query, client, model) else: raise ValueError(f"Unknown approach: {approach}") except Exception as e: @@ -320,6 +337,8 @@ def main(): logger.error(f"Please set the OPENAI_API_KEY environment variable before using the proxy") exit(1) + # Call this function at the start of main() + load_plugins() # Update server_config with all argument values server_config.update(vars(args)) diff --git a/optillm/plugins/memory_plugin.py b/optillm/plugins/memory_plugin.py new file mode 100644 index 00000000..6525dc19 --- /dev/null +++ b/optillm/plugins/memory_plugin.py @@ -0,0 +1,103 @@ +import re +from typing import Tuple, List +import numpy as np +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.metrics.pairwise import cosine_similarity + +SLUG = "memory" + +class Memory: + def __init__(self, max_size: int = 1000): + self.max_size = max_size + self.items: List[str] = [] + self.vectorizer = TfidfVectorizer() + self.vectors = None + self.completion_tokens = 0 + + def add(self, item: str): + if len(self.items) >= self.max_size: + self.items.pop(0) + self.items.append(item) + self.vectors = None # Reset vectors to force recalculation + + def get_relevant(self, query: str, n: int = 5) -> List[str]: + if not self.items: + return [] + + if self.vectors is None: + self.vectors = self.vectorizer.fit_transform(self.items) + + query_vector = self.vectorizer.transform([query]) + similarities = cosine_similarity(query_vector, self.vectors).flatten() + top_indices = similarities.argsort()[-n:][::-1] + + return [self.items[i] for i in top_indices] + +def extract_query(text: str) -> Tuple[str, str]: + query_index = text.rfind("Query:") + + if query_index != -1: + context = text[:query_index].strip() + query = text[query_index + 6:].strip() + else: + sentences = re.split(r'(?<=[.!?])\s+', text.strip()) + if len(sentences) > 1: + context = ' '.join(sentences[:-1]) + query = sentences[-1] + else: + context = text + query = "What is the main point of this text?" 
+ return query, context + +def extract_key_information(text: str, client, model: str) -> List[str]: + prompt = f"""Extract key information from the following text. Provide a list of important facts or concepts, each on a new line: + +{text} + +Key information:""" + + response = client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": prompt}], + max_tokens=150 + ) + + key_info = response.choices[0].message.content.strip().split('\n') + + return [info.strip('- ') for info in key_info if info.strip()], response.usage.completion_tokens + +def run(system_prompt: str, initial_query: str, client, model: str) -> Tuple[str, int]: + memory = Memory() + query, context = extract_query(initial_query) + completion_tokens = 0 + + # Process context and add to memory + chunk_size = 1000 + for i in range(0, len(context), chunk_size): + chunk = context[i:i+chunk_size] + key_info, tokens = extract_key_information(chunk, client, model) + completion_tokens += tokens + for info in key_info: + memory.add(info) + + # Retrieve relevant information from memory + relevant_info = memory.get_relevant(query) + + # Generate response using relevant information + prompt = f"""System: {system_prompt} + +Context: {' '.join(relevant_info)} + +{query} +""" + + response = client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": prompt}], + max_tokens=500 + ) + + final_response = response.choices[0].message.content.strip() + completion_tokens += response.usage.completion_tokens + + return final_response, completion_tokens \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index a6e431ec..c10dbe97 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ torch transformers azure.identity tiktoken +scikit-learn \ No newline at end of file diff --git a/test_cases.json b/test_cases.json index a9c8df7a..fadf3e08 100644 --- a/test_cases.json +++ b/test_cases.json @@ -28,5 +28,10 @@ "name" : "reddit", "system_prompt": "", "query" : "There are 24 volunteers. Over the next 3 weeks, each volunteer is assigned to a different task. There are 8 tasks. Each week, the volunteers switch tasks. Each task has 3 volunteers assigned to it. Volunteers cannot be assigned to the same task more than once, and volunteers cannot share the same task more than once." 
+ }, + { + "name" : "GH", + "system_prompt" : "", + "query" : "Find the largest possible real part of[(75+117i)z+\frac{96+144i}{z}]where z is a complex number with |z|=4" } ] From e1794f72455817c4b2eb112b48213c06e4dfadd5 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Mon, 30 Sep 2024 02:24:23 -0700 Subject: [PATCH 4/9] add readurls plugin --- optillm.py | 4 +-- optillm/plugins/readurls_plugin.py | 47 ++++++++++++++++++++++++++++++ requirements.txt | 2 ++ setup.py | 5 ++++ 4 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 optillm/plugins/readurls_plugin.py diff --git a/optillm.py b/optillm.py index c0739044..c7f23925 100644 --- a/optillm.py +++ b/optillm.py @@ -84,9 +84,9 @@ plugin_approaches = {} def load_plugins(): - plugin_dir = os.path.join(os.path.dirname(__file__), 'plugins') + plugin_dir = os.path.join(os.path.dirname(__file__), 'optillm/plugins') plugin_files = glob.glob(os.path.join(plugin_dir, '*.py')) - + for plugin_file in plugin_files: module_name = os.path.basename(plugin_file)[:-3] # Remove .py extension spec = importlib.util.spec_from_file_location(module_name, plugin_file) diff --git a/optillm/plugins/readurls_plugin.py b/optillm/plugins/readurls_plugin.py new file mode 100644 index 00000000..74d97549 --- /dev/null +++ b/optillm/plugins/readurls_plugin.py @@ -0,0 +1,47 @@ +import re +from typing import Tuple, List +import requests +from bs4 import BeautifulSoup +from urllib.parse import urlparse + +SLUG = "url_content" + +def extract_urls(text: str) -> List[str]: + url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') + return url_pattern.findall(text) + +def fetch_webpage_content(url: str, max_length: int = 1000) -> str: + try: + response = requests.get(url, timeout=10) + response.raise_for_status() + soup = BeautifulSoup(response.content, 'html.parser') + + # Remove script and style elements + for script in soup(["script", "style"]): + script.decompose() + + # Get text content + text = soup.get_text() + + # Break into lines and remove leading and trailing space on each + lines = (line.strip() for line in text.splitlines()) + + # Break multi-headlines into a line each + chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) + + # Join the lines and truncate if necessary + content = ' '.join(chunk for chunk in chunks if chunk) + return content[:max_length] + ('...' 
if len(content) > max_length else '') + except Exception as e: + return f"Error fetching content: {str(e)}" + +def run(system_prompt, initial_query: str, client=None, model=None) -> Tuple[str, int]: + urls = extract_urls(initial_query) + modified_query = initial_query + + for url in urls: + content = fetch_webpage_content(url) + domain = urlparse(url).netloc + modified_query = modified_query.replace(url, f"{url} [Content from {domain}: {content}]") + + return modified_query, 0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index a57748f7..09885d7c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,5 @@ azure.identity tiktoken scikit-learn litellm +requests +beautifulsoup4 \ No newline at end of file diff --git a/setup.py b/setup.py index 61579c36..108548d1 100644 --- a/setup.py +++ b/setup.py @@ -14,6 +14,11 @@ "torch", "transformers", "azure-identity", + "tiktoken", + "scikit-learn", + "litellm", + "requests", + "beautifulsoup4", ], author="codelion", author_email="codelion@okyasoft.com", From 8efd43c1505fd5dd7cfb7993441ec8c4bfa95172 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Mon, 30 Sep 2024 03:58:01 -0700 Subject: [PATCH 5/9] fix bugs --- optillm.py | 148 +++++++++++++++++++++---------- scripts/eval_frames_benchmark.py | 130 +++++++++++++++++++++++++++ scripts/requirements.txt | 1 + 3 files changed, 232 insertions(+), 47 deletions(-) create mode 100644 scripts/eval_frames_benchmark.py create mode 100644 scripts/requirements.txt diff --git a/optillm.py b/optillm.py index c7f23925..4ab36a0b 100644 --- a/optillm.py +++ b/optillm.py @@ -8,6 +8,8 @@ import json import importlib import glob +import asyncio +from concurrent.futures import ThreadPoolExecutor # Import the LiteLLM wrapper from litellm_wrapper import LiteLLMWrapper @@ -97,6 +99,94 @@ def load_plugins(): plugin_approaches[module.SLUG] = module.run logger.info(f"Loaded plugin: {module.SLUG}") +def parse_combined_approach(model: str, known_approaches: list, plugin_approaches: dict): + if model == 'auto': + return 'SINGLE', ['bon'], model + + parts = model.split('-') + approaches = [] + operation = 'SINGLE' + model_parts = [] + parsing_approaches = True + + for part in parts: + if parsing_approaches: + if part in known_approaches or part in plugin_approaches: + approaches.append(part) + elif '&' in part: + operation = 'AND' + approaches.extend(part.split('&')) + elif '|' in part: + operation = 'OR' + approaches.extend(part.split('|')) + else: + parsing_approaches = False + model_parts.append(part) + else: + model_parts.append(part) + + if not approaches: + approaches = ['bon'] + operation = 'SINGLE' + + actual_model = '-'.join(model_parts) + + return operation, approaches, actual_model + +def execute_single_approach(approach, system_prompt, initial_query, client, model): + if approach in known_approaches: + # Execute known approaches + if approach == 'mcts': + return chat_with_mcts(system_prompt, initial_query, client, model, server_config['mcts_simulations'], + server_config['mcts_exploration'], server_config['mcts_depth']) + elif approach == 'bon': + return best_of_n_sampling(system_prompt, initial_query, client, model, server_config['best_of_n']) + elif approach == 'moa': + return mixture_of_agents(system_prompt, initial_query, client, model) + elif approach == 'rto': + return round_trip_optimization(system_prompt, initial_query, client, model) + elif approach == 'z3': + z3_solver = Z3SolverSystem(system_prompt, client, model) + return z3_solver.process_query(initial_query) + 
elif approach == "self_consistency": + return advanced_self_consistency_approach(system_prompt, initial_query, client, model) + elif approach == "pvg": + return inference_time_pv_game(system_prompt, initial_query, client, model) + elif approach == "rstar": + rstar = RStar(system_prompt, client, model, + max_depth=server_config['rstar_max_depth'], num_rollouts=server_config['rstar_num_rollouts'], + c=server_config['rstar_c']) + return rstar.solve(initial_query) + elif approach == "cot_reflection": + return cot_reflection(system_prompt, initial_query, client, model, return_full_response=server_config['return_full_response']) + elif approach == 'plansearch': + return plansearch(system_prompt, initial_query, client, model, n=n) + elif approach == 'leap': + return leap(system_prompt, initial_query, client, model) + elif approach == 're2': + return re2_approach(system_prompt, initial_query, client, model, n=n) + elif approach in plugin_approaches: + return plugin_approaches[approach](system_prompt, initial_query, client, model) + else: + raise ValueError(f"Unknown approach: {approach}") + +def execute_combined_approaches(approaches, system_prompt, initial_query, client, model): + final_response = initial_query + total_tokens = 0 + for approach in approaches: + response, tokens = execute_single_approach(approach, system_prompt, final_response, client, model) + final_response = response + total_tokens += tokens + return final_response, total_tokens + +async def execute_parallel_approaches(approaches, system_prompt, initial_query, client, model): + async def run_approach(approach): + return await asyncio.to_thread(execute_single_approach, approach, system_prompt, initial_query, client, model) + + tasks = [run_approach(approach) for approach in approaches] + results = await asyncio.gather(*tasks) + responses, tokens = zip(*results) + return list(responses), sum(tokens) def generate_streaming_response(final_response, model): # Yield the final response @@ -167,55 +257,20 @@ def proxy(): else: client = default_client - # Handle 'auto' approach - if approach == 'auto': - for known_approach in known_approaches: - if model.startswith(f"{known_approach}-"): - approach = known_approach - model = model[len(known_approach)+1:] - break - else: - # If no known approach is found in the model name, default to 'bon' - approach = 'bon' - - - logger.info(f'Using approach {approach}, with {model}') - completion_tokens = 0 + operation, approaches, model = parse_combined_approach(model, known_approaches, plugin_approaches) + logger.info(f'Using approach(es) {approaches}, operation {operation}, with model {model}') try: - if approach == 'mcts': - final_response, completion_tokens = chat_with_mcts(system_prompt, initial_query, client, model, server_config['mcts_simulations'], - server_config['mcts_exploration'], server_config['mcts_depth']) - elif approach == 'bon': - final_response, completion_tokens = best_of_n_sampling(system_prompt, initial_query, client, model, server_config['best_of_n']) - elif approach == 'moa': - final_response, completion_tokens = mixture_of_agents(system_prompt, initial_query, client, model) - elif approach == 'rto': - final_response, completion_tokens = round_trip_optimization(system_prompt, initial_query, client, model) - elif approach == 'z3': - z3_solver = Z3SolverSystem(system_prompt, client, model) - final_response, completion_tokens = z3_solver.process_query(initial_query) - elif approach == "self_consistency": - final_response, completion_tokens = 
advanced_self_consistency_approach(system_prompt, initial_query, client, model) - elif approach == "pvg": - final_response, completion_tokens = inference_time_pv_game(system_prompt, initial_query, client, model) - elif approach == "rstar": - rstar = RStar(system_prompt, client, model, - max_depth=server_config['rstar_max_depth'], num_rollouts=server_config['rstar_num_rollouts'], - c=server_config['rstar_c']) - final_response, completion_tokens = rstar.solve(initial_query) - elif approach == "cot_reflection": - final_response, completion_tokens = cot_reflection(system_prompt, initial_query, client, model, return_full_response=server_config['return_full_response']) - elif approach == 'plansearch': - final_response, completion_tokens = plansearch(system_prompt, initial_query, client, model, n=n) - elif approach == 'leap': - final_response, completion_tokens = leap(system_prompt, initial_query, client, model) - elif approach == 're2': - final_response, completion_tokens = re2_approach(system_prompt, initial_query, client, model, n=n) - elif approach in plugin_approaches: - final_response, completion_tokens = plugin_approaches[approach](system_prompt, initial_query, client, model) + if operation == 'SINGLE': + final_response, completion_tokens = execute_single_approach(approaches[0], system_prompt, initial_query, client, model) + elif operation == 'AND': + final_response, completion_tokens = execute_combined_approaches(approaches, system_prompt, initial_query, client, model) + elif operation == 'OR': + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + final_response, completion_tokens = loop.run_until_complete(execute_parallel_approaches(approaches, system_prompt, initial_query, client, model)) else: - raise ValueError(f"Unknown approach: {approach}") + raise ValueError(f"Unknown operation: {operation}") except Exception as e: logger.error(f"Error processing request: {str(e)}") return jsonify({"error": str(e)}), 500 @@ -254,7 +309,6 @@ def proxy(): logger.debug(f'API response: {response_data}') return jsonify(response_data), 200 - @app.route('/v1/models', methods=['GET']) def proxy_models(): logger.info('Received request to /v1/models') diff --git a/scripts/eval_frames_benchmark.py b/scripts/eval_frames_benchmark.py new file mode 100644 index 00000000..8e0491e2 --- /dev/null +++ b/scripts/eval_frames_benchmark.py @@ -0,0 +1,130 @@ +import argparse +import json +import os +import time +from typing import List, Dict + +from openai import OpenAI +from datasets import load_dataset +from tqdm import tqdm + +client = OpenAI(api_key="none", base_url="http://localhost:8000/v1") +SLEEP_INTERVAL = 10 + +def generate_llm_prompt(prompt: str, wiki_links: List[str]) -> str: + wiki_links_str = "\n".join(wiki_links) + return f"{prompt}\n\nHere are the relevant Wikipedia articles:\n{wiki_links_str}\n\nBased on all this, answer the query." + +def get_llm_response(prompt: str, model: str) -> str: + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": prompt} + ], + max_tokens=1000, + n=1, + stop=None, + temperature=0.7, + ) + return response.choices[0].message.content.strip() + +def evaluate_response(question: str, llm_response: str, ground_truth: str, model: str) -> Dict[str, str]: + evaluation_prompt = f"""===Task=== +I need your help in evaluating an answer provided by an LLM against a ground +truth answer. 
Your task is to determine if the ground truth answer is present in the LLM's +response. Please analyze the provided data and make a decision. +===Instructions=== +1. Carefully compare the "Predicted Answer" with the "Ground Truth Answer". +2. Consider the substance of the answers – look for equivalent information or correct answers. +Do not focus on exact wording unless the exact wording is crucial to the meaning. +3. Your final decision should be based on whether the meaning and the vital facts of the +"Ground Truth Answer" are present in the "Predicted Answer:" +===Input Data=== +- Question: {question} +- Predicted Answer: {llm_response} +- Ground Truth Answer: {ground_truth} +===Output Format=== +Provide your final evaluation in the following format: +"Explanation:" (How you made the decision?) +"Decision:" ("TRUE" or "FALSE" ) +Please proceed with the evaluation.""" + + evaluation_response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": evaluation_prompt} + ], + max_tokens=300, + n=1, + stop=None, + temperature=0.3, + ) + + evaluation_text = evaluation_response.choices[0].message.content.strip() + + # Extract the decision and explanation + lines = evaluation_text.split('\n') + decision = "FALSE" + explanation = "" + for line in lines: + if line.startswith("Decision:"): + decision = line.split(":")[1].strip().upper() + elif line.startswith("Explanation:"): + explanation = line.split(":", 1)[1].strip() + + return {"decision": decision, "explanation": explanation} + +def main(model: str): + + # Load the dataset + dataset = load_dataset("google/frames-benchmark", split="test") + + results = [] + + for item in tqdm(dataset, desc="Processing samples"): + # print(item) + prompt = generate_llm_prompt(item['Prompt'], item['wiki_links']) + llm_response = get_llm_response(prompt, model) + evaluation = evaluate_response(item['Prompt'], llm_response, item['Answer'], model) + + result = { + "prompt": item['Prompt'], + "ground_truth": item['Answer'], + "llm_response": llm_response, + "evaluation_decision": evaluation['decision'], + "evaluation_explanation": evaluation['explanation'], + "reasoning_type": item['reasoning_types'] + } + results.append(result) + time.sleep(SLEEP_INTERVAL) + + # Save results to a JSON file + with open(f"evaluation_results_{model.replace('/', '_')}.json", "w") as f: + json.dump(results, f, indent=2) + + # Calculate and print summary statistics + total_samples = len(results) + correct_answers = sum(1 for r in results if r['evaluation_decision'] == 'TRUE') + accuracy = correct_answers / total_samples + + print(f"Model: {model}") + print(f"Total samples: {total_samples}") + print(f"Correct answers: {correct_answers}") + print(f"Accuracy: {accuracy:.2%}") + + # Print accuracy by reasoning type + reasoning_types = set(r['reasoning_types'] for r in results) + for rt in reasoning_types: + rt_samples = [r for r in results if r['reasoning_types'] == rt] + rt_correct = sum(1 for r in rt_samples if r['evaluation_decision'] == 'TRUE') + rt_accuracy = rt_correct / len(rt_samples) + print(f"Accuracy for {rt}: {rt_accuracy:.2%}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Evaluate LLM performance on google/frames-benchmark") + parser.add_argument("--model", type=str, required=True, help="OpenAI model to use (e.g., gpt-4o, gpt-4o-mini)") + args = parser.parse_args() + + main(args.model) \ No newline at end of file diff --git 
a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 00000000..aee11b28 --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1 @@ +datasets From 0bda103947f6235011891806bac420773c10be14 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Mon, 30 Sep 2024 14:22:03 -0700 Subject: [PATCH 6/9] Fix bugs to get the eval script to run --- optillm.py | 5 +-- optillm/plugins/memory_plugin.py | 8 ++-- optillm/plugins/readurls_plugin.py | 65 +++++++++++++++++++++++------- requirements.txt | 3 +- scripts/eval_frames_benchmark.py | 6 +-- setup.py | 1 + 6 files changed, 62 insertions(+), 26 deletions(-) diff --git a/optillm.py b/optillm.py index 4ab36a0b..656d7791 100644 --- a/optillm.py +++ b/optillm.py @@ -160,11 +160,11 @@ def execute_single_approach(approach, system_prompt, initial_query, client, mode elif approach == "cot_reflection": return cot_reflection(system_prompt, initial_query, client, model, return_full_response=server_config['return_full_response']) elif approach == 'plansearch': - return plansearch(system_prompt, initial_query, client, model, n=n) + return plansearch(system_prompt, initial_query, client, model, n=server_config['n']) elif approach == 'leap': return leap(system_prompt, initial_query, client, model) elif approach == 're2': - return re2_approach(system_prompt, initial_query, client, model, n=n) + return re2_approach(system_prompt, initial_query, client, model, n=server_config['n']) elif approach in plugin_approaches: return plugin_approaches[approach](system_prompt, initial_query, client, model) else: @@ -249,7 +249,6 @@ def proxy(): system_prompt, initial_query = parse_conversation(messages) - approach = server_config['approach'] base_url = server_config['base_url'] if base_url != "": diff --git a/optillm/plugins/memory_plugin.py b/optillm/plugins/memory_plugin.py index 6525dc19..00d2bade 100644 --- a/optillm/plugins/memory_plugin.py +++ b/optillm/plugins/memory_plugin.py @@ -7,7 +7,7 @@ SLUG = "memory" class Memory: - def __init__(self, max_size: int = 1000): + def __init__(self, max_size: int = 100): self.max_size = max_size self.items: List[str] = [] self.vectorizer = TfidfVectorizer() @@ -59,7 +59,7 @@ def extract_key_information(text: str, client, model: str) -> List[str]: response = client.chat.completions.create( model=model, messages=[{"role": "user", "content": prompt}], - max_tokens=150 + max_tokens=1000 ) key_info = response.choices[0].message.content.strip().split('\n') @@ -72,7 +72,7 @@ def run(system_prompt: str, initial_query: str, client, model: str) -> Tuple[str completion_tokens = 0 # Process context and add to memory - chunk_size = 1000 + chunk_size = 10000 for i in range(0, len(context), chunk_size): chunk = context[i:i+chunk_size] key_info, tokens = extract_key_information(chunk, client, model) @@ -94,7 +94,7 @@ def run(system_prompt: str, initial_query: str, client, model: str) -> Tuple[str response = client.chat.completions.create( model=model, messages=[{"role": "user", "content": prompt}], - max_tokens=500 + max_tokens=1000 ) final_response = response.choices[0].message.content.strip() diff --git a/optillm/plugins/readurls_plugin.py b/optillm/plugins/readurls_plugin.py index 74d97549..bffeb72b 100644 --- a/optillm/plugins/readurls_plugin.py +++ b/optillm/plugins/readurls_plugin.py @@ -7,41 +7,76 @@ SLUG = "url_content" def extract_urls(text: str) -> List[str]: - url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') - return url_pattern.findall(text) + # Updated 
regex pattern to be more precise + url_pattern = re.compile(r'https?://[^\s\'"]+') + + # Find all matches + urls = url_pattern.findall(text) + + # Clean up the URLs + cleaned_urls = [] + for url in urls: + # Remove trailing punctuation and quotes + url = re.sub(r'[,\'\"\)\]]+$', '', url) + cleaned_urls.append(url) + + return cleaned_urls -def fetch_webpage_content(url: str, max_length: int = 1000) -> str: +def fetch_webpage_content(url: str, max_length: int = 40000) -> str: try: - response = requests.get(url, timeout=10) + headers = { + 'User-Agent': 'optillm/0.0.1 (hhttps://github.com/codelion/optillm)' + } + + response = requests.get(url, headers=headers, timeout=10) response.raise_for_status() - soup = BeautifulSoup(response.content, 'html.parser') + + # Make a soup + soup = BeautifulSoup(response.content, 'lxml') # Remove script and style elements for script in soup(["script", "style"]): script.decompose() - # Get text content - text = soup.get_text() + # Get text from various elements + text_elements = [] + + # Prioritize content from main content tags + for tag in ['article', 'main', 'div[role="main"]', '.main-content']: + content = soup.select_one(tag) + if content: + text_elements.extend(content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'])) + break - # Break into lines and remove leading and trailing space on each - lines = (line.strip() for line in text.splitlines()) + # If no main content found, fall back to all headers and paragraphs + if not text_elements: + text_elements = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']) - # Break multi-headlines into a line each - chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) + # Extract text from elements + text = ' '.join(element.get_text(strip=True) for element in text_elements) - # Join the lines and truncate if necessary - content = ' '.join(chunk for chunk in chunks if chunk) - return content[:max_length] + ('...' if len(content) > max_length else '') + # Remove extra whitespace + text = re.sub(r'\s+', ' ', text).strip() + + # Remove footnote superscripts in brackets + text = re.sub(r"\[.*?\]+", '', text) + + # Truncate to max_length + if len(text) > max_length: + text = text[:max_length] + '...' 
+ + return text except Exception as e: return f"Error fetching content: {str(e)}" def run(system_prompt, initial_query: str, client=None, model=None) -> Tuple[str, int]: urls = extract_urls(initial_query) + # print(urls) modified_query = initial_query for url in urls: content = fetch_webpage_content(url) domain = urlparse(url).netloc modified_query = modified_query.replace(url, f"{url} [Content from {domain}: {content}]") - + # print(modified_query) return modified_query, 0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 09885d7c..47c69eb5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ tiktoken scikit-learn litellm requests -beautifulsoup4 \ No newline at end of file +beautifulsoup4 +lxml \ No newline at end of file diff --git a/scripts/eval_frames_benchmark.py b/scripts/eval_frames_benchmark.py index 8e0491e2..f3804511 100644 --- a/scripts/eval_frames_benchmark.py +++ b/scripts/eval_frames_benchmark.py @@ -9,11 +9,10 @@ from tqdm import tqdm client = OpenAI(api_key="none", base_url="http://localhost:8000/v1") -SLEEP_INTERVAL = 10 +SLEEP_INTERVAL = 30 def generate_llm_prompt(prompt: str, wiki_links: List[str]) -> str: - wiki_links_str = "\n".join(wiki_links) - return f"{prompt}\n\nHere are the relevant Wikipedia articles:\n{wiki_links_str}\n\nBased on all this, answer the query." + return f"Here are the relevant Wikipedia articles:\n{wiki_links}\n\nBased on all the information, answer the query. \n\nQuery: {prompt}\n\n" def get_llm_response(prompt: str, model: str) -> str: response = client.chat.completions.create( @@ -98,6 +97,7 @@ def main(model: str): "reasoning_type": item['reasoning_types'] } results.append(result) + print(result["evaluation_decision"]) time.sleep(SLEEP_INTERVAL) # Save results to a JSON file diff --git a/setup.py b/setup.py index 108548d1..8097c206 100644 --- a/setup.py +++ b/setup.py @@ -19,6 +19,7 @@ "litellm", "requests", "beautifulsoup4", + "lxml", ], author="codelion", author_email="codelion@okyasoft.com", From b18e64a52979ba3ce7aa9f2616bd7f16a316783c Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Mon, 30 Sep 2024 21:14:47 -0700 Subject: [PATCH 7/9] fix --- optillm/plugins/readurls_plugin.py | 2 +- scripts/eval_frames_benchmark.py | 42 +++++++++++++++++++++++------- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/optillm/plugins/readurls_plugin.py b/optillm/plugins/readurls_plugin.py index bffeb72b..4392f2c4 100644 --- a/optillm/plugins/readurls_plugin.py +++ b/optillm/plugins/readurls_plugin.py @@ -4,7 +4,7 @@ from bs4 import BeautifulSoup from urllib.parse import urlparse -SLUG = "url_content" +SLUG = "readurls" def extract_urls(text: str) -> List[str]: # Updated regex pattern to be more precise diff --git a/scripts/eval_frames_benchmark.py b/scripts/eval_frames_benchmark.py index f3804511..16c44c09 100644 --- a/scripts/eval_frames_benchmark.py +++ b/scripts/eval_frames_benchmark.py @@ -9,7 +9,26 @@ from tqdm import tqdm client = OpenAI(api_key="none", base_url="http://localhost:8000/v1") -SLEEP_INTERVAL = 30 +SLEEP_INTERVAL = 60 + +def load_existing_results(filename: str) -> List[Dict]: + try: + with open(filename, 'r') as f: + return json.load(f) + except FileNotFoundError: + return [] + +def save_result(filename: str, result: Dict): + results = load_existing_results(filename) + results.append(result) + with open(filename, 'w') as f: + json.dump(results, f, indent=2) + +def get_last_processed_index(results: List[Dict]) -> int: + if not results: + return -1 + return 
max(int(r.get('index', -1)) for r in results) + def generate_llm_prompt(prompt: str, wiki_links: List[str]) -> str: return f"Here are the relevant Wikipedia articles:\n{wiki_links}\n\nBased on all the information, answer the query. \n\nQuery: {prompt}\n\n" @@ -76,19 +95,24 @@ def evaluate_response(question: str, llm_response: str, ground_truth: str, model return {"decision": decision, "explanation": explanation} def main(model: str): - # Load the dataset dataset = load_dataset("google/frames-benchmark", split="test") - results = [] + filename = f"evaluation_results_{model.replace('/', '_')}.json" + existing_results = load_existing_results(filename) + last_processed_index = get_last_processed_index(existing_results) for item in tqdm(dataset, desc="Processing samples"): - # print(item) + index = int(item['Unnamed: 0']) + if index <= last_processed_index: + continue + prompt = generate_llm_prompt(item['Prompt'], item['wiki_links']) llm_response = get_llm_response(prompt, model) evaluation = evaluate_response(item['Prompt'], llm_response, item['Answer'], model) result = { + "index": index, "prompt": item['Prompt'], "ground_truth": item['Answer'], "llm_response": llm_response, @@ -96,15 +120,13 @@ def main(model: str): "evaluation_explanation": evaluation['explanation'], "reasoning_type": item['reasoning_types'] } - results.append(result) - print(result["evaluation_decision"]) + + save_result(filename, result) + print(f"Index: {index}, Decision: {result['evaluation_decision']}") time.sleep(SLEEP_INTERVAL) - # Save results to a JSON file - with open(f"evaluation_results_{model.replace('/', '_')}.json", "w") as f: - json.dump(results, f, indent=2) - # Calculate and print summary statistics + results = load_existing_results(filename) total_samples = len(results) correct_answers = sum(1 for r in results if r['evaluation_decision'] == 'TRUE') accuracy = correct_answers / total_samples From 6604dce4c35b97ffb4cedfaacb13ec8331030467 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Tue, 1 Oct 2024 02:04:33 -0700 Subject: [PATCH 8/9] support extra_body and prompt config --- optillm.py | 34 +++++++++--- scripts/gen_optillm_dataset.py | 96 ++++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+), 6 deletions(-) create mode 100644 scripts/gen_optillm_dataset.py diff --git a/optillm.py b/optillm.py index 656d7791..ce79e70e 100644 --- a/optillm.py +++ b/optillm.py @@ -9,6 +9,7 @@ import importlib import glob import asyncio +import re from concurrent.futures import ThreadPoolExecutor # Import the LiteLLM wrapper @@ -208,18 +209,31 @@ def generate_streaming_response(final_response, model): def parse_conversation(messages): system_prompt = "" conversation = [] + optillm_approach = None for message in messages: role = message['role'] content = message['content'] if role == 'system': - system_prompt = content - elif role in ['user', 'assistant']: - conversation.append(f"{role.capitalize()}: {content}") + system_prompt, optillm_approach = extract_optillm_approach(content) + elif role == 'user': + if not optillm_approach: + content, optillm_approach = extract_optillm_approach(content) + conversation.append(f"User: {content}") + elif role == 'assistant': + conversation.append(f"Assistant: {content}") initial_query = "\n".join(conversation) - return system_prompt, initial_query + return system_prompt, initial_query, optillm_approach + +def extract_optillm_approach(content): + match = re.search(r'(.*?)', content) + if match: + approach = match.group(1) + content = re.sub(r'.*?', '', content).strip() + 
return content, approach + return content, None # Optional API key configuration to secure the proxy @app.before_request @@ -245,9 +259,17 @@ def proxy(): stream = data.get('stream', False) messages = data.get('messages', []) model = data.get('model', server_config['model']) - n = data.get('n', server_config['n']) - system_prompt, initial_query = parse_conversation(messages) + optillm_approach = data.get('optillm_approach', {}) + + system_prompt, initial_query, message_optillm_approach = parse_conversation(messages) + + # Use optillm_approach from extra_body if present, otherwise use from messages + if not optillm_approach and message_optillm_approach: + optillm_approach = message_optillm_approach + + if optillm_approach: + model = f"{optillm_approach}-{model}" base_url = server_config['base_url'] diff --git a/scripts/gen_optillm_dataset.py b/scripts/gen_optillm_dataset.py new file mode 100644 index 00000000..9a3b3cd4 --- /dev/null +++ b/scripts/gen_optillm_dataset.py @@ -0,0 +1,96 @@ +import os +import json +import argparse +import asyncio +from tqdm import tqdm +from datasets import load_dataset +from openai import AsyncOpenAI +from typing import List, Dict, Any +import random + +# OptILM approaches +APPROACHES = ["none", "mcts", "bon", "moa", "rto", "z3", "self_consistency", "pvg", "rstar", "cot_reflection", "plansearch", "leap", "re2"] + +async def generate_response(prompt: str, approach: str) -> Dict[str, Any]: + """Generate a response using the specified approach.""" + if approach == "none": + # Use the base model without any optimization technique + client = AsyncOpenAI() + response = await client.chat.completions.create( + model="gpt-4o-mini", + messages=[{"role": "user", "content": prompt}], + ) + return { + "content": response.choices[0].message.content, + "tokens": response.usage.completion_tokens, + } + else: + # Use OptILM with the specified approach + client = AsyncOpenAI(api_key="none", base_url="http://localhost:8000/v1") + response = await client.chat.completions.create( + model=f"{approach}-gpt-4o-mini", # Assuming OptILM uses this naming convention + messages=[{"role": "user", "content": prompt}], + ) + return { + "content": response.choices[0].message.content, + "tokens": response.usage.completion_tokens, + } + +async def rank_responses(prompt: str, responses: List[Dict[str, Any]]) -> List[int]: + """Rank the responses using the LLM.""" + ranking_prompt = f"Given the following prompt:\n\n{prompt}\n\nRank the following responses from best to worst, considering accuracy, completeness, and relevance. Provide the ranking as a comma-separated list of indices (0-indexed). 
Do not add any explanations or any other text other than the comma-separated list.\n\n" + for i, response in enumerate(responses): + ranking_prompt += f"Response {i}:\n{response['content']}\n\n" + client = AsyncOpenAI() + ranking_response = await client.chat.completions.create( + model="gpt-4o-mini", + messages=[{"role": "user", "content": ranking_prompt}], + ) + + ranking_str = ranking_response.choices[0].message.content.strip() + print(ranking_str) + return [int(idx) for idx in ranking_str.split(",")] + +async def process_sample(sample: Dict[str, Any]) -> Dict[str, Any]: + """Process a single sample from the dataset.""" + prompt = sample["turns"][0]["content"] + results = [] + + # Generate responses for each approach + for approach in APPROACHES: + response = await generate_response(prompt, approach) + results.append({"approach": approach, **response}) + + random.shuffle(results) + # Rank the responses + rankings = await rank_responses(prompt, results) + + # Add rankings to results + for rank, idx in enumerate(rankings): + results[idx]["rank"] = rank + + return { + "prompt": prompt, + "results": results, + } + +async def generate_dataset(num_samples: int, output_file: str): + """Generate the dataset and save it to a JSONL file.""" + dataset = load_dataset("lmsys/arena-hard-auto-v0.1", split="train") + + with open(output_file, "w") as f: + for sample in tqdm(dataset.select(range(num_samples)), total=num_samples): + result = await process_sample(sample) + f.write(json.dumps(result) + "\n") + +def main(): + parser = argparse.ArgumentParser(description="Generate OptILM dataset") + parser.add_argument("--num_samples", type=int, default=100, help="Number of samples to process") + parser.add_argument("--output_file", type=str, default="optillm_dataset.jsonl", help="Output file path") + args = parser.parse_args() + + asyncio.run(generate_dataset(args.num_samples, args.output_file)) + print(f"Dataset generated and saved to {args.output_file}") + +if __name__ == "__main__": + main() From 4083dc640d712051d5eb680459c06b5b423d2015 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Tue, 1 Oct 2024 02:05:47 -0700 Subject: [PATCH 9/9] Update optillm.py --- optillm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optillm.py b/optillm.py index ce79e70e..de767acd 100644 --- a/optillm.py +++ b/optillm.py @@ -82,7 +82,7 @@ # List of known approaches known_approaches = ["mcts", "bon", "moa", "rto", "z3", "self_consistency", "pvg", "rstar", - "cot_reflection", "plansearch", "leap", "re2", "wim"] + "cot_reflection", "plansearch", "leap", "re2"] plugin_approaches = {}
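Usage sketch for the features introduced in this series: patch 3 adds the plugin system with a memory plugin, patch 4 adds a readurls plugin (its slug becomes "readurls" in patch 7), patch 5 adds combined-approach parsing where '&' chains approaches sequentially and '|' runs them in parallel, and patch 8 adds approach selection through the request body or the prompt itself. The Python sketch below is illustrative only; it assumes the proxy is running locally on port 8000 (as the eval and dataset scripts in this series do), that "gpt-4o-mini" is the upstream model, and that the tag parsed by extract_optillm_approach is named <optillm_approach>, which is inferred from the function name rather than stated in the diff.

from openai import OpenAI

# Sketch only: assumes the optillm proxy from this patch series is listening on
# localhost:8000, matching scripts/eval_frames_benchmark.py.
client = OpenAI(api_key="none", base_url="http://localhost:8000/v1")

# 1) Combined approaches via the model prefix (patch 5): readurls fetches the
#    linked page into the query, then memory condenses it before answering.
response = client.chat.completions.create(
    model="readurls&memory-gpt-4o-mini",
    messages=[{"role": "user", "content": "Summarize https://en.wikipedia.org/wiki/Philosophy"}],
)
print(response.choices[0].message.content)

# 2) Approach in the request body (patch 8): the proxy reads an
#    'optillm_approach' field, which the OpenAI client can attach via extra_body.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "How many r's are there in strawberry?"}],
    extra_body={"optillm_approach": "cot_reflection"},
)

# 3) Approach embedded in the message (patch 8): extract_optillm_approach strips
#    an <optillm_approach>...</optillm_approach> tag (tag name assumed) from the
#    system or user content; an explicit extra_body value takes precedence.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "<optillm_approach>re2</optillm_approach> What is the capital of France?"}],
)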