From 791df9488e308ef41a0517a5ad7355f3d032f500 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Sat, 19 Jul 2025 17:45:07 +0800 Subject: [PATCH 01/10] Add GenSelect plugin for solution selection Introduces the GenSelect plugin implementing generative solution selection based on the AIMO-2 winning solution paper. Updates README with plugin documentation and reference, bumps version to 0.1.23, and adds a GenSelect math test case. --- README.md | 2 + optillm/plugins/genselect_plugin.py | 269 ++++++++++++++++++++++++++++ pyproject.toml | 2 +- test_cases.json | 5 + 4 files changed, 277 insertions(+), 1 deletion(-) create mode 100644 optillm/plugins/genselect_plugin.py diff --git a/README.md b/README.md index 5dd4069c..06769407 100644 --- a/README.md +++ b/README.md @@ -377,6 +377,7 @@ Check this log file for connection issues, tool execution errors, and other diag | Read URLs | `readurls` | Reads all URLs found in the request, fetches the content at the URL and adds it to the context | | Execute Code | `executecode` | Enables use of code interpreter to execute python code in requests and LLM generated responses | | JSON | `json` | Enables structured outputs using the outlines library, supports pydantic types and JSON schema | +| GenSelect | `genselect` | Generative Solution Selection - generates multiple candidates and selects the best based on quality criteria | ## Available parameters @@ -587,6 +588,7 @@ called patchflows. We saw huge performance gains across all the supported patchf - [Unsupervised Evaluation of Code LLMs with Round-Trip Correctness](https://arxiv.org/abs/2402.08699) - [Inspired the implementation of rto](optillm/rto.py) - [Patched MOA: optimizing inference for diverse software development tasks](https://arxiv.org/abs/2407.18521) - [Implementation](optillm/moa.py) - [Patched RTC: evaluating LLMs for diverse software development tasks](https://arxiv.org/abs/2407.16557) - [Implementation](ptillm/rto.py) +- [AIMO-2 Winning Solution: Building State-of-the-Art Mathematical Reasoning Models with OpenMathReasoning dataset](https://arxiv.org/abs/2504.16891) - [Implementation](optillm/plugins/genselect_plugin.py) ## Citation diff --git a/optillm/plugins/genselect_plugin.py b/optillm/plugins/genselect_plugin.py new file mode 100644 index 00000000..753f927e --- /dev/null +++ b/optillm/plugins/genselect_plugin.py @@ -0,0 +1,269 @@ +""" +GenSelect Plugin for OptILLM + +This plugin implements the Generative Solution Selection (GenSelect) approach from +the paper "AIMO-2 Winning Solution: Building State-of-the-Art Mathematical Reasoning +Models with OpenMathReasoning dataset" (arXiv:2504.16891). + +GenSelect generates multiple candidate solutions and uses an LLM to compare and +select the best one based on quality criteria. Unlike majority voting which counts +answer frequencies, GenSelect evaluates the entire response quality. +""" + +import logging +from typing import Tuple, Dict, Any, List, Optional +import json + +logger = logging.getLogger(__name__) + +# Plugin identifier +SLUG = "genselect" + +# Default configuration +DEFAULT_NUM_CANDIDATES = 4 +DEFAULT_TEMPERATURE = 0.7 +DEFAULT_COMPARISON_TEMPERATURE = 0.3 +DEFAULT_COMPARISON_MODE = "batch" # "batch" or "tournament" +DEFAULT_INCLUDE_REASONING = False + +def create_comparison_prompt(candidates: List[str], query: str, comparison_mode: str = "batch") -> str: + """ + Create a prompt for comparing candidate solutions. 
+ + Args: + candidates: List of candidate responses + query: The original user query + comparison_mode: "batch" for all at once, "tournament" for pairwise + + Returns: + The comparison prompt + """ + if comparison_mode == "batch": + prompt = f"""You are an expert evaluator tasked with selecting the best response to the following query: + +Query: {query} + +I will provide you with {len(candidates)} different candidate responses. Please analyze each one carefully and select the best response based on the following criteria: + +1. **Correctness and Accuracy**: Is the response factually correct and accurate? +2. **Completeness**: Does it fully address all aspects of the query? +3. **Clarity**: Is the explanation clear and easy to understand? +4. **Logical Coherence**: Is the reasoning sound and well-structured? +5. **Practical Value**: Does it provide useful, actionable information? + +For coding problems, also consider: +- Code correctness and efficiency +- Best practices and style +- Error handling + +Here are the candidate responses: + +""" + for i, candidate in enumerate(candidates, 1): + prompt += f"=== Candidate {i} ===\n{candidate}\n\n" + + prompt += """Please analyze all candidates and provide: +1. A brief comparison highlighting the strengths and weaknesses of each candidate +2. Your selection of the best candidate (specify the number) +3. A clear explanation of why you selected that candidate + +Format your response as: +COMPARISON: +[Your comparison analysis] + +BEST CANDIDATE: [number] + +REASONING: +[Your explanation for the selection]""" + + else: # tournament mode - for future enhancement + # This would implement pairwise comparisons + # For now, we'll use batch mode as default + return create_comparison_prompt(candidates, query, "batch") + + return prompt + +def parse_selection_response(response: str, num_candidates: int) -> Tuple[int, str]: + """ + Parse the selection response to extract the chosen candidate number and reasoning. 
+ + Args: + response: The LLM's comparison response + num_candidates: Total number of candidates + + Returns: + Tuple of (selected_index, reasoning) + """ + import re + + # Look for "BEST CANDIDATE: X" pattern + match = re.search(r'BEST CANDIDATE:\s*(\d+)', response, re.IGNORECASE) + if match: + candidate_num = int(match.group(1)) + # Convert to 0-based index + selected_index = candidate_num - 1 + + # Validate the selection + if 0 <= selected_index < num_candidates: + # Extract reasoning if available + reasoning_match = re.search(r'REASONING:\s*(.+)', response, re.IGNORECASE | re.DOTALL) + reasoning = reasoning_match.group(1).strip() if reasoning_match else "No explicit reasoning provided" + + logger.info(f"Selected candidate {candidate_num} based on comparison") + return selected_index, reasoning + + # Fallback: Look for other patterns like "Candidate X is the best" + patterns = [ + r'[Cc]andidate\s+(\d+)\s+is\s+(?:the\s+)?best', + r'[Ii]\s+(?:would\s+)?select\s+[Cc]andidate\s+(\d+)', + r'[Tt]he\s+best\s+(?:response|candidate)\s+is\s+(?:number\s+)?(\d+)', + ] + + for pattern in patterns: + match = re.search(pattern, response) + if match: + candidate_num = int(match.group(1)) + selected_index = candidate_num - 1 + if 0 <= selected_index < num_candidates: + logger.info(f"Selected candidate {candidate_num} using fallback pattern") + return selected_index, "Selection extracted from response pattern" + + # If no clear selection found, log warning and return first candidate + logger.warning("Could not parse selection from comparison response, defaulting to first candidate") + return 0, "Failed to parse selection, defaulted to first candidate" + +def run( + system_prompt: str, + initial_query: str, + client, + model: str, + request_config: Dict[str, Any] = None +) -> Tuple[str, int]: + """ + Main entry point for the GenSelect plugin. + + Generates multiple candidate solutions and uses LLM comparison to select the best one. 
+ + Args: + system_prompt: System prompt for the model + initial_query: User's query + client: OpenAI-compatible client instance + model: Model identifier + request_config: Additional configuration parameters + + Returns: + Tuple of (response_text, completion_tokens_used) + """ + logger.info("Starting GenSelect process") + + # Extract configuration + config = request_config or {} + num_candidates = config.get('num_candidates', DEFAULT_NUM_CANDIDATES) + temperature = config.get('temperature', DEFAULT_TEMPERATURE) + comparison_temperature = config.get('comparison_temperature', DEFAULT_COMPARISON_TEMPERATURE) + comparison_mode = config.get('comparison_mode', DEFAULT_COMPARISON_MODE) + include_reasoning = config.get('include_reasoning', DEFAULT_INCLUDE_REASONING) + max_tokens = config.get('max_tokens', 4096) + + # Validate num_candidates is in reasonable range (2-16 as per paper) + num_candidates = max(2, min(16, num_candidates)) + + logger.info(f"Generating {num_candidates} candidates with temperature={temperature}") + + # Prepare messages for candidate generation + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": initial_query} + ] + + candidates = [] + total_tokens = 0 + + try: + # Try to generate candidates using n parameter for efficiency + response = client.chat.completions.create( + model=model, + messages=messages, + n=num_candidates, + temperature=temperature, + max_tokens=max_tokens + ) + + candidates = [choice.message.content for choice in response.choices] + total_tokens += response.usage.completion_tokens + + logger.info(f"Generated {len(candidates)} candidates using n parameter. Tokens: {total_tokens}") + + except Exception as e: + logger.warning(f"n parameter not supported: {str(e)}") + logger.info("Falling back to sequential generation") + + # Fallback: Generate candidates one by one + for i in range(num_candidates): + try: + response = client.chat.completions.create( + model=model, + messages=messages, + temperature=temperature, + max_tokens=max_tokens + ) + candidates.append(response.choices[0].message.content) + total_tokens += response.usage.completion_tokens + logger.debug(f"Generated candidate {i+1}/{num_candidates}") + + except Exception as gen_error: + logger.error(f"Error generating candidate {i+1}: {str(gen_error)}") + continue + + if len(candidates) < 2: + logger.error(f"Insufficient candidates generated ({len(candidates)})") + if candidates: + return candidates[0], total_tokens + return "Error: Could not generate sufficient candidates for selection", total_tokens + + # Create comparison prompt + comparison_prompt = create_comparison_prompt(candidates, initial_query, comparison_mode) + + # Get LLM to compare and select + logger.info("Comparing candidates for selection") + + try: + comparison_messages = [ + {"role": "system", "content": "You are an expert evaluator skilled at comparing and selecting high-quality responses."}, + {"role": "user", "content": comparison_prompt} + ] + + comparison_response = client.chat.completions.create( + model=model, + messages=comparison_messages, + temperature=comparison_temperature, + max_tokens=2048 # Comparison doesn't need as many tokens + ) + + selection_response = comparison_response.choices[0].message.content + total_tokens += comparison_response.usage.completion_tokens + + # Parse the selection + selected_index, reasoning = parse_selection_response(selection_response, len(candidates)) + + # Get the selected candidate + selected_candidate = candidates[selected_index] + + 
logger.info(f"GenSelect Summary:") + logger.info(f" - Generated {len(candidates)} candidates") + logger.info(f" - Selected candidate {selected_index + 1}") + logger.info(f" - Total tokens used: {total_tokens}") + + # Optionally include reasoning in the response + if include_reasoning: + final_response = f"{selected_candidate}\n\n---\n**GenSelect Reasoning**: {reasoning}" + else: + final_response = selected_candidate + + return final_response, total_tokens + + except Exception as e: + logger.error(f"Error during comparison: {str(e)}") + # Fallback to first candidate + logger.warning("Falling back to first candidate due to comparison error") + return candidates[0], total_tokens \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 3ddb789b..5c163015 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "optillm" -version = "0.1.22" +version = "0.1.23" description = "An optimizing inference proxy for LLMs." readme = "README.md" license = "Apache-2.0" diff --git a/test_cases.json b/test_cases.json index fadf3e08..e557d628 100644 --- a/test_cases.json +++ b/test_cases.json @@ -33,5 +33,10 @@ "name" : "GH", "system_prompt" : "", "query" : "Find the largest possible real part of[(75+117i)z+\frac{96+144i}{z}]where z is a complex number with |z|=4" + }, + { + "name": "GenSelect Math", + "system_prompt": "You are a helpful AI assistant with expertise in mathematical reasoning.", + "query": "A farmer has 17 sheep. All but 9 die. How many sheep does the farmer have left? Explain your reasoning step by step." } ] From 6baa4bcf411890d541b049c736adf9095e1e911a Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Sat, 19 Jul 2025 17:52:06 +0800 Subject: [PATCH 02/10] Add default test-time compute evaluation modes Introduces a default test-time compute configuration with pass@1, maj@64, and genselect@64 approaches for standard evaluation. Updates evaluation logic to support multiple runs for pass@1, adjusts report generation to highlight these modes, and refactors main() to use the new defaults when no approaches are specified. 
--- scripts/eval_optillmbench.py | 287 ++++++++++++++++++++++++++--------- 1 file changed, 218 insertions(+), 69 deletions(-) diff --git a/scripts/eval_optillmbench.py b/scripts/eval_optillmbench.py index 58eac413..a883e5c9 100644 --- a/scripts/eval_optillmbench.py +++ b/scripts/eval_optillmbench.py @@ -67,6 +67,13 @@ ("majority_voting_9", "Majority Voting with k=9", {"k": 9}), ] +# Default test-time compute configuration for standard evaluation +DEFAULT_TEST_TIME_COMPUTE = [ + ("pass@1", "Baseline with 64 runs averaged", {"num_runs": 64}), + ("maj@64", "Majority Voting with k=64", {"approach": "majority_voting", "k": 64}), + ("genselect@64", "GenSelect with 64 candidates", {"approach": "genselect", "num_candidates": 64}) +] + def load_optillm_bench() -> datasets.Dataset: """Load the OptiLLM Bench dataset.""" try: @@ -318,67 +325,96 @@ def evaluate_model( # Prepare the dataset examples = dataset if max_samples is None else dataset.select(range(max_samples)) + # Check if we need to do multiple runs (for pass@1 calculation) + num_runs = approach_extra_body.get("num_runs", 1) if approach_extra_body else 1 + + # Handle special approach names + actual_approach = approach + if approach == "pass@1": + actual_approach = "none" + elif approach == "maj@64": + actual_approach = "majority_voting" + elif approach == "genselect@64": + actual_approach = "genselect" + elif approach_extra_body and "approach" in approach_extra_body: + actual_approach = approach_extra_body["approach"] + # Create model name with approach - handle special cases - if approach == "none": + if actual_approach == "none": full_model_name = model - elif approach.startswith("thinkdeeper_"): + elif actual_approach.startswith("thinkdeeper_"): # For thinkdeeper, use base model name (decoding is passed in extra_body) full_model_name = model - elif approach.startswith("majority_voting_"): + elif actual_approach.startswith("majority_voting"): # For majority voting, use majority_voting prefix full_model_name = f"majority_voting-{model}" else: # Standard approach prefix - full_model_name = f"{approach}-{model}" + full_model_name = f"{actual_approach}-{model}" for example in tqdm(examples, desc=f"Evaluating {approach}"): - try: - # Get appropriate prompt for the category - prompt = get_prompt_for_category(example['question'], example['category']) - - # Record start time - start_time = time.time() - - # Prepare extra_body parameters - extra_body = {"spl_learning": False} - if approach_extra_body: - extra_body.update(approach_extra_body) - - # Make API call - response = client.chat.completions.create( - model=full_model_name, - messages=[ - {"role": "system", "content": "You are a helpful AI assistant focused on providing precise answers in the requested format."}, - {"role": "user", "content": prompt} - ], - temperature=0.2, - max_tokens=4096, - extra_body=extra_body, - ) - - # Calculate time taken - time_taken = time.time() - start_time + # For pass@1, we need to run multiple times and calculate average + if num_runs > 1: + run_results = [] + total_run_time = 0 - # Get the response text - response_text = response.choices[0].message.content + for run_idx in range(num_runs): + try: + # Get appropriate prompt for the category + prompt = get_prompt_for_category(example['question'], example['category']) + + # Record start time + start_time = time.time() + + # Prepare extra_body parameters (excluding num_runs) + extra_body = {"spl_learning": False} + if approach_extra_body: + extra_body_clean = {k: v for k, v in approach_extra_body.items() if k != 
"num_runs"} + extra_body.update(extra_body_clean) + + # Make API call + response = client.chat.completions.create( + model=full_model_name, + messages=[ + {"role": "system", "content": "You are a helpful AI assistant focused on providing precise answers in the requested format."}, + {"role": "user", "content": prompt} + ], + temperature=0.7, # Higher temperature for pass@k diversity + max_tokens=4096, + extra_body=extra_body, + ) + + # Calculate time taken + time_taken = time.time() - start_time + total_run_time += time_taken + + # Get the response text + response_text = response.choices[0].message.content + + # Process the response to remove thinking blocks + processed_response = remove_thinking_blocks(response_text) + + # Evaluate the processed response + is_correct = evaluate_response( + processed_response, + example['answer'], + example['category'], + example['question'] + ) + + run_results.append(is_correct) + + except Exception as e: + logger.error(f"Error in run {run_idx+1} for example {example['id']}: {e}") + run_results.append(False) - # Also store the raw response for reference - raw_response = response_text + # Calculate average success rate for this example + success_rate = sum(run_results) / len(run_results) if run_results else 0 + avg_time = total_run_time / len(run_results) if run_results else 0 - # Process the response to remove thinking blocks - processed_response = remove_thinking_blocks(response_text) - - # Evaluate the processed response - is_correct = evaluate_response( - processed_response, - example['answer'], - example['category'], - example['question'] # Pass the question for MMLU evaluation - ) - - # Update metrics - metrics["total_correct"] += int(is_correct) - metrics["total_time"] += time_taken + # Update metrics with average + metrics["total_correct"] += success_rate + metrics["total_time"] += avg_time metrics["samples"] += 1 # Update category metrics @@ -388,28 +424,101 @@ def evaluate_model( "total": 0, "time": 0 } - category_metrics[example['category']]["correct"] += int(is_correct) + category_metrics[example['category']]["correct"] += success_rate category_metrics[example['category']]["total"] += 1 - category_metrics[example['category']]["time"] += time_taken - - # Check if thinking blocks were removed - has_thinking = '' in raw_response + category_metrics[example['category']]["time"] += avg_time # Record detailed result detailed_results.append({ "id": example['id'], "category": example['category'], - "correct": is_correct, - "time_taken": time_taken, - "raw_response": raw_response, - "processed_response": processed_response if has_thinking else None, - "has_thinking": has_thinking, + "correct": success_rate, # Store success rate instead of boolean + "num_runs": num_runs, + "successes": sum(run_results), + "time_taken": avg_time, "ground_truth": example['answer'] }) - except Exception as e: - logger.error(f"Error processing example {example['id']}: {e}") - continue + else: + # Single run (original logic) + try: + # Get appropriate prompt for the category + prompt = get_prompt_for_category(example['question'], example['category']) + + # Record start time + start_time = time.time() + + # Prepare extra_body parameters + extra_body = {"spl_learning": False} + if approach_extra_body: + extra_body.update(approach_extra_body) + + # Make API call + response = client.chat.completions.create( + model=full_model_name, + messages=[ + {"role": "system", "content": "You are a helpful AI assistant focused on providing precise answers in the requested format."}, + 
{"role": "user", "content": prompt} + ], + temperature=0.2, + max_tokens=4096, + extra_body=extra_body, + ) + + # Calculate time taken + time_taken = time.time() - start_time + + # Get the response text + response_text = response.choices[0].message.content + + # Also store the raw response for reference + raw_response = response_text + + # Process the response to remove thinking blocks + processed_response = remove_thinking_blocks(response_text) + + # Evaluate the processed response + is_correct = evaluate_response( + processed_response, + example['answer'], + example['category'], + example['question'] # Pass the question for MMLU evaluation + ) + + # Update metrics + metrics["total_correct"] += int(is_correct) + metrics["total_time"] += time_taken + metrics["samples"] += 1 + + # Update category metrics + if example['category'] not in category_metrics: + category_metrics[example['category']] = { + "correct": 0, + "total": 0, + "time": 0 + } + category_metrics[example['category']]["correct"] += int(is_correct) + category_metrics[example['category']]["total"] += 1 + category_metrics[example['category']]["time"] += time_taken + + # Check if thinking blocks were removed + has_thinking = '' in raw_response + + # Record detailed result + detailed_results.append({ + "id": example['id'], + "category": example['category'], + "correct": is_correct, + "time_taken": time_taken, + "raw_response": raw_response, + "processed_response": processed_response if has_thinking else None, + "has_thinking": has_thinking, + "ground_truth": example['answer'] + }) + + except Exception as e: + logger.error(f"Error processing example {example['id']}: {e}") + continue # Calculate final metrics final_metrics = { @@ -458,12 +567,27 @@ def generate_report(all_metrics: Dict[str, Dict[str, float]], output_dir: str, i """Generate a comprehensive report comparing all approaches.""" report = [] + # Check if this is the default test-time compute evaluation + is_default_test_time = set(all_metrics.keys()) == {"pass@1", "maj@64", "genselect@64"} + # Header - report_title = "OptiLLM Bench Test-Time Compute Evaluation Report" if is_test_time_compute else "OptiLLM Bench Evaluation Report" + if is_default_test_time: + report_title = "OptiLLM Bench Test-Time Compute Evaluation Report" + elif is_test_time_compute: + report_title = "OptiLLM Bench Test-Time Compute Scaling Report" + else: + report_title = "OptiLLM Bench Evaluation Report" + report.append(f"# {report_title}") report.append(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") - if is_test_time_compute: + if is_default_test_time: + report.append("## Test-Time Compute Evaluation Results\n") + report.append("This report evaluates the potential of test-time compute with:") + report.append("- **pass@1**: Baseline averaged over 64 runs (measures consistency)") + report.append("- **maj@64**: Majority voting with 64 candidates") + report.append("- **genselect@64**: Generative selection with 64 candidates\n") + elif is_test_time_compute: report.append("This report evaluates test-time compute scaling approaches:") report.append("- **Sequential scaling**: ThinkDeeper with varying thinking token budgets") report.append("- **Parallel scaling**: Majority voting with varying k values\n") @@ -505,6 +629,28 @@ def generate_report(all_metrics: Dict[str, Dict[str, float]], output_dir: str, i df = pd.DataFrame(rows, columns=headers) report.append(df.to_markdown()) + # Add summary section for default test-time compute + if is_default_test_time: + report.append("\n## Summary") + 
if "pass@1" in all_metrics and "maj@64" in all_metrics and "genselect@64" in all_metrics: + pass1_acc = all_metrics["pass@1"]["accuracy"] * 100 + maj64_acc = all_metrics["maj@64"]["accuracy"] * 100 + genselect64_acc = all_metrics["genselect@64"]["accuracy"] * 100 + + report.append(f"\n**Key Metrics:**") + report.append(f"- **pass@1** (baseline averaged over 64 runs): {pass1_acc:.2f}%") + report.append(f"- **maj@64** (majority voting with 64 candidates): {maj64_acc:.2f}%") + report.append(f"- **genselect@64** (quality-based selection from 64 candidates): {genselect64_acc:.2f}%") + + # Calculate improvements + if pass1_acc > 0: + maj_improvement = ((maj64_acc - pass1_acc) / pass1_acc) * 100 + genselect_improvement = ((genselect64_acc - pass1_acc) / pass1_acc) * 100 + + report.append(f"\n**Improvements over pass@1:**") + report.append(f"- maj@64: {'+' if maj_improvement > 0 else ''}{maj_improvement:.1f}%") + report.append(f"- genselect@64: {'+' if genselect_improvement > 0 else ''}{genselect_improvement:.1f}%") + # Save report report_path = f"{output_dir}/evaluation_report.md" with open(report_path, "w") as f: @@ -555,12 +701,13 @@ def main(): if args.approaches: # Filter test-time compute approaches if specific ones are requested approaches_config = [a for a in TEST_TIME_COMPUTE_APPROACHES if a[0] in args.approaches] + elif args.approaches: + # Specific approaches requested + approaches_config = [a for a in APPROACHES if a[0] in args.approaches] else: - # Use standard approaches - if args.approaches: - approaches_config = [a for a in APPROACHES if a[0] in args.approaches] - else: - approaches_config = APPROACHES + # Default: Use the default test-time compute configuration + approaches_config = DEFAULT_TEST_TIME_COMPUTE + logger.info("Using default test-time compute evaluation (pass@1, maj@64, genselect@64)") # Store all metrics for final report all_metrics = {} @@ -596,7 +743,9 @@ def main(): continue # Generate final report - generate_report(all_metrics, args.output_dir, args.test_time_compute) + # Determine if we're using default test-time compute or explicit test-time compute + is_test_time = args.test_time_compute or (not args.approaches and approaches_config == DEFAULT_TEST_TIME_COMPUTE) + generate_report(all_metrics, args.output_dir, is_test_time) if __name__ == "__main__": main() \ No newline at end of file From 0dc0ed6b21b975fdc1c9aefd1a4eeedddc4be05c Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Sat, 19 Jul 2025 17:54:06 +0800 Subject: [PATCH 03/10] Update eval_optillmbench.py --- scripts/eval_optillmbench.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/eval_optillmbench.py b/scripts/eval_optillmbench.py index a883e5c9..6d971376 100644 --- a/scripts/eval_optillmbench.py +++ b/scripts/eval_optillmbench.py @@ -659,7 +659,9 @@ def generate_report(all_metrics: Dict[str, Dict[str, float]], output_dir: str, i logger.info(f"Report saved to {report_path}") def main(): - parser = argparse.ArgumentParser(description="Evaluate a model on OptiLLM Bench") + parser = argparse.ArgumentParser( + description="Evaluate a model on OptiLLM Bench. By default, runs test-time compute evaluation with pass@1, maj@64, and genselect@64." 
+ ) parser.add_argument("--model", required=True, help="Model identifier") parser.add_argument("--base-url", default="http://localhost:8000/v1", help="Base URL for API endpoint") @@ -667,9 +669,9 @@ def main(): parser.add_argument("--output-dir", default="results", help="Directory to save results") parser.add_argument("--approaches", nargs="+", - help="Specific approaches to evaluate (default: all)") + help="Specific approaches to evaluate (overrides default test-time compute)") parser.add_argument("--test-time-compute", action="store_true", - help="Evaluate test-time compute approaches (sequential and parallel scaling)") + help="Evaluate full test-time compute scaling approaches (ThinkDeeper and various k values)") parser.add_argument("--debug", action="store_true", help="Enable debug logging") args = parser.parse_args() From 1daa2a00da5270cf2eb2bc9531e6c20d866ddf62 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Sun, 20 Jul 2025 00:17:16 +0800 Subject: [PATCH 04/10] Relax upper bound for num_candidates validation Changed num_candidates validation to only enforce a minimum of 2, removing the previous maximum limit of 16. This allows for generating more than 16 candidates if desired. --- optillm/plugins/genselect_plugin.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optillm/plugins/genselect_plugin.py b/optillm/plugins/genselect_plugin.py index 753f927e..f78aac3d 100644 --- a/optillm/plugins/genselect_plugin.py +++ b/optillm/plugins/genselect_plugin.py @@ -165,8 +165,8 @@ def run( include_reasoning = config.get('include_reasoning', DEFAULT_INCLUDE_REASONING) max_tokens = config.get('max_tokens', 4096) - # Validate num_candidates is in reasonable range (2-16 as per paper) - num_candidates = max(2, min(16, num_candidates)) + # Validate num_candidates is at least 2 + num_candidates = max(2, num_candidates) logger.info(f"Generating {num_candidates} candidates with temperature={temperature}") From 3c8ce022cdc973cd94c649bbe615dfaf10a58134 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Tue, 22 Jul 2025 11:31:46 +0800 Subject: [PATCH 05/10] Add comprehensive test suite and CI workflow Introduces a new tests/ directory with unit and integration tests, a requirements file, and a test runner script. Adds a GitHub Actions workflow for automated testing on multiple Python versions and pull requests. Updates README with detailed testing instructions. Refactors optillm.py and eval_optillmbench.py to improve n parameter handling and test-time compute evaluation logic. 
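Because the none approach now forwards the n parameter unchanged, a single proxied request can return several choices, and callers can keep the same try/except fallback to sequential generation that the eval script and the genselect plugin use. A minimal client-side sketch follows, assuming the openai Python package, an optillm server on the default http://localhost:8000/v1 endpoint, gpt-4o-mini as the upstream model, and placeholder API-key handling.

```python
# Sketch only: exercise the n-parameter pass-through against a local optillm
# proxy, falling back to one request per candidate if n is rejected upstream.
import os
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1",
                api_key=os.environ.get("OPENAI_API_KEY", "dummy"))

messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "Name three prime numbers."},
]

try:
    # n is forwarded to the underlying provider in a single call
    response = client.chat.completions.create(
        model="gpt-4o-mini", messages=messages, n=4, temperature=0.7
    )
    candidates = [choice.message.content for choice in response.choices]
except Exception:
    # Fallback mirrors the plugin/eval-script pattern: sequential generation
    candidates = [
        client.chat.completions.create(
            model="gpt-4o-mini", messages=messages, temperature=0.7
        ).choices[0].message.content
        for _ in range(4)
    ]

print(f"Generated {len(candidates)} candidates")
```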
--- .github/workflows/test.yml | 86 +++++++ .gitignore | 1 + README.md | 40 +++ optillm.py | 17 +- scripts/eval_optillmbench.py | 303 +++++++++++++++++++---- test_results.json | 128 ---------- tests/README.md | 107 ++++++++ tests/__init__.py | 1 + tests/requirements.txt | 4 + tests/run_tests.sh | 62 +++++ test.py => tests/test.py | 35 ++- tests/test_api_compatibility.py | 133 ++++++++++ tests/test_approaches.py | 128 ++++++++++ test_cases.json => tests/test_cases.json | 5 + tests/test_ci_quick.py | 64 +++++ tests/test_n_parameter.py | 86 +++++++ tests/test_plugins.py | 107 ++++++++ 17 files changed, 1117 insertions(+), 190 deletions(-) create mode 100644 .github/workflows/test.yml delete mode 100644 test_results.json create mode 100644 tests/README.md create mode 100644 tests/__init__.py create mode 100644 tests/requirements.txt create mode 100755 tests/run_tests.sh rename test.py => tests/test.py (79%) create mode 100644 tests/test_api_compatibility.py create mode 100644 tests/test_approaches.py rename test_cases.json => tests/test_cases.json (94%) create mode 100644 tests/test_ci_quick.py create mode 100755 tests/test_n_parameter.py create mode 100644 tests/test_plugins.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..1e967441 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,86 @@ +name: Run Tests + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.10', '3.11', '3.12'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Cache pip packages + uses: actions/cache@v3 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install -r tests/requirements.txt + + - name: Run unit tests + run: | + # Run quick CI tests + python tests/test_ci_quick.py + + # Run plugin tests with pytest if available + python -m pytest tests/test_plugins.py -v --tb=short || python tests/test_plugins.py + + # Run approach tests + python tests/test_approaches.py + + integration-test: + runs-on: ubuntu-latest + needs: test + if: github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository + # Only run integration tests on PRs from the same repository (not forks) + # This ensures secrets are available + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Run integration test with OpenAI + if: env.OPENAI_API_KEY != '' + run: | + # Start OptILLM server + python optillm.py & + SERVER_PID=$! 
+ + # Wait for server + sleep 5 + + # Run simple integration test + python tests/test.py --approaches none --single-test "Simple Math Problem" --base-url http://localhost:8000/v1 --model gpt-4o-mini || true + + # Stop server + kill $SERVER_PID || true + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + continue-on-error: true \ No newline at end of file diff --git a/.gitignore b/.gitignore index 70e8202d..01215d5b 100644 --- a/.gitignore +++ b/.gitignore @@ -170,3 +170,4 @@ cython_debug/ scripts/results/ results/ +test_results.json diff --git a/README.md b/README.md index 06769407..3f05f324 100644 --- a/README.md +++ b/README.md @@ -565,6 +565,46 @@ called patchflows. We saw huge performance gains across all the supported patchf ![Results showing optillm mixture of agents approach used with patchflows](https://raw.githubusercontent.com/codelion/optillm/main/moa-patchwork-results.png) +## Testing + +OptILLM includes a comprehensive test suite to ensure reliability and compatibility. + +### Running Tests + +The main test suite can be run from the project root: +```bash +# Test all approaches with default test cases +python tests/test.py + +# Test specific approaches +python tests/test.py --approaches moa bon mcts + +# Run a single test +python tests/test.py --single-test "Simple Math Problem" +``` + +### Unit and Integration Tests + +Additional tests are available in the `tests/` directory: +```bash +# Run all tests (requires pytest) +./tests/run_tests.sh + +# Run specific test modules +pytest tests/test_plugins.py -v +pytest tests/test_api_compatibility.py -v +``` + +### CI/CD + +All tests are automatically run on pull requests via GitHub Actions. The workflow tests: +- Multiple Python versions (3.10, 3.11, 3.12) +- Unit tests for plugins and core functionality +- API compatibility tests +- Integration tests with various approaches + +See `tests/README.md` for more details on the test structure and how to write new tests. 
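
As a starting point for writing new tests, plugin entry points can be exercised against a stubbed client instead of a live API. The sketch below is illustrative only and is not the content of `tests/test_plugins.py`; `FakeClient` and its canned responses are hypothetical, while the `run()` signature matches `optillm/plugins/genselect_plugin.py`.

```python
# Illustrative sketch: unit-test a plugin entry point with a stubbed client.
import types

from optillm.plugins import genselect_plugin


class FakeClient:
    """Minimal stand-in for an OpenAI-compatible client (hypothetical)."""

    class _Completions:
        def create(self, model, messages, n=1, temperature=0.7, max_tokens=4096, **kwargs):
            if n > 1:
                # Candidate-generation call: return n distinct answers
                contents = [f"Candidate answer {i + 1}" for i in range(n)]
            else:
                # Comparison call: always pick the second candidate
                contents = ["COMPARISON: ...\n\nBEST CANDIDATE: 2\n\nREASONING: Clearest answer."]
            choices = [types.SimpleNamespace(message=types.SimpleNamespace(content=c))
                       for c in contents]
            usage = types.SimpleNamespace(completion_tokens=10 * len(contents))
            return types.SimpleNamespace(choices=choices, usage=usage)

    def __init__(self):
        self.chat = types.SimpleNamespace(completions=self._Completions())


def test_genselect_returns_selected_candidate():
    response, tokens = genselect_plugin.run(
        system_prompt="You are a helpful assistant.",
        initial_query="What is 2 + 2?",
        client=FakeClient(),
        model="dummy-model",
        request_config={"num_candidates": 3},
    )
    assert response == "Candidate answer 2"
    assert tokens > 0
```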
+ ## References - [Eliciting Fine-Tuned Transformer Capabilities via Inference-Time Techniques](https://arxiv.org/abs/2506.08060) - [AutoThink: efficient inference for reasoning LLMs](https://dx.doi.org/10.2139/ssrn.5253327) - [Implementation](optillm/autothink) diff --git a/optillm.py b/optillm.py index ef421160..32d28c4b 100644 --- a/optillm.py +++ b/optillm.py @@ -302,9 +302,9 @@ def execute_single_approach(approach, system_prompt, initial_query, client, mode if hasattr(request, 'json'): data = request.get_json() messages = data.get('messages', []) - # Copy all parameters except 'stream', 'model' , 'n' and 'messages' + # Copy all parameters except 'stream', 'model' and 'messages' kwargs = {k: v for k, v in data.items() - if k not in ['model', 'messages', 'stream', 'n', 'optillm_approach']} + if k not in ['model', 'messages', 'stream', 'optillm_approach']} response = none_approach(original_messages=messages, client=client, model=model, **kwargs) # For none approach, we return the response and a token count of 0 # since the full token count is already in the response @@ -641,17 +641,8 @@ def proxy(): contains_none = any(approach == 'none' for approach in approaches) if operation == 'SINGLE' and approaches[0] == 'none': - # For none approach with n>1, make n separate calls - if n > 1: - responses = [] - completion_tokens = 0 - for _ in range(n): - result, tokens = execute_single_approach(approaches[0], system_prompt, initial_query, client, model, request_config) - responses.append(result) - completion_tokens += tokens - result = responses - else: - result, completion_tokens = execute_single_approach(approaches[0], system_prompt, initial_query, client, model, request_config) + # Pass through the request including the n parameter + result, completion_tokens = execute_single_approach(approaches[0], system_prompt, initial_query, client, model, request_config) logger.debug(f'Direct proxy response: {result}') diff --git a/scripts/eval_optillmbench.py b/scripts/eval_optillmbench.py index 6d971376..1ad701f5 100644 --- a/scripts/eval_optillmbench.py +++ b/scripts/eval_optillmbench.py @@ -68,10 +68,12 @@ ] # Default test-time compute configuration for standard evaluation +# Using n=8 for all approaches to ensure fair comparison and memory efficiency DEFAULT_TEST_TIME_COMPUTE = [ - ("pass@1", "Baseline with 64 runs averaged", {"num_runs": 64}), - ("maj@64", "Majority Voting with k=64", {"approach": "majority_voting", "k": 64}), - ("genselect@64", "GenSelect with 64 candidates", {"approach": "genselect", "num_candidates": 64}) + ("avg@8", "Average of 8 parallel responses", {"n": 8}), + ("pass@8", "Pass@8 - success if any of 8 is correct", {"n": 8}), + ("maj@8", "Majority Voting with k=8", {"k": 8}), + ("genselect@8", "GenSelect with 8 candidates", {"num_candidates": 8}) ] def load_optillm_bench() -> datasets.Dataset: @@ -327,34 +329,181 @@ def evaluate_model( # Check if we need to do multiple runs (for pass@1 calculation) num_runs = approach_extra_body.get("num_runs", 1) if approach_extra_body else 1 + # Check if we're using n parameter for parallel generation + n_param = approach_extra_body.get("n", 1) if approach_extra_body else 1 - # Handle special approach names - actual_approach = approach - if approach == "pass@1": - actual_approach = "none" - elif approach == "maj@64": - actual_approach = "majority_voting" - elif approach == "genselect@64": - actual_approach = "genselect" - elif approach_extra_body and "approach" in approach_extra_body: - actual_approach = approach_extra_body["approach"] - - # 
Create model name with approach - handle special cases - if actual_approach == "none": + # Handle special approach names and create model names + if approach.startswith("avg@") or approach.startswith("pass@"): + # For avg@N and pass@N, use base model without any prefix full_model_name = model - elif actual_approach.startswith("thinkdeeper_"): + elif approach.startswith("maj@"): + # For majority voting, use the plugin prefix + full_model_name = f"majority_voting-{model}" + elif approach.startswith("genselect@"): + # For genselect, use the plugin prefix + full_model_name = f"genselect-{model}" + elif approach.startswith("thinkdeeper_"): # For thinkdeeper, use base model name (decoding is passed in extra_body) full_model_name = model - elif actual_approach.startswith("majority_voting"): - # For majority voting, use majority_voting prefix + elif approach.startswith("majority_voting"): + # For other majority voting configurations full_model_name = f"majority_voting-{model}" + elif approach == "none": + # For explicit none approach + full_model_name = model else: # Standard approach prefix - full_model_name = f"{actual_approach}-{model}" + full_model_name = f"{approach}-{model}" for example in tqdm(examples, desc=f"Evaluating {approach}"): + # For avg@N and pass@N with n parameter, we generate n responses in parallel + if n_param > 1 and (approach.startswith("avg@") or approach.startswith("pass@")): + try: + # Get appropriate prompt for the category + prompt = get_prompt_for_category(example['question'], example['category']) + + # Record start time + start_time = time.time() + + # Prepare extra_body parameters (excluding n) + extra_body = {"spl_learning": False} + if approach_extra_body: + extra_body_clean = {k: v for k, v in approach_extra_body.items() if k not in ["n", "approach"]} + extra_body.update(extra_body_clean) + + # Generate n responses - optillm handles n parameter properly + responses = [] + try: + # Make API call with n parameter + response = client.chat.completions.create( + model=full_model_name, + messages=[ + {"role": "system", "content": "You are a helpful AI assistant focused on providing precise answers in the requested format."}, + {"role": "user", "content": prompt} + ], + n=n_param, + temperature=0.7, # High temperature for diversity + max_tokens=4096, + extra_body=extra_body, + ) + + # Extract responses - optillm returns OpenAI-compatible format + responses = [(choice.message.content, time.time() - start_time) for choice in response.choices] + logger.debug(f"Generated {len(responses)} responses using n={n_param}") + + except Exception as e: + # If n parameter fails, fall back to sequential generation + logger.warning(f"Parallel generation failed: {type(e).__name__}: {str(e)}") + logger.info("Falling back to sequential generation") + for i in range(n_param): + try: + single_start = time.time() + response = client.chat.completions.create( + model=full_model_name, + messages=[ + {"role": "system", "content": "You are a helpful AI assistant focused on providing precise answers in the requested format."}, + {"role": "user", "content": prompt} + ], + temperature=0.7, + max_tokens=4096, + extra_body=extra_body, + ) + response_text = response.choices[0].message.content + responses.append((response_text, time.time() - single_start)) + except Exception as seq_error: + logger.error(f"Sequential generation {i+1}/{n_param} failed: {seq_error}") + responses.append((None, 0)) # Add failed response + + # Calculate total time + time_taken = time.time() - start_time + + # Evaluate all 
responses + run_results = [] + for response_text, _ in responses: + if response_text is not None: + processed_response = remove_thinking_blocks(response_text) + is_correct = evaluate_response( + processed_response, + example['answer'], + example['category'], + example['question'] + ) + run_results.append(is_correct) + else: + run_results.append(False) # Failed responses count as incorrect + + # Calculate success rate based on approach + if approach.startswith("avg@"): + # Average success rate + success_rate = sum(run_results) / len(run_results) if run_results else 0 + elif approach.startswith("pass@"): + # Pass@k: success if ANY response is correct + success_rate = 1.0 if any(run_results) else 0.0 + else: + # Shouldn't reach here, but default to average + success_rate = sum(run_results) / len(run_results) if run_results else 0 + + # Update metrics with average + metrics["total_correct"] += success_rate + metrics["total_time"] += time_taken + metrics["samples"] += 1 + + # Update category metrics + if example['category'] not in category_metrics: + category_metrics[example['category']] = { + "correct": 0, + "total": 0, + "time": 0 + } + category_metrics[example['category']]["correct"] += success_rate + category_metrics[example['category']]["total"] += 1 + category_metrics[example['category']]["time"] += time_taken + + # Record detailed result + detailed_results.append({ + "id": example['id'], + "category": example['category'], + "correct": success_rate, # Store success rate + "n_param": n_param, + "successes": sum(run_results), + "time_taken": time_taken, + "ground_truth": example['answer'] + }) + + except Exception as e: + logger.error(f"Error processing example {example['id']}: {e}") + # Count failed examples as incorrect + metrics["total_correct"] += 0 + metrics["total_time"] += 0 + metrics["samples"] += 1 + + # Update category metrics for failed example + if example['category'] not in category_metrics: + category_metrics[example['category']] = { + "correct": 0, + "total": 0, + "time": 0 + } + category_metrics[example['category']]["correct"] += 0 + category_metrics[example['category']]["total"] += 1 + category_metrics[example['category']]["time"] += 0 + + # Record detailed result for failed example + detailed_results.append({ + "id": example['id'], + "category": example['category'], + "correct": False, + "time_taken": 0, + "raw_response": f"ERROR: {str(e)}", + "processed_response": None, + "has_thinking": False, + "ground_truth": example['answer'], + "error": str(e) + }) + continue + # For pass@1, we need to run multiple times and calculate average - if num_runs > 1: + elif num_runs > 1: run_results = [] total_run_time = 0 @@ -366,10 +515,10 @@ def evaluate_model( # Record start time start_time = time.time() - # Prepare extra_body parameters (excluding num_runs) + # Prepare extra_body parameters (excluding num_runs and approach) extra_body = {"spl_learning": False} if approach_extra_body: - extra_body_clean = {k: v for k, v in approach_extra_body.items() if k != "num_runs"} + extra_body_clean = {k: v for k, v in approach_extra_body.items() if k not in ["num_runs", "approach"]} extra_body.update(extra_body_clean) # Make API call @@ -448,10 +597,11 @@ def evaluate_model( # Record start time start_time = time.time() - # Prepare extra_body parameters + # Prepare extra_body parameters (excluding approach) extra_body = {"spl_learning": False} if approach_extra_body: - extra_body.update(approach_extra_body) + extra_body_clean = {k: v for k, v in approach_extra_body.items() if k != "approach"} + 
extra_body.update(extra_body_clean) # Make API call response = client.chat.completions.create( @@ -518,6 +668,34 @@ def evaluate_model( except Exception as e: logger.error(f"Error processing example {example['id']}: {e}") + # Count failed examples as incorrect + metrics["total_correct"] += 0 # Failed = incorrect + metrics["total_time"] += 0 # No time recorded for failed attempts + metrics["samples"] += 1 + + # Update category metrics for failed example + if example['category'] not in category_metrics: + category_metrics[example['category']] = { + "correct": 0, + "total": 0, + "time": 0 + } + category_metrics[example['category']]["correct"] += 0 # Failed = incorrect + category_metrics[example['category']]["total"] += 1 + category_metrics[example['category']]["time"] += 0 + + # Record detailed result for failed example + detailed_results.append({ + "id": example['id'], + "category": example['category'], + "correct": False, + "time_taken": 0, + "raw_response": f"ERROR: {str(e)}", + "processed_response": None, + "has_thinking": False, + "ground_truth": example['answer'], + "error": str(e) + }) continue # Calculate final metrics @@ -528,6 +706,13 @@ def evaluate_model( "total_samples": metrics["samples"], } + # Log summary of failures if any + total_expected = len(examples) + failures = len([r for r in detailed_results if "error" in r]) + if failures > 0: + logger.warning(f"Approach {approach}: {failures}/{total_expected} examples failed due to errors") + logger.warning(f"Failed examples are counted as incorrect in accuracy calculation") + # Add category-specific metrics for category, cat_metrics in category_metrics.items(): final_metrics[f"{category}_accuracy"] = cat_metrics["correct"] / cat_metrics["total"] @@ -568,7 +753,7 @@ def generate_report(all_metrics: Dict[str, Dict[str, float]], output_dir: str, i report = [] # Check if this is the default test-time compute evaluation - is_default_test_time = set(all_metrics.keys()) == {"pass@1", "maj@64", "genselect@64"} + is_default_test_time = set(all_metrics.keys()) == {"avg@8", "pass@8", "maj@8", "genselect@8"} # Header if is_default_test_time: @@ -584,9 +769,11 @@ def generate_report(all_metrics: Dict[str, Dict[str, float]], output_dir: str, i if is_default_test_time: report.append("## Test-Time Compute Evaluation Results\n") report.append("This report evaluates the potential of test-time compute with:") - report.append("- **pass@1**: Baseline averaged over 64 runs (measures consistency)") - report.append("- **maj@64**: Majority voting with 64 candidates") - report.append("- **genselect@64**: Generative selection with 64 candidates\n") + report.append("- **avg@8**: Average success rate of 8 parallel responses") + report.append("- **pass@8**: Success if ANY of 8 responses is correct") + report.append("- **maj@8**: Majority voting with 8 candidates") + report.append("- **genselect@8**: Quality-based selection from 8 candidates\n") + report.append("All approaches use n=8 parallel generation (with sequential fallback) for fair comparison.\n") elif is_test_time_compute: report.append("This report evaluates test-time compute scaling approaches:") report.append("- **Sequential scaling**: ThinkDeeper with varying thinking token budgets") @@ -632,24 +819,35 @@ def generate_report(all_metrics: Dict[str, Dict[str, float]], output_dir: str, i # Add summary section for default test-time compute if is_default_test_time: report.append("\n## Summary") - if "pass@1" in all_metrics and "maj@64" in all_metrics and "genselect@64" in all_metrics: - pass1_acc = 
all_metrics["pass@1"]["accuracy"] * 100 - maj64_acc = all_metrics["maj@64"]["accuracy"] * 100 - genselect64_acc = all_metrics["genselect@64"]["accuracy"] * 100 + if all(metric in all_metrics for metric in ["avg@8", "pass@8", "maj@8", "genselect@8"]): + avg8_acc = all_metrics["avg@8"]["accuracy"] * 100 + pass8_acc = all_metrics["pass@8"]["accuracy"] * 100 + maj8_acc = all_metrics["maj@8"]["accuracy"] * 100 + genselect8_acc = all_metrics["genselect@8"]["accuracy"] * 100 report.append(f"\n**Key Metrics:**") - report.append(f"- **pass@1** (baseline averaged over 64 runs): {pass1_acc:.2f}%") - report.append(f"- **maj@64** (majority voting with 64 candidates): {maj64_acc:.2f}%") - report.append(f"- **genselect@64** (quality-based selection from 64 candidates): {genselect64_acc:.2f}%") + report.append(f"- **avg@8** (average of 8 responses): {avg8_acc:.2f}%") + report.append(f"- **pass@8** (success if any correct): {pass8_acc:.2f}%") + report.append(f"- **maj@8** (majority voting): {maj8_acc:.2f}%") + report.append(f"- **genselect@8** (quality-based selection): {genselect8_acc:.2f}%") - # Calculate improvements - if pass1_acc > 0: - maj_improvement = ((maj64_acc - pass1_acc) / pass1_acc) * 100 - genselect_improvement = ((genselect64_acc - pass1_acc) / pass1_acc) * 100 + # Calculate improvements over baseline (avg@8) + if avg8_acc > 0: + pass_improvement = ((pass8_acc - avg8_acc) / avg8_acc) * 100 + maj_improvement = ((maj8_acc - avg8_acc) / avg8_acc) * 100 + genselect_improvement = ((genselect8_acc - avg8_acc) / avg8_acc) * 100 - report.append(f"\n**Improvements over pass@1:**") - report.append(f"- maj@64: {'+' if maj_improvement > 0 else ''}{maj_improvement:.1f}%") - report.append(f"- genselect@64: {'+' if genselect_improvement > 0 else ''}{genselect_improvement:.1f}%") + report.append(f"\n**Improvements over avg@8 baseline:**") + report.append(f"- pass@8: {'+' if pass_improvement > 0 else ''}{pass_improvement:.1f}%") + report.append(f"- maj@8: {'+' if maj_improvement > 0 else ''}{maj_improvement:.1f}%") + report.append(f"- genselect@8: {'+' if genselect_improvement > 0 else ''}{genselect_improvement:.1f}%") + + # Show variance indicator + if pass8_acc > avg8_acc: + variance_ratio = (pass8_acc - avg8_acc) / avg8_acc * 100 + report.append(f"\n**Response Variance Indicator:**") + report.append(f"- Gap between pass@8 and avg@8: {variance_ratio:.1f}%") + report.append(f"- This indicates {'high' if variance_ratio > 50 else 'moderate' if variance_ratio > 20 else 'low'} variance in response quality") # Save report report_path = f"{output_dir}/evaluation_report.md" @@ -704,12 +902,25 @@ def main(): # Filter test-time compute approaches if specific ones are requested approaches_config = [a for a in TEST_TIME_COMPUTE_APPROACHES if a[0] in args.approaches] elif args.approaches: - # Specific approaches requested - approaches_config = [a for a in APPROACHES if a[0] in args.approaches] + # Specific approaches requested - check all available approach lists + all_available_approaches = APPROACHES + TEST_TIME_COMPUTE_APPROACHES + DEFAULT_TEST_TIME_COMPUTE + approaches_config = [] + for requested_approach in args.approaches: + found = False + for approach_tuple in all_available_approaches: + if approach_tuple[0] == requested_approach: + if approach_tuple not in approaches_config: # Avoid duplicates + approaches_config.append(approach_tuple) + found = True + break + if not found: + logger.warning(f"Approach '{requested_approach}' not found in any configuration") + if not approaches_config: + raise ValueError(f"No 
valid approaches found. Requested: {args.approaches}") else: # Default: Use the default test-time compute configuration approaches_config = DEFAULT_TEST_TIME_COMPUTE - logger.info("Using default test-time compute evaluation (pass@1, maj@64, genselect@64)") + logger.info("Using default test-time compute evaluation (avg@8, pass@8, maj@8, genselect@8)") # Store all metrics for final report all_metrics = {} diff --git a/test_results.json b/test_results.json deleted file mode 100644 index 8ad21a82..00000000 --- a/test_results.json +++ /dev/null @@ -1,128 +0,0 @@ -[ - { - "test_case": { - "name": "Arena Bench Hard", - "system_prompt": "", - "query": "Write a Python program to build an RL model to recite text from any position that the user provides, using only numpy." - }, - "results": [ - { - "approach": "cepo", - "result": [ - "Certainly! Below is a complete Python program that implements a reinforcement learning (RL) model to recite text from any position specified by the user, using only NumPy. The program includes the necessary classes and functions to create the environment, train the agent, and allow user interaction.\n\n```python\nimport numpy as np\n\nclass TextEnvironment:\n def __init__(self, text):\n self.text = text\n self.state = 0 # Starting position\n self.done = False\n\n def reset(self):\n self.state = 0\n self.done = False\n return self.state\n\n def step(self, action):\n if action == 0: # Recite next character\n if self.state < len(self.text) - 1:\n self.state += 1\n else:\n self.done = True\n elif action == 1: # Move backward\n if self.state > 0:\n self.state -= 1\n elif action == 2: # Stop\n self.done = True\n \n return self.state, self.done\n\nclass QLearningAgent:\n def __init__(self, state_size, action_size, learning_rate=0.1, discount_factor=0.9):\n self.q_table = np.zeros((state_size, action_size))\n self.learning_rate = learning_rate\n self.discount_factor = discount_factor\n\n def choose_action(self, state, epsilon):\n if np.random.rand() < epsilon: # Explore\n return np.random.randint(3) # Random action\n else: # Exploit\n return np.argmax(self.q_table[state]) # Best action\n\n def update_q_value(self, state, action, reward, next_state):\n best_next_action = np.argmax(self.q_table[next_state])\n td_target = reward + self.discount_factor * self.q_table[next_state][best_next_action]\n self.q_table[state][action] += self.learning_rate * (td_target - self.q_table[state][action])\n\ndef user_interface(text):\n print(\"Text: \", text)\n start_position = int(input(\"Enter the starting position (0 to {}): \".format(len(text) - 1)))\n return start_position\n\ndef train_agent(agent, env, episodes, epsilon):\n for episode in range(episodes):\n state = env.reset()\n total_reward = 0\n \n while not env.done:\n action = agent.choose_action(state, epsilon)\n next_state, done = env.step(action)\n reward = 1 if action == 0 else 0 # Reward for reciting the next character\n agent.update_q_value(state, action, reward, next_state)\n state = next_state\n total_reward += reward\n \n env.done = False # Reset done for the next episode\n print(f\"Episode {episode + 1}/{episodes}, Total Reward: {total_reward}\")\n\ndef test_agent(agent, env, start_position):\n env.state = start_position\n while not env.done:\n action = np.argmax(agent.q_table[env.state]) # Choose the best action\n next_state, done = env.step(action)\n if action == 0: # Recite next character\n print(env.text[env.state], end='', flush=True)\n env.state = next_state\n print() # New line after reciting\n\nif __name__ == 
\"__main__\":\n text = \"Hello, this is a reinforcement learning model to recite text.\"\n env = TextEnvironment(text)\n agent = QLearningAgent(state_size=len(text), action_size=3)\n\n # Train the agent\n train_agent(agent, env, episodes=1000, epsilon=0.1)\n\n # User interaction\n start_position = user_interface(text)\n print(\"Reciting from position {}: \".format(start_position), end='')\n test_agent(agent, env, start_position)\n```\n\n### Explanation of the Program\n\n1. **TextEnvironment Class**: This class manages the text and the current position of the agent. It provides methods to reset the environment and to take a step based on the action chosen by the agent.\n\n2. **QLearningAgent Class**: This class implements the Q-learning algorithm. It maintains a Q-table to store the expected rewards for each state-action pair and includes methods for choosing actions and updating Q-values based on the agent's experiences.\n\n3. **User Interface**: The `user_interface` function displays the text and prompts the user to enter a starting position from which they want the agent to recite the text.\n\n4. **Training Function**: The `train_agent` function trains the agent over a specified number of episodes, allowing it to learn the best actions to take in the environment.\n\n5. **Testing Function**: The `test_agent` function allows the agent to recite the text starting from the user-defined position based on the learned Q-values.\n\n6. **Main Execution Block**: The program initializes the text, environment, and agent, trains the agent, and then prompts the user for a starting position to recite the text.\n\n### How to Run the Program\n1. Copy the code into a Python environment (e.g., Jupyter Notebook, Python script).\n2. Run the program. It will train the agent and then ask for a starting position.\n3. Enter a valid starting position (between 0 and the length of the text minus one) to see the agent recite the text from that position.", - 24709 - ], - "time": 496.36777901649475, - "status": "success" - } - ] - }, - { - "test_case": { - "name": "Big Code Bench", - "system_prompt": "", - "query": "Calculates the average of the sums of absolute differences between each pair of consecutive numbers for all permutations of a given list. Each permutation is shuffled before calculating the differences. Args: - numbers (list): A list of numbers. Default is numbers from 1 to 10. The function should output with: float: The average of the sums of absolute differences for each shuffled permutation of the list. You should write self-contained code starting with: ``` import itertools from random import shuffle def task_func(numbers=list(range(1, 3))): ```" - }, - "results": [ - { - "approach": "cepo", - "result": [ - "Certainly! Below is the self-contained code that calculates the average of the sums of absolute differences between each pair of consecutive numbers for all permutations of a given list. 
Each permutation is shuffled before calculating the differences, as per your request.\n\n```python\nimport itertools\nfrom random import shuffle\n\ndef task_func(numbers=list(range(1, 11))): # Default is numbers from 1 to 10\n all_permutations = list(itertools.permutations(numbers)) # Generate all permutations\n total_sum = 0 # Initialize total sum of absolute differences\n num_permutations = len(all_permutations) # Count of permutations\n\n for perm in all_permutations: # Iterate through each permutation\n shuffled_perm = list(perm) # Convert to list to shuffle\n shuffle(shuffled_perm) # Shuffle the current permutation\n \n # Calculate the sum of absolute differences for the shuffled permutation\n abs_diff_sum = sum(abs(shuffled_perm[i] - shuffled_perm[i + 1]) for i in range(len(shuffled_perm) - 1))\n total_sum += abs_diff_sum # Accumulate the sum\n\n # Calculate the average of the sums of absolute differences\n average_difference = total_sum / num_permutations if num_permutations > 0 else 0\n \n return float(average_difference) # Return the average as a float\n\n# Example usage\nresult = task_func()\nprint(result)\n```\n\n### Explanation of the Code:\n1. **Imports**: We import `itertools` for generating permutations and `shuffle` from `random` to shuffle the permutations.\n2. **Function Definition**: The function `task_func` takes a list of numbers, defaulting to numbers from 1 to 10.\n3. **Generate Permutations**: We generate all permutations of the input list.\n4. **Initialize Variables**: We initialize `total_sum` to accumulate the sums of absolute differences and `num_permutations` to count the total permutations.\n5. **Iterate Through Permutations**: For each permutation, we convert it to a list and shuffle it.\n6. **Calculate Absolute Differences**: We calculate the sum of absolute differences between consecutive elements in the shuffled permutation.\n7. **Accumulate the Total**: We add the calculated sum to `total_sum`.\n8. **Calculate Average**: After processing all permutations, we calculate the average of the sums of absolute differences.\n9. **Return Result**: Finally, we return the average as a float.\n\nYou can run this code in a Python environment to see the final result.", - 14906 - ], - "time": 318.84958386421204, - "status": "success" - } - ] - }, - { - "test_case": { - "name": "Maths Problem", - "system_prompt": "You are a helpful AI assistant with expertise in mathematical and logical problem-solving.", - "query": "Maximize x + y subject to: x + 2y <= 10, x >= 0, y >= 0" - }, - "results": [ - { - "approach": "cepo", - "result": [ - "To maximize \\( z = x + y \\) subject to the constraints:\n\n1. \\( x + 2y \\leq 10 \\)\n2. \\( x \\geq 0 \\)\n3. 
\\( y \\geq 0 \\)\n\n### Step 1: Identify the Constraints\nThe constraints define a feasible region in the first quadrant of the Cartesian plane.\n\n### Step 2: Graph the Constraints\nTo graph the line defined by the constraint \\( x + 2y = 10 \\):\n- **Find the x-intercept**: Set \\( y = 0 \\):\n \\[\n x + 2(0) = 10 \\implies x = 10 \\quad \\text{(Point: (10, 0))}\n \\]\n- **Find the y-intercept**: Set \\( x = 0 \\):\n \\[\n 0 + 2y = 10 \\implies y = 5 \\quad \\text{(Point: (0, 5))}\n \\]\n\n### Step 3: Determine the Feasible Region\nThe feasible region is bounded by:\n- The line \\( x + 2y = 10 \\)\n- The x-axis (where \\( y = 0 \\))\n- The y-axis (where \\( x = 0 \\))\n\nThe feasible region is the area in the first quadrant below the line segment connecting (10, 0) and (0, 5).\n\n### Step 4: Identify the Corner Points\nThe corner points (vertices) of the feasible region are:\n1. \\( (0, 0) \\) (intersection of the axes)\n2. \\( (10, 0) \\) (x-intercept of the line)\n3. \\( (0, 5) \\) (y-intercept of the line)\n\n### Step 5: Evaluate the Objective Function at Each Corner Point\nNow we will evaluate \\( z = x + y \\) at each of the corner points:\n\n1. At \\( (0, 0) \\):\n \\[\n z = 0 + 0 = 0\n \\]\n\n2. At \\( (10, 0) \\):\n \\[\n z = 10 + 0 = 10\n \\]\n\n3. At \\( (0, 5) \\):\n \\[\n z = 0 + 5 = 5\n \\]\n\n### Step 6: Determine the Maximum Value\nNow we compare the values obtained:\n- At \\( (0, 0) \\): \\( z = 0 \\)\n- At \\( (10, 0) \\): \\( z = 10 \\)\n- At \\( (0, 5) \\): \\( z = 5 \\)\n\nThe maximum value of \\( z \\) is \\( 10 \\), which occurs at the point \\( (10, 0) \\).\n\n### Final Answer\nThe maximum value of \\( x + y \\) subject to the given constraints is **10**, which occurs at the point \\( (10, 0) \\).", - 14387 - ], - "time": 341.3320279121399, - "status": "success" - } - ] - }, - { - "test_case": { - "name": "GSM8K", - "system_prompt": "", - "query": "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?" - }, - "results": [ - { - "approach": "cepo", - "result": [ - "If there are 3 cars in the parking lot and 2 more cars arrive, the total number of cars in the parking lot is:\n\n\\[\n3 + 2 = 5\n\\]\n\nTherefore, there are **5 cars** in the parking lot.", - 5727 - ], - "time": 139.8697202205658, - "status": "success" - } - ] - }, - { - "test_case": { - "name": "r/LocalLLaMA", - "system_prompt": "", - "query": "I have a dish of potatoes. The following statements are true: No potatoes of mine, that are new, have >been boiled. All my potatoes in this dish are fit to eat. No unboiled potatoes of mine are fit to eat. Are there any new potatoes in this dish?" - }, - "results": [ - { - "approach": "cepo", - "result": [ - "Based on the analysis of the statements provided:\n\n1. **No potatoes of mine that are new have been boiled.** (This means new potatoes cannot be boiled.)\n2. **All my potatoes in this dish are fit to eat.** (This indicates that every potato present is edible.)\n3. 
**No unboiled potatoes of mine are fit to eat.** (This implies that if a potato is unboiled, it cannot be fit to eat.)\n\nFrom these statements, we can conclude the following:\n\n- Since all potatoes in the dish are fit to eat, there cannot be any unboiled potatoes in the dish (as unboiled potatoes are not fit to eat).\n- If there were new potatoes in the dish, they would be unboiled (because new potatoes have not been boiled), which would contradict the fact that all potatoes in the dish are fit to eat.\n\nTherefore, we conclude that there **are no new potatoes in this dish.**", - 9633 - ], - "time": 229.47916913032532, - "status": "success" - } - ] - }, - { - "test_case": { - "name": "reddit", - "system_prompt": "", - "query": "There are 24 volunteers. Over the next 3 weeks, each volunteer is assigned to a different task. There are 8 tasks. Each week, the volunteers switch tasks. Each task has 3 volunteers assigned to it. Volunteers cannot be assigned to the same task more than once, and volunteers cannot share the same task more than once." - }, - "results": [ - { - "approach": "cepo", - "result": [ - "Based on the constraints provided, here is the final assignment schedule for the 24 volunteers over the next 3 weeks, ensuring that each volunteer is assigned to a different task each week, with no repetitions of tasks or sharing of tasks among the same volunteers.\n\n### Final Assignment Schedule\n\n#### Week 1\n| Task | Volunteers |\n|------|------------------|\n| T1 | V1, V2, V3 |\n| T2 | V4, V5, V6 |\n| T3 | V7, V8, V9 |\n| T4 | V10, V11, V12 |\n| T5 | V13, V14, V15 |\n| T6 | V16, V17, V18 |\n| T7 | V19, V20, V21 |\n| T8 | V22, V23, V24 |\n\n#### Week 2\n| Task | Volunteers |\n|------|------------------|\n| T1 | V1, V4, V7 |\n| T2 | V2, V5, V8 |\n| T3 | V3, V6, V9 |\n| T4 | V10, V13, V16 |\n| T5 | V11, V14, V17 |\n| T6 | V12, V15, V18 |\n| T7 | V19, V22, V24 |\n| T8 | V20, V21, V23 |\n\n#### Week 3\n| Task | Volunteers |\n|------|------------------|\n| T1 | V4, V7, V10 |\n| T2 | V5, V8, V11 |\n| T3 | V6, V9, V12 |\n| T4 | V13, V16, V19 |\n| T5 | V14, V17, V20 |\n| T6 | V15, V18, V21 |\n| T7 | V22, V24, V1 |\n| T8 | V23, V2, V3 |\n\n### Summary of Constraints Met\n1. **Unique Task Assignment**: Each volunteer is assigned to a different task each week.\n2. **No Repeated Tasks**: No volunteer is assigned to the same task more than once over the 3 weeks.\n3. 
**No Shared Tasks**: No group of volunteers shares the same task more than once.\n\nThis schedule effectively meets all the requirements outlined in the question.", - 23518 - ], - "time": 547.4458949565887, - "status": "success" - } - ] - }, - { - "test_case": { - "name": "GH", - "system_prompt": "", - "query": "Find the largest possible real part of[(75+117i)z+\frac{96+144i}{z}]where z is a complex number with |z|=4" - }, - "results": [ - { - "approach": "cepo", - "result": [ - "To find the largest possible real part of the expression \n\n\\[\n(75 + 117i)z + \\frac{96 + 144i}{z}\n\\]\n\nwhere \\( z \\) is a complex number with \\( |z| = 4 \\), we can follow the steps outlined previously.\n\n### Step 1: Express \\( z \\) in polar form\nSince \\( |z| = 4 \\), we can express \\( z \\) as:\n\\[\nz = 4e^{i\\theta}\n\\]\nwhere \\( \\theta \\) is the argument of \\( z \\).\n\n### Step 2: Substitute \\( z \\) into the expression\nSubstituting \\( z \\) into the expression gives:\n\\[\n(75 + 117i)(4e^{i\\theta}) + \\frac{96 + 144i}{4e^{i\\theta}}.\n\\]\nThis simplifies to:\n\\[\n4(75 + 117i)e^{i\\theta} + \\frac{96 + 144i}{4} e^{-i\\theta}.\n\\]\nCalculating \\(\\frac{96 + 144i}{4}\\):\n\\[\n\\frac{96 + 144i}{4} = 24 + 36i.\n\\]\nThus, the expression becomes:\n\\[\n4(75 + 117i)e^{i\\theta} + (24 + 36i)e^{-i\\theta}.\n\\]\n\n### Step 3: Expand the expression\nNow we can express this as:\n\\[\n= 4(75\\cos \\theta + 117i\\cos \\theta + 75i\\sin \\theta - 117\\sin \\theta) + (24\\cos \\theta + 36i\\cos \\theta - 24i\\sin \\theta - 36\\sin \\theta).\n\\]\nCombining the real and imaginary parts:\n- Real part: \n\\[\n4(75\\cos \\theta - 117\\sin \\theta) + (24\\cos \\theta - 36\\sin \\theta)\n\\]\n- Imaginary part: \n\\[\n4(117\\cos \\theta + 75\\sin \\theta) + 36\\cos \\theta - 24\\sin \\theta\n\\]\n\nThus, the real part \\( R \\) is:\n\\[\nR = 4(75\\cos \\theta - 117\\sin \\theta) + (24\\cos \\theta - 36\\sin \\theta)\n\\]\n\\[\n= (300 + 24)\\cos \\theta + (-468 - 36)\\sin \\theta\n\\]\n\\[\n= 324\\cos \\theta - 504\\sin \\theta.\n\\]\n\n### Step 4: Maximize the real part\nTo maximize \\( R(\\theta) = 324\\cos \\theta - 504\\sin \\theta \\), we can rewrite it in the form:\n\\[\nR(\\theta) = A\\cos(\\theta + \\phi)\n\\]\nwhere:\n- \\( A = \\sqrt{(324)^2 + (-504)^2} \\)\n- \\( \\tan \\phi = \\frac{-504}{324} \\)\n\nCalculating \\( A \\):\n\\[\nA = \\sqrt{324^2 + 504^2} = \\sqrt{104976 + 254016} = \\sqrt{358992} = 598.32 \\text{ (approximately)}.\n\\]\n\n### Step 5: Find the maximum value\nThe maximum value of \\( R(\\theta) \\) occurs when \\( \\cos(\\theta + \\phi) = 1 \\):\n\\[\n\\text{Maximum } R = A = 598.32.\n\\]\n\n### Conclusion\nThus, the largest possible real part of the expression \\((75 + 117i)z + \\frac{96 + 144i}{z}\\) where \\(|z| = 4\\) is approximately:\n\n\\[\n\\boxed{598.32}.\n\\] \n\nThis is the correct answer to the question.", - 19637 - ], - "time": 407.97162795066833, - "status": "success" - } - ] - } -] \ No newline at end of file diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 00000000..ecde79cb --- /dev/null +++ b/tests/README.md @@ -0,0 +1,107 @@ +# OptILLM Tests + +This directory contains tests for the OptILLM project. 
+ +## Structure + +- `test.py` - Main comprehensive test suite for all OptILLM approaches +- `test_cases.json` - Test cases for the main test suite +- `test_plugins.py` - Unit tests for plugin functionality +- `test_api_compatibility.py` - Tests for OpenAI API compatibility +- `test_n_parameter.py` - Tests for n parameter functionality (multiple completions) +- `test_approaches.py` - Unit tests for approach modules (no model inference required) +- `test_ci_quick.py` - Quick CI tests for imports and basic functionality +- `run_tests.sh` - Convenience script to run all tests +- `requirements.txt` - Test dependencies (pytest, etc.) + +## Running Tests + +### Prerequisites + +1. Install test dependencies: + ```bash + pip install -r tests/requirements.txt + ``` + +2. Start the OptILLM server: + ```bash + python optillm.py + ``` + +### Run All Tests + +```bash +./tests/run_tests.sh +``` + +### Run Specific Tests + +```bash +# Unit tests only (no server required) +pytest tests/test_plugins.py + +# API tests (requires running server) +pytest tests/test_api_compatibility.py + +# N parameter test +python tests/test_n_parameter.py +``` + +### Run with pytest + +```bash +# Run all tests in the tests directory +pytest tests/ -v + +# Run with coverage +pytest tests/ --cov=optillm --cov-report=html +``` + +## Main Test Suite + +The main test suite (`test.py`) is located in the tests directory along with its test data (`test_cases.json`). + +To run the main test suite from the project root: +```bash +python tests/test.py +``` + +Or from within the tests directory: +```bash +cd tests +python test.py +``` + +## CI/CD + +Tests are automatically run on: +- Every push to the main branch +- Every pull request + +The GitHub Actions workflow (`.github/workflows/test.yml`) runs: +1. Quick CI tests (imports and basic functionality) +2. Unit tests for plugins and approaches (no model inference required) +3. Integration tests with OpenAI API (only on PRs from same repository with secrets) + +### CI Testing Strategy + +To keep CI fast and reliable: +- Unit tests don't require model inference or a running server +- Integration tests only run with real API keys when available +- The main `test.py` is kept in the root for comprehensive local testing +- For CI, we use simplified tests that verify structure and imports + +## Writing New Tests + +1. Add unit tests to appropriate files in `tests/` +2. Follow pytest conventions (test functions start with `test_`) +3. Use fixtures for common setup +4. 
Add integration tests that require the server to `test_api_compatibility.py` + +## Test Coverage + +To generate a coverage report: +```bash +pytest tests/ --cov=optillm --cov-report=html +open htmlcov/index.html +``` \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..d2eec39f --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +# Tests for OptILLM \ No newline at end of file diff --git a/tests/requirements.txt b/tests/requirements.txt new file mode 100644 index 00000000..968e4482 --- /dev/null +++ b/tests/requirements.txt @@ -0,0 +1,4 @@ +pytest>=7.0.0 +pytest-asyncio>=0.21.0 +pytest-timeout>=2.1.0 +pytest-mock>=3.10.0 \ No newline at end of file diff --git a/tests/run_tests.sh b/tests/run_tests.sh new file mode 100755 index 00000000..534145af --- /dev/null +++ b/tests/run_tests.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# Run all tests for OptILLM + +set -e # Exit on error + +echo "Running OptILLM Tests" +echo "====================" + +# Check if optillm server is running +check_server() { + curl -s http://localhost:8000/v1/health > /dev/null 2>&1 +} + +# Colors for output +GREEN='\033[0;32m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Check Python version +echo "Python version:" +python --version + +# Install test dependencies if needed +if ! python -c "import pytest" 2>/dev/null; then + echo -e "${YELLOW}Installing test dependencies...${NC}" + pip install -r tests/requirements.txt +fi + +# Check if optillm server is running +if ! check_server; then + echo -e "${YELLOW}Warning: OptILLM server not detected at localhost:8000${NC}" + echo "Some integration tests may fail. Start the server with: python optillm.py" + echo "" +fi + +# Run unit tests +echo -e "\n${GREEN}Running unit tests...${NC}" +python -m pytest tests/test_plugins.py -v + +# Run API tests if server is available +if check_server; then + echo -e "\n${GREEN}Running API compatibility tests...${NC}" + python -m pytest tests/test_api_compatibility.py -v +else + echo -e "\n${YELLOW}Skipping API tests (server not running)${NC}" +fi + +# Run n parameter test +if check_server; then + echo -e "\n${GREEN}Running n parameter test...${NC}" + python tests/test_n_parameter.py +else + echo -e "\n${YELLOW}Skipping n parameter test (server not running)${NC}" +fi + +# Run main test suite with a simple test +echo -e "\n${GREEN}Running main test suite (simple test only)...${NC}" +cd "$(dirname "$0")/.." 
# Go to project root +python tests/test.py --approaches none bon --single-test "Simple Math Problem" + +echo -e "\n${GREEN}All tests completed!${NC}" \ No newline at end of file diff --git a/test.py b/tests/test.py similarity index 79% rename from test.py rename to tests/test.py index 188f677b..62989d41 100644 --- a/test.py +++ b/tests/test.py @@ -2,11 +2,15 @@ import json import time import os +import sys from concurrent.futures import ThreadPoolExecutor, as_completed from typing import List, Dict import logging from openai import OpenAI +# Add parent directory to path to import optillm modules +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + from optillm.litellm_wrapper import LiteLLMWrapper from optillm.mcts import chat_with_mcts from optillm.bon import best_of_n_sampling @@ -61,8 +65,23 @@ def load_test_cases(file_path: str) -> List[Dict]: def run_approach(approach_name: str, system_prompt: str, query: str, client, model: str) -> Dict: start_time = time.time() try: - approach_func = APPROACHES[approach_name] - result = approach_func(system_prompt, query, client, model) + if approach_name == 'none': + # Direct pass-through for 'none' approach + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": query}) + + response = client.chat.completions.create( + model=model, + messages=messages, + temperature=0.7 + ) + result = (response.choices[0].message.content, response.usage.total_tokens) + else: + approach_func = APPROACHES[approach_name] + result = approach_func(system_prompt, query, client, model) + end_time = time.time() return { 'approach': approach_name, @@ -118,12 +137,22 @@ def print_summary(results: List[Dict]): def main(): parser = argparse.ArgumentParser(description="Test different LLM inference approaches.") - parser.add_argument("--test_cases", type=str, default="test_cases.json", help="Path to test cases JSON file") + parser.add_argument("--test_cases", type=str, default=None, help="Path to test cases JSON file") parser.add_argument("--approaches", nargs='+', default=list(APPROACHES.keys()), help="Approaches to test") parser.add_argument("--model", type=str, default="gpt-4o-mini", help="Model to use for testing") parser.add_argument("--base-url", type=str, default=None, help="The base_url for the OpenAI API compatible endpoint") parser.add_argument("--single-test", type=str, default=None, help="Name of a single test case to run") args = parser.parse_args() + + # Set default test_cases path relative to this script + if args.test_cases is None: + script_dir = os.path.dirname(os.path.abspath(__file__)) + args.test_cases = os.path.join(script_dir, "test_cases.json") + + # If using local inference mode, override model to a local model + if os.environ.get("OPTILLM_API_KEY") == "optillm" and args.model == "gpt-4o-mini": + args.model = "Qwen/Qwen2.5-0.5B-Instruct" + logger.info(f"Using local model: {args.model}") test_cases = load_test_cases(args.test_cases) diff --git a/tests/test_api_compatibility.py b/tests/test_api_compatibility.py new file mode 100644 index 00000000..e33d6e92 --- /dev/null +++ b/tests/test_api_compatibility.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +""" +Test API compatibility with OpenAI format +""" + +import pytest +import os +from openai import OpenAI +import json + + +@pytest.fixture +def client(): + """Create OpenAI client for optillm proxy""" + return OpenAI( + api_key=os.environ.get("OPENAI_API_KEY", "test-key"), + 
base_url="http://localhost:8000/v1" + ) + + +def test_basic_completion(client): + """Test basic chat completion""" + response = client.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Say hello"} + ], + max_tokens=10 + ) + + assert hasattr(response, 'choices') + assert len(response.choices) > 0 + assert hasattr(response.choices[0], 'message') + assert hasattr(response.choices[0].message, 'content') + + +def test_n_parameter(client): + """Test n parameter for multiple completions""" + n = 3 + response = client.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "user", "content": "Write a one-line joke"} + ], + n=n, + temperature=0.8, + max_tokens=50 + ) + + assert len(response.choices) == n + # Check all responses are different (with high temperature) + contents = [choice.message.content for choice in response.choices] + assert len(set(contents)) > 1 # At least some different responses + + +def test_approach_prefix(client): + """Test approach prefix in model name""" + response = client.chat.completions.create( + model="moa-gpt-4o-mini", + messages=[ + {"role": "user", "content": "What is 2+2?"} + ], + max_tokens=10 + ) + + assert hasattr(response, 'choices') + assert len(response.choices) > 0 + + +def test_extra_body_approach(client): + """Test approach specification via extra_body""" + response = client.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "user", "content": "What is 2+2?"} + ], + extra_body={"optillm_approach": "bon"}, + max_tokens=10 + ) + + assert hasattr(response, 'choices') + assert len(response.choices) > 0 + + +def test_streaming(client): + """Test streaming response""" + stream = client.chat.completions.create( + model="gpt-4o-mini", + messages=[ + {"role": "user", "content": "Count from 1 to 5"} + ], + stream=True, + max_tokens=50 + ) + + chunks = list(stream) + assert len(chunks) > 0 + # First chunk should have role + assert chunks[0].choices[0].delta.role == "assistant" + # Later chunks should have content + content_chunks = [chunk.choices[0].delta.content for chunk in chunks if chunk.choices[0].delta.content] + assert len(content_chunks) > 0 + + +if __name__ == "__main__": + # Run basic tests if pytest not available + client = OpenAI( + api_key=os.environ.get("OPENAI_API_KEY", "test-key"), + base_url="http://localhost:8000/v1" + ) + + print("Running basic API compatibility tests...") + + try: + test_basic_completion(client) + print("✅ Basic completion test passed") + except Exception as e: + print(f"❌ Basic completion test failed: {e}") + + try: + test_n_parameter(client) + print("✅ N parameter test passed") + except Exception as e: + print(f"❌ N parameter test failed: {e}") + + try: + test_approach_prefix(client) + print("✅ Approach prefix test passed") + except Exception as e: + print(f"❌ Approach prefix test failed: {e}") + + print("\nDone!") \ No newline at end of file diff --git a/tests/test_approaches.py b/tests/test_approaches.py new file mode 100644 index 00000000..10ea67f9 --- /dev/null +++ b/tests/test_approaches.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python3 +""" +Simplified approach tests for CI/CD +Tests the basic structure of approaches without requiring actual model inference +""" + +import pytest +import sys +import os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from optillm.mcts import chat_with_mcts +from optillm.bon import best_of_n_sampling +from optillm.moa 
import mixture_of_agents +from optillm.self_consistency import advanced_self_consistency_approach +from optillm.reread import re2_approach +from optillm.cot_reflection import cot_reflection +from optillm.plansearch import plansearch +from optillm.leap import leap + + +class MockClient: + """Mock OpenAI client for testing""" + def __init__(self): + self.chat = self.Chat() + + class Chat: + def __init__(self): + self.completions = self.Completions() + + class Completions: + def create(self, **kwargs): + class MockChoice: + class Message: + content = "Test response: 2 + 2 = 4" + message = Message() + + class MockUsage: + completion_tokens = 10 + total_tokens = 20 + + class MockResponse: + choices = [MockChoice()] + usage = MockUsage() + + return MockResponse() + + +def test_approach_imports(): + """Test that all approaches can be imported""" + approaches = [ + chat_with_mcts, + best_of_n_sampling, + mixture_of_agents, + advanced_self_consistency_approach, + re2_approach, + cot_reflection, + plansearch, + leap + ] + + for approach in approaches: + assert callable(approach), f"{approach.__name__} is not callable" + + print("✅ All approaches imported successfully") + + +def test_basic_approach_calls(): + """Test basic approach calls with mock client""" + client = MockClient() + system_prompt = "You are a helpful assistant." + query = "What is 2 + 2?" + model = "mock-model" + + # Test approaches that should work with mock client + simple_approaches = [ + ("re2_approach", re2_approach), + ("cot_reflection", cot_reflection), + ("leap", leap), + ] + + for name, approach_func in simple_approaches: + try: + result = approach_func(system_prompt, query, client, model) + assert result is not None, f"{name} returned None" + assert isinstance(result, tuple), f"{name} should return a tuple" + assert len(result) == 2, f"{name} should return (response, tokens)" + print(f"✅ {name} basic test passed") + except Exception as e: + print(f"❌ {name} basic test failed: {e}") + + +def test_approach_parameters(): + """Test that approaches handle parameters correctly""" + # Test that approaches accept the expected parameters + import inspect + + approaches = { + "chat_with_mcts": chat_with_mcts, + "best_of_n_sampling": best_of_n_sampling, + "mixture_of_agents": mixture_of_agents, + "advanced_self_consistency_approach": advanced_self_consistency_approach, + "re2_approach": re2_approach, + "cot_reflection": cot_reflection, + "plansearch": plansearch, + "leap": leap, + } + + for name, func in approaches.items(): + sig = inspect.signature(func) + params = list(sig.parameters.keys()) + + # Check required parameters + required_params = ["system_prompt", "initial_query", "client", "model"] + for param in required_params: + assert param in params, f"{name} missing required parameter: {param}" + + print(f"✅ {name} has correct parameters") + + +if __name__ == "__main__": + print("Running approach tests...") + + test_approach_imports() + test_basic_approach_calls() + test_approach_parameters() + + print("\nAll tests completed!") \ No newline at end of file diff --git a/test_cases.json b/tests/test_cases.json similarity index 94% rename from test_cases.json rename to tests/test_cases.json index e557d628..7b10ae43 100644 --- a/test_cases.json +++ b/tests/test_cases.json @@ -38,5 +38,10 @@ "name": "GenSelect Math", "system_prompt": "You are a helpful AI assistant with expertise in mathematical reasoning.", "query": "A farmer has 17 sheep. All but 9 die. How many sheep does the farmer have left? Explain your reasoning step by step." 
+ }, + { + "name": "Simple Math Problem", + "system_prompt": "You are a helpful assistant.", + "query": "What is 2 + 2?" } ] diff --git a/tests/test_ci_quick.py b/tests/test_ci_quick.py new file mode 100644 index 00000000..332ae409 --- /dev/null +++ b/tests/test_ci_quick.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +""" +Quick CI test to verify basic functionality +""" + +import time +import sys +import os + +start_time = time.time() +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +# Import key modules to ensure they load +try: + from optillm import parse_combined_approach, execute_single_approach, plugin_approaches + print("✅ Core optillm module imported successfully") +except Exception as e: + print(f"❌ Failed to import core modules: {e}") + sys.exit(1) + +# Test importing approach modules +try: + from optillm.mcts import chat_with_mcts + from optillm.bon import best_of_n_sampling + from optillm.moa import mixture_of_agents + print("✅ Approach modules imported successfully") +except Exception as e: + print(f"❌ Failed to import approach modules: {e}") + +# Test plugin existence +try: + import optillm.plugins.memory_plugin + import optillm.plugins.readurls_plugin + import optillm.plugins.privacy_plugin + import optillm.plugins.genselect_plugin + import optillm.plugins.majority_voting_plugin + print("✅ Plugin modules exist and can be imported") +except Exception as e: + print(f"❌ Plugin import test failed: {e}") + +# Test approach parsing +try: + # Define known approaches for testing + known_approaches = ["moa", "bon", "mcts", "cot_reflection"] + plugin_approaches_test = {"memory": True, "readurls": True} + + test_cases = [ + ("moa-gpt-4", "SINGLE", ["moa"], "gpt-4"), + ("bon|moa|mcts-gpt-4", "OR", ["bon", "moa", "mcts"], "gpt-4"), + ("memory&moa-gpt-4", "AND", ["memory", "moa"], "gpt-4"), + ] + + for combined, expected_op, expected_approaches, expected_model in test_cases: + operation, approaches, model = parse_combined_approach(combined, known_approaches, plugin_approaches_test) + assert operation == expected_op, f"Expected operation {expected_op}, got {operation}" + assert approaches == expected_approaches, f"Expected {expected_approaches}, got {approaches}" + assert model == expected_model, f"Expected {expected_model}, got {model}" + + print("✅ Approach parsing tests passed") +except Exception as e: + print(f"❌ Approach parsing test failed: {e}") + +print(f"\n✅ All CI quick tests completed!") +print(f"Total test time: {time.time() - start_time:.2f}s") \ No newline at end of file diff --git a/tests/test_n_parameter.py b/tests/test_n_parameter.py new file mode 100755 index 00000000..31ecbf4f --- /dev/null +++ b/tests/test_n_parameter.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +""" +Test script to verify n parameter works correctly with optillm +""" + +import os +import sys +from openai import OpenAI +import json + +def test_n_parameter(model="gpt-4o-mini", n_values=[1, 2, 3]): + """ + Test the n parameter with different values + """ + # Initialize OpenAI client with optillm proxy + client = OpenAI( + api_key=os.environ.get("OPENAI_API_KEY", ""), + base_url="http://localhost:8000/v1" + ) + + test_prompt = "Write a haiku about coding" + + for n in n_values: + print(f"\nTesting n={n} with model {model}") + print("-" * 50) + + try: + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": "You are a creative poet."}, + {"role": "user", "content": test_prompt} + ], + n=n, + temperature=0.8, + max_tokens=100 + ) + + 
# Check response structure + print(f"Response type: {type(response)}") + print(f"Number of choices: {len(response.choices)}") + + # Print all generated responses + for i, choice in enumerate(response.choices): + print(f"\nChoice {i+1}:") + print(choice.message.content) + + # Verify we got the expected number of responses + if len(response.choices) == n: + print(f"\n✅ SUCCESS: Got {n} responses as expected") + else: + print(f"\n❌ FAIL: Expected {n} responses, got {len(response.choices)}") + + except Exception as e: + print(f"\n❌ ERROR: {type(e).__name__}: {str(e)}") + +def main(): + """ + Main test function + """ + print("Testing n parameter support in optillm") + print("=" * 50) + + # Test with different models if available + models_to_test = [] + + # Check for available models + if os.environ.get("OPENAI_API_KEY"): + models_to_test.append("gpt-4o-mini") + + # Check for MLX models + if os.environ.get("OPTILLM_API_KEY") == "optillm": + # Add MLX model if running with local inference + models_to_test.append("Qwen/Qwen2.5-1.5B-Instruct") + + if not models_to_test: + print("No models available to test. Set OPENAI_API_KEY or OPTILLM_API_KEY=optillm") + return + + for model in models_to_test: + print(f"\n\nTesting model: {model}") + print("=" * 50) + test_n_parameter(model) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tests/test_plugins.py b/tests/test_plugins.py new file mode 100644 index 00000000..49fab049 --- /dev/null +++ b/tests/test_plugins.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 +""" +Test plugin functionality +""" + +import pytest +import sys +import os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from optillm.plugins import load_plugin, is_plugin_approach +from optillm.plugins.memory_plugin import should_enable_memory + + +def test_plugin_loading(): + """Test loading plugins""" + # Test loading a known plugin + plugin = load_plugin("memory") + assert plugin is not None + assert hasattr(plugin, 'run') + + # Test loading non-existent plugin returns None + plugin = load_plugin("nonexistent") + assert plugin is None + + +def test_is_plugin_approach(): + """Test plugin approach detection""" + # Known plugins + assert is_plugin_approach("memory") == True + assert is_plugin_approach("readurls") == True + assert is_plugin_approach("privacy") == True + + # Non-plugins + assert is_plugin_approach("mcts") == False + assert is_plugin_approach("bon") == False + assert is_plugin_approach("nonexistent") == False + + +def test_memory_plugin_detection(): + """Test memory plugin auto-detection""" + # Test with context length exceeding threshold + long_context = "x" * 500000 # 500k chars + assert should_enable_memory(long_context) == True + + # Test with short context + short_context = "Hello world" + assert should_enable_memory(short_context) == False + + # Test with explicit false in config + assert should_enable_memory(long_context, {"memory": False}) == False + + # Test with explicit true in config + assert should_enable_memory(short_context, {"memory": True}) == True + + +def test_genselect_plugin(): + """Test genselect plugin exists""" + plugin = load_plugin("genselect") + assert plugin is not None + assert hasattr(plugin, 'run') + assert hasattr(plugin, 'DEFAULT_NUM_CANDIDATES') + + +def test_majority_voting_plugin(): + """Test majority voting plugin""" + plugin = load_plugin("majority_voting") + assert plugin is not None + assert hasattr(plugin, 'run') + assert hasattr(plugin, 'extract_answer') + assert hasattr(plugin, 
'normalize_answer') + + +if __name__ == "__main__": + print("Running plugin tests...") + + try: + test_plugin_loading() + print("✅ Plugin loading test passed") + except Exception as e: + print(f"❌ Plugin loading test failed: {e}") + + try: + test_is_plugin_approach() + print("✅ Plugin approach detection test passed") + except Exception as e: + print(f"❌ Plugin approach detection test failed: {e}") + + try: + test_memory_plugin_detection() + print("✅ Memory plugin detection test passed") + except Exception as e: + print(f"❌ Memory plugin detection test failed: {e}") + + try: + test_genselect_plugin() + print("✅ GenSelect plugin test passed") + except Exception as e: + print(f"❌ GenSelect plugin test failed: {e}") + + try: + test_majority_voting_plugin() + print("✅ Majority voting plugin test passed") + except Exception as e: + print(f"❌ Majority voting plugin test failed: {e}") + + print("\nDone!") \ No newline at end of file From a2cd56c6f469ac3119a63c44d45d0a5f2bafd4de Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Tue, 22 Jul 2025 14:11:38 +0800 Subject: [PATCH 06/10] Update test_plugins.py --- tests/test_plugins.py | 108 +++++++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 50 deletions(-) diff --git a/tests/test_plugins.py b/tests/test_plugins.py index 49fab049..b863e197 100644 --- a/tests/test_plugins.py +++ b/tests/test_plugins.py @@ -3,94 +3,102 @@ Test plugin functionality """ -import pytest import sys import os +import importlib + +# Try to import pytest, but don't fail if it's not available +try: + import pytest +except ImportError: + pytest = None + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from optillm.plugins import load_plugin, is_plugin_approach -from optillm.plugins.memory_plugin import should_enable_memory +from optillm import plugin_approaches, load_plugins -def test_plugin_loading(): - """Test loading plugins""" - # Test loading a known plugin - plugin = load_plugin("memory") - assert plugin is not None - assert hasattr(plugin, 'run') +def test_plugin_module_imports(): + """Test that plugin modules can be imported""" + plugin_modules = [ + 'optillm.plugins.memory_plugin', + 'optillm.plugins.readurls_plugin', + 'optillm.plugins.privacy_plugin', + 'optillm.plugins.genselect_plugin', + 'optillm.plugins.majority_voting_plugin' + ] - # Test loading non-existent plugin returns None - plugin = load_plugin("nonexistent") - assert plugin is None + for module_name in plugin_modules: + try: + module = importlib.import_module(module_name) + assert hasattr(module, 'run'), f"{module_name} missing 'run' function" + assert hasattr(module, 'SLUG'), f"{module_name} missing 'SLUG' attribute" + except ImportError as e: + if pytest: + pytest.fail(f"Failed to import {module_name}: {e}") + else: + raise AssertionError(f"Failed to import {module_name}: {e}") -def test_is_plugin_approach(): - """Test plugin approach detection""" - # Known plugins - assert is_plugin_approach("memory") == True - assert is_plugin_approach("readurls") == True - assert is_plugin_approach("privacy") == True +def test_plugin_approach_detection(): + """Test plugin approach detection after loading""" + # Load plugins first + load_plugins() - # Non-plugins - assert is_plugin_approach("mcts") == False - assert is_plugin_approach("bon") == False - assert is_plugin_approach("nonexistent") == False + # Check if known plugins are loaded + expected_plugins = ["memory", "readurls", "privacy"] + for plugin_name in expected_plugins: + assert plugin_name in 
plugin_approaches, f"Plugin {plugin_name} not loaded" -def test_memory_plugin_detection(): - """Test memory plugin auto-detection""" - # Test with context length exceeding threshold - long_context = "x" * 500000 # 500k chars - assert should_enable_memory(long_context) == True - - # Test with short context - short_context = "Hello world" - assert should_enable_memory(short_context) == False - - # Test with explicit false in config - assert should_enable_memory(long_context, {"memory": False}) == False - - # Test with explicit true in config - assert should_enable_memory(short_context, {"memory": True}) == True +def test_memory_plugin_structure(): + """Test memory plugin has required structure""" + import optillm.plugins.memory_plugin as plugin + assert hasattr(plugin, 'run') + assert hasattr(plugin, 'SLUG') + assert plugin.SLUG == "memory" + assert hasattr(plugin, 'Memory') # Check for Memory class def test_genselect_plugin(): - """Test genselect plugin exists""" - plugin = load_plugin("genselect") - assert plugin is not None + """Test genselect plugin module""" + import optillm.plugins.genselect_plugin as plugin assert hasattr(plugin, 'run') + assert hasattr(plugin, 'SLUG') assert hasattr(plugin, 'DEFAULT_NUM_CANDIDATES') + assert plugin.SLUG == "genselect" def test_majority_voting_plugin(): - """Test majority voting plugin""" - plugin = load_plugin("majority_voting") - assert plugin is not None + """Test majority voting plugin module""" + import optillm.plugins.majority_voting_plugin as plugin assert hasattr(plugin, 'run') + assert hasattr(plugin, 'SLUG') assert hasattr(plugin, 'extract_answer') assert hasattr(plugin, 'normalize_answer') + assert plugin.SLUG == "majority_voting" if __name__ == "__main__": print("Running plugin tests...") try: - test_plugin_loading() - print("✅ Plugin loading test passed") + test_plugin_module_imports() + print("✅ Plugin module imports test passed") except Exception as e: - print(f"❌ Plugin loading test failed: {e}") + print(f"❌ Plugin module imports test failed: {e}") try: - test_is_plugin_approach() + test_plugin_approach_detection() print("✅ Plugin approach detection test passed") except Exception as e: print(f"❌ Plugin approach detection test failed: {e}") try: - test_memory_plugin_detection() - print("✅ Memory plugin detection test passed") + test_memory_plugin_structure() + print("✅ Memory plugin structure test passed") except Exception as e: - print(f"❌ Memory plugin detection test failed: {e}") + print(f"❌ Memory plugin structure test failed: {e}") try: test_genselect_plugin() From e403abed0fd49326d915c70f619ee573a14a6887 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Tue, 22 Jul 2025 22:14:07 +0800 Subject: [PATCH 07/10] Enhance majority voting plugin and update eval config Upgrades the majority voting plugin with category-aware answer extraction, adaptive temperature control, improved normalization, response quality filtering, and smart fallback strategies. Updates the default test-time compute configuration in eval_optillmbench.py to use 5 candidates instead of 8 for fairer comparison and memory efficiency, and revises related reporting logic and documentation. 
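
As a reviewer aid, the snippet below is a minimal, self-contained sketch of the quality-weighted voting idea this patch introduces: each candidate's normalized answer is tallied by its quality score rather than by a simple count, and the answer with the largest total weight wins. The helper name `weighted_majority` and the toy scores are illustrative only; the actual plugin layers category detection, answer normalization, and fallback extraction on top of this, as the diff below shows.

```python
from collections import Counter
from typing import List, Tuple

def weighted_majority(candidates: List[Tuple[str, float]]) -> Tuple[str, float]:
    """Return (winning_answer, confidence) from (normalized_answer, quality) pairs."""
    votes = Counter()
    for answer, quality in candidates:
        votes[answer] += quality  # weight each vote by its quality score in [0, 1]

    best_answer, best_weight = votes.most_common(1)[0]
    total_weight = sum(votes.values())
    confidence = best_weight / total_weight if total_weight else 0.0
    return best_answer, confidence

# Toy example: three candidates agree on "42"; one lower-quality response says "41".
print(weighted_majority([("42", 0.9), ("42", 0.8), ("41", 0.5), ("42", 1.0)]))
# -> ('42', 0.84375)
```

Because the tally is weighted, one complete, confident response can outweigh several truncated or hedged ones, which is the intent of the response quality filtering described above.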
--- optillm/plugins/majority_voting_plugin.py | 473 ++++++++++++++-------- scripts/eval_optillmbench.py | 68 ++-- 2 files changed, 335 insertions(+), 206 deletions(-) diff --git a/optillm/plugins/majority_voting_plugin.py b/optillm/plugins/majority_voting_plugin.py index 311072b7..d1faaf2d 100644 --- a/optillm/plugins/majority_voting_plugin.py +++ b/optillm/plugins/majority_voting_plugin.py @@ -1,12 +1,12 @@ """ -Majority Voting Plugin for OptILLM +Majority Voting Plugin V2 for OptILLM -This plugin implements a majority voting approach where k candidate solutions -are generated and the most frequent answer is selected. This is particularly -effective for problems with discrete answers (math, coding, multiple choice). - -The plugin uses the OpenAI API's n parameter to generate multiple responses -efficiently in a single API call. +Enhanced version with: +- Category-aware answer extraction +- Adaptive temperature control +- Improved answer normalization +- Response quality filtering +- Smart fallback strategies """ import re @@ -14,45 +14,136 @@ from typing import Tuple, Dict, Any, List, Optional from collections import Counter import json +from fractions import Fraction logger = logging.getLogger(__name__) # Plugin identifier SLUG = "majority_voting" -# Default number of candidates to generate -DEFAULT_K = 6 +# Default configuration +DEFAULT_K = 8 +DEFAULT_TEMPERATURE = 0.3 # Lower for better consistency -# Default temperature for candidate generation -DEFAULT_TEMPERATURE = 0.6 +# Category-specific temperatures +CATEGORY_TEMPERATURES = { + "gsm8k": 0.2, # Math needs precision + "mmlu_math": 0.3, # Multiple choice math + "boolq": 0.3, # Boolean questions + "aqua_rat": 0.3, # Reasoning with choices + "default": 0.3 # General default +} -def extract_answer(text: str) -> Optional[str]: +def detect_category(query: str) -> str: + """ + Try to detect the problem category from the query. + + Returns: + Category string or 'default' if unknown """ - Extract the answer from a response text. + query_lower = query.lower() + + # GSM8K patterns + if "###" in query or ("calculate" in query_lower and any(word in query_lower for word in ["total", "sum", "difference", "product"])): + return "gsm8k" + + # MMLU patterns (multiple choice) + if re.search(r'\b[A-E]\s*[:\)]\s*', query) or "which of the following" in query_lower: + return "mmlu_math" + + # BoolQ patterns + if query_lower.strip().endswith("?") and any(word in query_lower for word in ["is", "are", "was", "were", "does", "do", "did", "can", "could", "will", "would"]): + return "boolq" + + # AQUA-RAT patterns + if re.search(r'options?:\s*[A-E]', query, re.IGNORECASE): + return "aqua_rat" - This function looks for common answer patterns in the response: - 1. Text after "Answer:" or "Final Answer:" - 2. Text within \\boxed{} (LaTeX format) - 3. Numbers at the end of the response - 4. The last line if it's short (likely the answer) + return "default" + + +def extract_answer_by_category(text: str, category: str) -> Optional[str]: + """ + Extract answer based on problem category. 
Args: - text: The response text to extract answer from + text: Response text + category: Problem category Returns: - The extracted answer or None if no clear answer found + Extracted answer or None + """ + text = text.strip() + + if category == "gsm8k": + # Look for ### pattern specifically + match = re.search(r'###\s*(-?\d*\.?\d+)', text) + if match: + return match.group(1) + + # Fallback: look for "answer is" pattern with number + match = re.search(r'answer\s+is\s*:?\s*\$?(-?\d*\.?\d+)', text, re.IGNORECASE) + if match: + return match.group(1) + + elif category == "mmlu_math": + # Look for letter choices first + patterns = [ + r'\b([A-E])\b(?:\s*\)|:|\.)?(?:\s|$)', # Letter with optional punctuation + r'(?:answer|choice|option)\s*(?:is\s*)?:?\s*([A-E])\b', + r'^([A-E])$', # Just a letter + r'\b([0-3])\b(?:\s*\)|:|\.)?(?:\s|$)', # Index (0-3) + ] + + for pattern in patterns: + match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE) + if match: + return match.group(1) + + elif category == "boolq": + # Extract boolean answers + text_lower = text.lower() + + # Direct true/false + if re.search(r'\b(true|false)\b', text_lower): + match = re.search(r'\b(true|false)\b', text_lower) + return match.group(1) + + # Yes/no + if re.search(r'\b(yes|no)\b', text_lower): + match = re.search(r'\b(yes|no)\b', text_lower) + return match.group(1) + + elif category == "aqua_rat": + # Similar to MMLU but may have more complex patterns + patterns = [ + r'(?:answer|option)\s*(?:is\s*)?:?\s*\(?([A-E])\)?', + r'\b([A-E])\s*\)', + r'^([A-E])$', + ] + + for pattern in patterns: + match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE) + if match: + return match.group(1) + + # If category-specific extraction fails, fall back to generic + return extract_answer(text) + + +def extract_answer(text: str) -> Optional[str]: + """ + Generic answer extraction fallback. + Enhanced from original version. 
""" - # Remove any trailing whitespace text = text.strip() - # Pattern 1: Look for LaTeX boxed format first (handle both \boxed and \\boxed) + # LaTeX boxed format boxed_match = re.search(r'\\{1,2}boxed\{([^}]+)\}', text) if boxed_match: - answer = boxed_match.group(1).strip() - logger.debug(f"Extracted boxed answer: {answer}") - return answer + return boxed_match.group(1).strip() - # Pattern 2: Look for "Answer:" or "Final Answer:" patterns + # Answer patterns answer_patterns = [ r'(?:final\s+)?answer\s*[:=]\s*(.+?)(?:\n|$)', r'(?:the\s+)?(?:final\s+)?answer\s+is\s*[:=]?\s*(.+?)(?:\n|$)', @@ -62,98 +153,156 @@ def extract_answer(text: str) -> Optional[str]: for pattern in answer_patterns: match = re.search(pattern, text, re.IGNORECASE) if match: - answer = match.group(1).strip() - # Clean up the answer - answer = answer.rstrip('.,;') + answer = match.group(1).strip().rstrip('.,;') if answer: - logger.debug(f"Extracted answer using pattern: {answer}") return answer - # Pattern 3: Look for standalone numbers (useful for math problems) - # Check the last few lines for a number + # Check last line if short lines = text.split('\n') - for line in reversed(lines[-3:]): # Check last 3 lines - line = line.strip() - # Match numbers (including decimals, fractions, negative numbers) - number_match = re.match(r'^-?\d+\.?\d*$|^-?\d+/\d+$', line) - if number_match: - logger.debug(f"Extracted number answer: {line}") - return line - - # Pattern 4: For multiple choice, look for single letter answers - # Check this before the generic last line check - mc_patterns = [ - r'(?:the\s+)?(?:correct\s+)?(?:answer|option)\s+is\s+([A-E])(?:\b|$)', - r'(?:choose|select|pick)\s+(?:option\s+)?([A-E])(?:\b|$)', - r'\b([A-E])\s*\)\s*[A-Za-z]+.*is\s+(?:the\s+)?(?:correct|right)', - r'^([A-E])$', # Just a letter on its own line - ] - - for pattern in mc_patterns: - mc_match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE) - if mc_match: - answer = mc_match.group(1).upper() - logger.debug(f"Extracted multiple choice answer: {answer}") - return answer - - # Pattern 5: If the last line is short (< 50 chars), it might be the answer if lines: last_line = lines[-1].strip() if last_line and len(last_line) < 50 and not last_line.endswith(':'): - logger.debug(f"Using last line as answer: {last_line}") return last_line - logger.warning("Could not extract a clear answer from the response") return None -def normalize_answer(answer: str) -> str: + +def normalize_answer_enhanced(answer: str, category: str = "default") -> str: """ - Normalize an answer for comparison. - - This helps ensure that equivalent answers are treated as the same: - - Converts to lowercase - - Removes extra whitespace - - Removes quotes - - Normalizes number formats + Enhanced answer normalization with category awareness. Args: - answer: The answer to normalize + answer: Raw answer text + category: Problem category for specific normalization Returns: - The normalized answer + Normalized answer """ - # Convert to lowercase + if not answer: + return "" + + # Basic normalization answer = answer.lower().strip() - - # Remove quotes answer = answer.strip('"\'') - - # Normalize whitespace answer = ' '.join(answer.split()) - # Try to normalize numbers - try: - # Check if it's a float - if '.' 
in answer: + # Category-specific normalization + if category in ["gsm8k", "mmlu_math"] and re.match(r'^-?\d*\.?\d+$', answer): + # Numeric normalization + try: + # Handle different number formats + answer = answer.replace(',', '') # Remove commas + + # Convert to float for consistent representation num = float(answer) - # Format to remove trailing zeros - answer = f"{num:g}" - else: - # Try integer - num = int(answer) - answer = str(num) - except ValueError: - # Not a number, keep as is - pass - - # Handle yes/no variations - if answer in ['yes', 'yeah', 'yep', 'true', 'correct']: - answer = 'yes' - elif answer in ['no', 'nope', 'false', 'incorrect']: - answer = 'no' + + # Handle integers + if num.is_integer(): + return str(int(num)) + else: + # Format to remove trailing zeros + return f"{num:g}" + + except ValueError: + pass + + elif category == "mmlu_math": + # Ensure single letter answers are uppercase + if len(answer) == 1 and answer.isalpha(): + return answer.upper() + + # Extract letter from "option A", "choice B", etc. + match = re.match(r'(?:option|choice|answer)\s*([a-e])', answer, re.IGNORECASE) + if match: + return match.group(1).upper() + + elif category == "boolq": + # Boolean normalization + true_values = ['yes', 'true', 'correct', '1', 't', 'y'] + false_values = ['no', 'false', 'incorrect', '0', 'f', 'n'] + + if answer in true_values: + return 'true' + elif answer in false_values: + return 'false' + + # Handle mathematical expressions + if category in ["gsm8k", "mmlu_math"]: + # Try to evaluate simple fractions + fraction_match = re.match(r'^(\d+)/(\d+)$', answer) + if fraction_match: + try: + frac = Fraction(int(fraction_match.group(1)), int(fraction_match.group(2))) + return str(float(frac)) + except: + pass + + # Handle percentages + percent_match = re.match(r'^(\d*\.?\d+)%$', answer) + if percent_match: + try: + return str(float(percent_match.group(1)) / 100) + except: + pass return answer + +def score_response_quality(response: str, category: str) -> float: + """ + Score response quality for weighted voting. + + Returns: + Quality score between 0 and 1 + """ + if not response: + return 0.0 + + score = 1.0 + + # Check for completeness + if len(response.strip()) < 10: + score *= 0.5 + + # Check for uncertainty markers + uncertainty_words = ['maybe', 'probably', 'might', 'could be', 'not sure', 'guess'] + for word in uncertainty_words: + if word in response.lower(): + score *= 0.7 + + # Category-specific checks + if category == "gsm8k": + # Should have ### marker + if "###" not in response: + score *= 0.8 + elif category in ["mmlu_math", "aqua_rat"]: + # Should have clear choice indication + if not re.search(r'\b[A-E]\b', response): + score *= 0.8 + + # Check if response seems cut off + if response.strip().endswith(('...', 'Therefore,', 'So,', 'Thus,')): + score *= 0.5 + + return score + + +def add_self_consistency_prompt(system_prompt: str, query: str, category: str) -> str: + """ + Add format instructions to encourage consistency. 
+ """ + format_instructions = { + "gsm8k": "\n\nIMPORTANT: After showing your work, provide your final numerical answer after ### on a new line.", + "mmlu_math": "\n\nIMPORTANT: After your reasoning, clearly state your choice as a single letter (A, B, C, D, or E).", + "boolq": "\n\nIMPORTANT: After your analysis, clearly state your answer as either 'true' or 'false'.", + "aqua_rat": "\n\nIMPORTANT: After solving the problem, clearly indicate your choice with the letter (A, B, C, D, or E).", + "default": "\n\nIMPORTANT: Clearly state your final answer at the end of your response." + } + + instruction = format_instructions.get(category, format_instructions["default"]) + return system_prompt + instruction + + def run( system_prompt: str, initial_query: str, @@ -162,45 +311,40 @@ def run( request_config: Dict[str, Any] = None ) -> Tuple[str, int]: """ - Main entry point for the majority voting plugin. + Enhanced majority voting with category awareness and better extraction. + """ + logger.info("Starting enhanced majority voting process") - Generates k candidate solutions and returns the most frequent answer. + # Detect category + category = detect_category(initial_query) + logger.info(f"Detected category: {category}") - Args: - system_prompt: System prompt for the model - initial_query: User's query - client: OpenAI-compatible client instance - model: Model identifier - request_config: Additional configuration parameters - - Returns: - Tuple of (response_text, completion_tokens_used) - """ - logger.info("Starting majority voting process") + # Extract parameters + k = request_config.get('k', DEFAULT_K) if request_config else DEFAULT_K + + # Use category-specific temperature + base_temperature = CATEGORY_TEMPERATURES.get(category, DEFAULT_TEMPERATURE) + temperature = request_config.get('temperature', base_temperature) if request_config else base_temperature - # Extract parameters from request_config - k = DEFAULT_K - temperature = DEFAULT_TEMPERATURE + max_tokens = request_config.get('max_tokens', 4096) if request_config else 4096 - if request_config: - k = request_config.get('k', DEFAULT_K) - # Allow overriding temperature if needed - temperature = request_config.get('temperature', DEFAULT_TEMPERATURE) - # Respect max_tokens if provided - max_tokens = request_config.get('max_tokens', 4096) - else: - max_tokens = 4096 + logger.info(f"Generating {k} candidates with temperature={temperature} for category={category}") - logger.info(f"Generating {k} candidates with temperature={temperature}") + # Add self-consistency prompt + enhanced_system_prompt = add_self_consistency_prompt(system_prompt, initial_query, category) # Prepare messages messages = [ - {"role": "system", "content": system_prompt}, + {"role": "system", "content": enhanced_system_prompt}, {"role": "user", "content": initial_query} ] + # Generate candidates + candidates = [] + total_tokens = 0 + try: - # Generate k candidates in a single API call using n parameter + # Try parallel generation first response = client.chat.completions.create( model=model, messages=messages, @@ -209,20 +353,12 @@ def run( max_tokens=max_tokens ) - # Extract all candidate responses candidates = [choice.message.content for choice in response.choices] total_tokens = response.usage.completion_tokens - logger.info(f"Generated {len(candidates)} candidates using n parameter. 
Tokens used: {total_tokens}") - except Exception as e: - logger.warning(f"n parameter not supported by provider: {str(e)}") - logger.info(f"Falling back to generating {k} candidates one by one") - - # Fallback: Generate candidates one by one in a loop - candidates = [] - total_tokens = 0 - + logger.warning(f"Parallel generation failed: {str(e)}") + # Fallback to sequential for i in range(k): try: response = client.chat.completions.create( @@ -233,61 +369,54 @@ def run( ) candidates.append(response.choices[0].message.content) total_tokens += response.usage.completion_tokens - logger.debug(f"Generated candidate {i+1}/{k}") - - except Exception as fallback_error: - logger.error(f"Error generating candidate {i+1}: {str(fallback_error)}") + except Exception as err: + logger.error(f"Error generating candidate {i+1}: {str(err)}") continue - - if not candidates: - logger.error("Failed to generate any candidates") - return "Error: Could not generate any candidates", 0 - - logger.info(f"Generated {len(candidates)} candidates using fallback method. Total tokens used: {total_tokens}") - # Extract answers from each candidate - answers = [] - answer_to_response = {} # Map normalized answers to full responses + if not candidates: + return "Error: Could not generate any candidates", 0 + + # Extract and normalize answers with quality scores + answer_data = [] # List of (normalized_answer, raw_answer, response, quality_score) for i, candidate in enumerate(candidates): - answer = extract_answer(candidate) + # Extract answer using category-aware extraction + answer = extract_answer_by_category(candidate, category) + if answer: - normalized = normalize_answer(answer) - answers.append(normalized) - # Keep the first full response for each unique answer - if normalized not in answer_to_response: - answer_to_response[normalized] = candidate - logger.debug(f"Candidate {i+1} answer: {answer} (normalized: {normalized})") + normalized = normalize_answer_enhanced(answer, category) + quality = score_response_quality(candidate, category) + answer_data.append((normalized, answer, candidate, quality)) + logger.debug(f"Candidate {i+1}: {answer} -> {normalized} (quality: {quality:.2f})") else: logger.warning(f"Could not extract answer from candidate {i+1}") - if not answers: - logger.warning("No answers could be extracted from any candidate") - # Return the first candidate as fallback - return candidates[0] if candidates else "Error: No candidates generated", total_tokens + if not answer_data: + # Fallback: return highest quality response + quality_scores = [(score_response_quality(c, category), c) for c in candidates] + quality_scores.sort(reverse=True) + return quality_scores[0][1], total_tokens - # Count answer frequencies - answer_counts = Counter(answers) - logger.info(f"Answer distribution: {dict(answer_counts)}") + # Count weighted votes + weighted_votes = Counter() + answer_to_response = {} - # Get the most common answer - most_common_answer, count = answer_counts.most_common(1)[0] - confidence = count / len(answers) + for normalized, raw, response, quality in answer_data: + weighted_votes[normalized] += quality + # Keep the highest quality response for each answer + if normalized not in answer_to_response or quality > answer_to_response[normalized][1]: + answer_to_response[normalized] = (response, quality) - logger.info(f"Most common answer: '{most_common_answer}' with {count}/{len(answers)} votes ({confidence:.1%} confidence)") + # Get the answer with highest weighted votes + most_common_answer, weighted_score = 
weighted_votes.most_common(1)[0] - # Get the full response corresponding to the most common answer - winning_response = answer_to_response.get(most_common_answer, candidates[0]) + # Calculate confidence + total_weight = sum(weighted_votes.values()) + confidence = weighted_score / total_weight if total_weight > 0 else 0 - # Log voting summary to console instead of adding to response - logger.info("Majority Voting Summary:") - logger.info(f" - Generated {len(candidates)} candidates") - logger.info(f" - Most common answer: {most_common_answer}") - logger.info(f" - Votes: {count}/{len(answers)} ({confidence:.1%} confidence)") + logger.info(f"Most common answer: '{most_common_answer}' with weighted score {weighted_score:.2f} ({confidence:.1%} confidence)") - if len(answer_counts) > 1: - other_answers = [f"{ans} ({cnt} votes)" for ans, cnt in answer_counts.items() if ans != most_common_answer] - logger.info(f" - Other answers: {', '.join(other_answers)}") + # Return the best response for the winning answer + winning_response = answer_to_response[most_common_answer][0] - # Return only the full response from the winning answer return winning_response, total_tokens \ No newline at end of file diff --git a/scripts/eval_optillmbench.py b/scripts/eval_optillmbench.py index 1ad701f5..e1073204 100644 --- a/scripts/eval_optillmbench.py +++ b/scripts/eval_optillmbench.py @@ -68,12 +68,12 @@ ] # Default test-time compute configuration for standard evaluation -# Using n=8 for all approaches to ensure fair comparison and memory efficiency +# Using n=5 for all approaches to ensure fair comparison and memory efficiency DEFAULT_TEST_TIME_COMPUTE = [ - ("avg@8", "Average of 8 parallel responses", {"n": 8}), - ("pass@8", "Pass@8 - success if any of 8 is correct", {"n": 8}), - ("maj@8", "Majority Voting with k=8", {"k": 8}), - ("genselect@8", "GenSelect with 8 candidates", {"num_candidates": 8}) + ("avg@5", "Average of 5 parallel responses", {"n": 5}), + ("pass@5", "Pass@5 - success if any of 5 is correct", {"n": 5}), + ("maj@5", "Majority Voting with k=5", {"k": 5}), + ("genselect@5", "GenSelect with 5 candidates", {"num_candidates": 5}) ] def load_optillm_bench() -> datasets.Dataset: @@ -753,7 +753,7 @@ def generate_report(all_metrics: Dict[str, Dict[str, float]], output_dir: str, i report = [] # Check if this is the default test-time compute evaluation - is_default_test_time = set(all_metrics.keys()) == {"avg@8", "pass@8", "maj@8", "genselect@8"} + is_default_test_time = set(all_metrics.keys()) == {"avg@5", "pass@5", "maj@5", "genselect@5"} # Header if is_default_test_time: @@ -769,11 +769,11 @@ def generate_report(all_metrics: Dict[str, Dict[str, float]], output_dir: str, i if is_default_test_time: report.append("## Test-Time Compute Evaluation Results\n") report.append("This report evaluates the potential of test-time compute with:") - report.append("- **avg@8**: Average success rate of 8 parallel responses") - report.append("- **pass@8**: Success if ANY of 8 responses is correct") - report.append("- **maj@8**: Majority voting with 8 candidates") - report.append("- **genselect@8**: Quality-based selection from 8 candidates\n") - report.append("All approaches use n=8 parallel generation (with sequential fallback) for fair comparison.\n") + report.append("- **avg@5**: Average success rate of 5 parallel responses") + report.append("- **pass@5**: Success if ANY of 5 responses is correct") + report.append("- **maj@5**: Majority voting with 5 candidates") + report.append("- **genselect@5**: Quality-based selection 
from 5 candidates\n") + report.append("All approaches use n=5 parallel generation (with sequential fallback) for fair comparison.\n") elif is_test_time_compute: report.append("This report evaluates test-time compute scaling approaches:") report.append("- **Sequential scaling**: ThinkDeeper with varying thinking token budgets") @@ -819,34 +819,34 @@ def generate_report(all_metrics: Dict[str, Dict[str, float]], output_dir: str, i # Add summary section for default test-time compute if is_default_test_time: report.append("\n## Summary") - if all(metric in all_metrics for metric in ["avg@8", "pass@8", "maj@8", "genselect@8"]): - avg8_acc = all_metrics["avg@8"]["accuracy"] * 100 - pass8_acc = all_metrics["pass@8"]["accuracy"] * 100 - maj8_acc = all_metrics["maj@8"]["accuracy"] * 100 - genselect8_acc = all_metrics["genselect@8"]["accuracy"] * 100 + if all(metric in all_metrics for metric in ["avg@5", "pass@5", "maj@5", "genselect@5"]): + avg5_acc = all_metrics["avg@5"]["accuracy"] * 100 + pass5_acc = all_metrics["pass@5"]["accuracy"] * 100 + maj5_acc = all_metrics["maj@5"]["accuracy"] * 100 + genselect5_acc = all_metrics["genselect@5"]["accuracy"] * 100 report.append(f"\n**Key Metrics:**") - report.append(f"- **avg@8** (average of 8 responses): {avg8_acc:.2f}%") - report.append(f"- **pass@8** (success if any correct): {pass8_acc:.2f}%") - report.append(f"- **maj@8** (majority voting): {maj8_acc:.2f}%") - report.append(f"- **genselect@8** (quality-based selection): {genselect8_acc:.2f}%") + report.append(f"- **avg@5** (average of 5 responses): {avg5_acc:.2f}%") + report.append(f"- **pass@5** (success if any correct): {pass5_acc:.2f}%") + report.append(f"- **maj@5** (majority voting): {maj5_acc:.2f}%") + report.append(f"- **genselect@5** (quality-based selection): {genselect5_acc:.2f}%") - # Calculate improvements over baseline (avg@8) - if avg8_acc > 0: - pass_improvement = ((pass8_acc - avg8_acc) / avg8_acc) * 100 - maj_improvement = ((maj8_acc - avg8_acc) / avg8_acc) * 100 - genselect_improvement = ((genselect8_acc - avg8_acc) / avg8_acc) * 100 - - report.append(f"\n**Improvements over avg@8 baseline:**") - report.append(f"- pass@8: {'+' if pass_improvement > 0 else ''}{pass_improvement:.1f}%") - report.append(f"- maj@8: {'+' if maj_improvement > 0 else ''}{maj_improvement:.1f}%") - report.append(f"- genselect@8: {'+' if genselect_improvement > 0 else ''}{genselect_improvement:.1f}%") + # Calculate improvements over baseline (avg@5) + if avg5_acc > 0: + pass_improvement = ((pass5_acc - avg5_acc) / avg5_acc) * 100 + maj_improvement = ((maj5_acc - avg5_acc) / avg5_acc) * 100 + genselect_improvement = ((genselect5_acc - avg5_acc) / avg5_acc) * 100 + + report.append(f"\n**Improvements over avg@5 baseline:**") + report.append(f"- pass@5: {'+' if pass_improvement > 0 else ''}{pass_improvement:.1f}%") + report.append(f"- maj@5: {'+' if maj_improvement > 0 else ''}{maj_improvement:.1f}%") + report.append(f"- genselect@5: {'+' if genselect_improvement > 0 else ''}{genselect_improvement:.1f}%") # Show variance indicator - if pass8_acc > avg8_acc: - variance_ratio = (pass8_acc - avg8_acc) / avg8_acc * 100 + if pass5_acc > avg5_acc: + variance_ratio = (pass5_acc - avg5_acc) / avg5_acc * 100 report.append(f"\n**Response Variance Indicator:**") - report.append(f"- Gap between pass@8 and avg@8: {variance_ratio:.1f}%") + report.append(f"- Gap between pass@5 and avg@5: {variance_ratio:.1f}%") report.append(f"- This indicates {'high' if variance_ratio > 50 else 'moderate' if variance_ratio > 20 else 'low'} 
variance in response quality") # Save report @@ -920,7 +920,7 @@ def main(): else: # Default: Use the default test-time compute configuration approaches_config = DEFAULT_TEST_TIME_COMPUTE - logger.info("Using default test-time compute evaluation (avg@8, pass@8, maj@8, genselect@8)") + logger.info("Using default test-time compute evaluation (avg@5, pass@5, maj@5, genselect@5)") # Store all metrics for final report all_metrics = {} From 351411bee65eb02ce909b6573f3b61759dddd767 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Wed, 23 Jul 2025 20:14:10 +0800 Subject: [PATCH 08/10] Unify temperature and simplify majority voting logic Replaced category-specific temperature settings with a unified temperature of 0.6 across majority voting and evaluation scripts for consistency. Simplified answer extraction and majority voting logic in the plugin to match evaluation script, removing quality scoring and normalization heuristics. --- optillm/plugins/majority_voting_plugin.py | 356 +++++----------------- scripts/eval_optillmbench.py | 8 +- 2 files changed, 79 insertions(+), 285 deletions(-) diff --git a/optillm/plugins/majority_voting_plugin.py b/optillm/plugins/majority_voting_plugin.py index d1faaf2d..2f05ce32 100644 --- a/optillm/plugins/majority_voting_plugin.py +++ b/optillm/plugins/majority_voting_plugin.py @@ -14,7 +14,6 @@ from typing import Tuple, Dict, Any, List, Optional from collections import Counter import json -from fractions import Fraction logger = logging.getLogger(__name__) @@ -23,16 +22,7 @@ # Default configuration DEFAULT_K = 8 -DEFAULT_TEMPERATURE = 0.3 # Lower for better consistency - -# Category-specific temperatures -CATEGORY_TEMPERATURES = { - "gsm8k": 0.2, # Math needs precision - "mmlu_math": 0.3, # Multiple choice math - "boolq": 0.3, # Boolean questions - "aqua_rat": 0.3, # Reasoning with choices - "default": 0.3 # General default -} +DEFAULT_TEMPERATURE = 0.6 # Unified temperature for consistency def detect_category(query: str) -> str: """ @@ -62,245 +52,61 @@ def detect_category(query: str) -> str: return "default" -def extract_answer_by_category(text: str, category: str) -> Optional[str]: + + +def extract_answer_simple(response: str, category: str) -> Optional[str]: """ - Extract answer based on problem category. - - Args: - text: Response text - category: Problem category - - Returns: - Extracted answer or None + Extract answer using same logic as evaluation script for consistency. 
""" - text = text.strip() + if not response: + return None + + # Remove thinking blocks if present + response = re.sub(r'.*?', '', response, flags=re.DOTALL).strip() if category == "gsm8k": - # Look for ### pattern specifically - match = re.search(r'###\s*(-?\d*\.?\d+)', text) - if match: - return match.group(1) - - # Fallback: look for "answer is" pattern with number - match = re.search(r'answer\s+is\s*:?\s*\$?(-?\d*\.?\d+)', text, re.IGNORECASE) + # Extract number after ### + match = re.search(r'###\s*(-?\d*\.?\d+)', response) if match: return match.group(1) - - elif category == "mmlu_math": - # Look for letter choices first - patterns = [ - r'\b([A-E])\b(?:\s*\)|:|\.)?(?:\s|$)', # Letter with optional punctuation - r'(?:answer|choice|option)\s*(?:is\s*)?:?\s*([A-E])\b', - r'^([A-E])$', # Just a letter - r'\b([0-3])\b(?:\s*\)|:|\.)?(?:\s|$)', # Index (0-3) - ] - - for pattern in patterns: - match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE) - if match: - return match.group(1) - - elif category == "boolq": - # Extract boolean answers - text_lower = text.lower() - - # Direct true/false - if re.search(r'\b(true|false)\b', text_lower): - match = re.search(r'\b(true|false)\b', text_lower) - return match.group(1) - - # Yes/no - if re.search(r'\b(yes|no)\b', text_lower): - match = re.search(r'\b(yes|no)\b', text_lower) - return match.group(1) - + elif category == "aqua_rat": - # Similar to MMLU but may have more complex patterns + # For AQUA-RAT, be more flexible in extraction + response_upper = response.upper() + + # Try to find letter choices (A-E) patterns = [ - r'(?:answer|option)\s*(?:is\s*)?:?\s*\(?([A-E])\)?', - r'\b([A-E])\s*\)', - r'^([A-E])$', + r'\b([A-E])\b(?!\w)', # Single letter not part of word + r'(?:answer|choice|option)\s*:?\s*([A-E])\b', + r'\(([A-E])\)', # Letter in parentheses + r'^([A-E])$', # Just the letter ] for pattern in patterns: - match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE) + match = re.search(pattern, response_upper, re.IGNORECASE | re.MULTILINE) if match: return match.group(1) - - # If category-specific extraction fails, fall back to generic - return extract_answer(text) - - -def extract_answer(text: str) -> Optional[str]: - """ - Generic answer extraction fallback. - Enhanced from original version. - """ - text = text.strip() - - # LaTeX boxed format - boxed_match = re.search(r'\\{1,2}boxed\{([^}]+)\}', text) - if boxed_match: - return boxed_match.group(1).strip() - - # Answer patterns - answer_patterns = [ - r'(?:final\s+)?answer\s*[:=]\s*(.+?)(?:\n|$)', - r'(?:the\s+)?(?:final\s+)?answer\s+is\s*[:=]?\s*(.+?)(?:\n|$)', - r'(?:therefore|thus|so)\s*,?\s*(.+?)(?:\n|$)' - ] - - for pattern in answer_patterns: - match = re.search(pattern, text, re.IGNORECASE) - if match: - answer = match.group(1).strip().rstrip('.,;') - if answer: - return answer - - # Check last line if short - lines = text.split('\n') - if lines: - last_line = lines[-1].strip() - if last_line and len(last_line) < 50 and not last_line.endswith(':'): - return last_line - - return None - - -def normalize_answer_enhanced(answer: str, category: str = "default") -> str: - """ - Enhanced answer normalization with category awareness. 
- - Args: - answer: Raw answer text - category: Problem category for specific normalization - - Returns: - Normalized answer - """ - if not answer: - return "" - # Basic normalization - answer = answer.lower().strip() - answer = answer.strip('"\'') - answer = ' '.join(answer.split()) - - # Category-specific normalization - if category in ["gsm8k", "mmlu_math"] and re.match(r'^-?\d*\.?\d+$', answer): - # Numeric normalization - try: - # Handle different number formats - answer = answer.replace(',', '') # Remove commas - - # Convert to float for consistent representation - num = float(answer) - - # Handle integers - if num.is_integer(): - return str(int(num)) - else: - # Format to remove trailing zeros - return f"{num:g}" - - except ValueError: - pass + # If no letter found, check for common wrong patterns + # Map true/false/yes/no/numbers to letters (this is a heuristic) + if re.search(r'\b(true|yes|1)\b', response.lower()): + return "A" # Default mapping + elif re.search(r'\b(false|no|0)\b', response.lower()): + return "B" # Default mapping - elif category == "mmlu_math": - # Ensure single letter answers are uppercase - if len(answer) == 1 and answer.isalpha(): - return answer.upper() - - # Extract letter from "option A", "choice B", etc. - match = re.match(r'(?:option|choice|answer)\s*([a-e])', answer, re.IGNORECASE) - if match: - return match.group(1).upper() - elif category == "boolq": - # Boolean normalization - true_values = ['yes', 'true', 'correct', '1', 't', 'y'] - false_values = ['no', 'false', 'incorrect', '0', 'f', 'n'] - - if answer in true_values: - return 'true' - elif answer in false_values: - return 'false' - - # Handle mathematical expressions - if category in ["gsm8k", "mmlu_math"]: - # Try to evaluate simple fractions - fraction_match = re.match(r'^(\d+)/(\d+)$', answer) - if fraction_match: - try: - frac = Fraction(int(fraction_match.group(1)), int(fraction_match.group(2))) - return str(float(frac)) - except: - pass - - # Handle percentages - percent_match = re.match(r'^(\d*\.?\d+)%$', answer) - if percent_match: - try: - return str(float(percent_match.group(1)) / 100) - except: - pass - - return answer - - -def score_response_quality(response: str, category: str) -> float: - """ - Score response quality for weighted voting. - - Returns: - Quality score between 0 and 1 - """ - if not response: - return 0.0 - - score = 1.0 - - # Check for completeness - if len(response.strip()) < 10: - score *= 0.5 + response_lower = response.lower() + if 'yes' in response_lower: + return 'yes' + elif 'no' in response_lower: + return 'no' - # Check for uncertainty markers - uncertainty_words = ['maybe', 'probably', 'might', 'could be', 'not sure', 'guess'] - for word in uncertainty_words: - if word in response.lower(): - score *= 0.7 - - # Category-specific checks - if category == "gsm8k": - # Should have ### marker - if "###" not in response: - score *= 0.8 - elif category in ["mmlu_math", "aqua_rat"]: - # Should have clear choice indication - if not re.search(r'\b[A-E]\b', response): - score *= 0.8 - - # Check if response seems cut off - if response.strip().endswith(('...', 'Therefore,', 'So,', 'Thus,')): - score *= 0.5 - - return score - - -def add_self_consistency_prompt(system_prompt: str, query: str, category: str) -> str: - """ - Add format instructions to encourage consistency. 
- """ - format_instructions = { - "gsm8k": "\n\nIMPORTANT: After showing your work, provide your final numerical answer after ### on a new line.", - "mmlu_math": "\n\nIMPORTANT: After your reasoning, clearly state your choice as a single letter (A, B, C, D, or E).", - "boolq": "\n\nIMPORTANT: After your analysis, clearly state your answer as either 'true' or 'false'.", - "aqua_rat": "\n\nIMPORTANT: After solving the problem, clearly indicate your choice with the letter (A, B, C, D, or E).", - "default": "\n\nIMPORTANT: Clearly state your final answer at the end of your response." - } + elif category == "mmlu_math": + # For MMLU, just return the cleaned response + return response.strip() - instruction = format_instructions.get(category, format_instructions["default"]) - return system_prompt + instruction + # Default: return cleaned response + return response.strip() def run( @@ -311,9 +117,9 @@ def run( request_config: Dict[str, Any] = None ) -> Tuple[str, int]: """ - Enhanced majority voting with category awareness and better extraction. + Simplified majority voting using consistent evaluation logic. """ - logger.info("Starting enhanced majority voting process") + logger.info("Starting majority voting process") # Detect category category = detect_category(initial_query) @@ -321,21 +127,14 @@ def run( # Extract parameters k = request_config.get('k', DEFAULT_K) if request_config else DEFAULT_K - - # Use category-specific temperature - base_temperature = CATEGORY_TEMPERATURES.get(category, DEFAULT_TEMPERATURE) - temperature = request_config.get('temperature', base_temperature) if request_config else base_temperature - + temperature = request_config.get('temperature', DEFAULT_TEMPERATURE) if request_config else DEFAULT_TEMPERATURE max_tokens = request_config.get('max_tokens', 4096) if request_config else 4096 logger.info(f"Generating {k} candidates with temperature={temperature} for category={category}") - # Add self-consistency prompt - enhanced_system_prompt = add_self_consistency_prompt(system_prompt, initial_query, category) - # Prepare messages messages = [ - {"role": "system", "content": enhanced_system_prompt}, + {"role": "system", "content": system_prompt}, {"role": "user", "content": initial_query} ] @@ -376,47 +175,42 @@ def run( if not candidates: return "Error: Could not generate any candidates", 0 - # Extract and normalize answers with quality scores - answer_data = [] # List of (normalized_answer, raw_answer, response, quality_score) + # Extract answers and count votes + answer_votes = Counter() + answer_to_responses = {} for i, candidate in enumerate(candidates): - # Extract answer using category-aware extraction - answer = extract_answer_by_category(candidate, category) - + answer = extract_answer_simple(candidate, category) if answer: - normalized = normalize_answer_enhanced(answer, category) - quality = score_response_quality(candidate, category) - answer_data.append((normalized, answer, candidate, quality)) - logger.debug(f"Candidate {i+1}: {answer} -> {normalized} (quality: {quality:.2f})") + # Normalize answer for voting + if category == "aqua_rat": + answer = answer.upper() # Ensure letters are uppercase + elif category == "boolq": + answer = answer.lower() # Ensure yes/no are lowercase + elif category == "gsm8k": + # Try to normalize numbers + try: + answer = str(float(answer)) + except: + pass + + answer_votes[answer] += 1 + if answer not in answer_to_responses: + answer_to_responses[answer] = [] + answer_to_responses[answer].append(candidate) + logger.debug(f"Candidate 
{i+1}: extracted '{answer}'") else: logger.warning(f"Could not extract answer from candidate {i+1}") - if not answer_data: - # Fallback: return highest quality response - quality_scores = [(score_response_quality(c, category), c) for c in candidates] - quality_scores.sort(reverse=True) - return quality_scores[0][1], total_tokens - - # Count weighted votes - weighted_votes = Counter() - answer_to_response = {} - - for normalized, raw, response, quality in answer_data: - weighted_votes[normalized] += quality - # Keep the highest quality response for each answer - if normalized not in answer_to_response or quality > answer_to_response[normalized][1]: - answer_to_response[normalized] = (response, quality) - - # Get the answer with highest weighted votes - most_common_answer, weighted_score = weighted_votes.most_common(1)[0] - - # Calculate confidence - total_weight = sum(weighted_votes.values()) - confidence = weighted_score / total_weight if total_weight > 0 else 0 - - logger.info(f"Most common answer: '{most_common_answer}' with weighted score {weighted_score:.2f} ({confidence:.1%} confidence)") - - # Return the best response for the winning answer - winning_response = answer_to_response[most_common_answer][0] - - return winning_response, total_tokens \ No newline at end of file + # Select the most voted answer + if answer_votes: + most_common_answer, count = answer_votes.most_common(1)[0] + logger.info(f"Most common answer: '{most_common_answer}' with {count}/{k} votes") + + # Return the first response that gave this answer + winning_responses = answer_to_responses[most_common_answer] + return winning_responses[0], total_tokens + else: + # If no answers could be extracted, return the first candidate + logger.warning("No answers could be extracted, returning first candidate") + return candidates[0], total_tokens \ No newline at end of file diff --git a/scripts/eval_optillmbench.py b/scripts/eval_optillmbench.py index e1073204..eb84a806 100644 --- a/scripts/eval_optillmbench.py +++ b/scripts/eval_optillmbench.py @@ -382,7 +382,7 @@ def evaluate_model( {"role": "user", "content": prompt} ], n=n_param, - temperature=0.7, # High temperature for diversity + temperature=0.6, # Unified temperature for all approaches max_tokens=4096, extra_body=extra_body, ) @@ -404,7 +404,7 @@ def evaluate_model( {"role": "system", "content": "You are a helpful AI assistant focused on providing precise answers in the requested format."}, {"role": "user", "content": prompt} ], - temperature=0.7, + temperature=0.6, max_tokens=4096, extra_body=extra_body, ) @@ -528,7 +528,7 @@ def evaluate_model( {"role": "system", "content": "You are a helpful AI assistant focused on providing precise answers in the requested format."}, {"role": "user", "content": prompt} ], - temperature=0.7, # Higher temperature for pass@k diversity + temperature=0.6, # Unified temperature for all approaches max_tokens=4096, extra_body=extra_body, ) @@ -610,7 +610,7 @@ def evaluate_model( {"role": "system", "content": "You are a helpful AI assistant focused on providing precise answers in the requested format."}, {"role": "user", "content": prompt} ], - temperature=0.2, + temperature=0.6, max_tokens=4096, extra_body=extra_body, ) From c5ca7959db9ab64ed4357a0fcc6cb6c0ab195760 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Wed, 23 Jul 2025 22:09:35 +0800 Subject: [PATCH 09/10] Update majority_voting_plugin.py --- optillm/plugins/majority_voting_plugin.py | 164 ++++++++-------------- 1 file changed, 60 insertions(+), 104 deletions(-) diff --git 
a/optillm/plugins/majority_voting_plugin.py b/optillm/plugins/majority_voting_plugin.py
index 2f05ce32..b7ee484b 100644
--- a/optillm/plugins/majority_voting_plugin.py
+++ b/optillm/plugins/majority_voting_plugin.py
@@ -1,19 +1,14 @@
 """
-Majority Voting Plugin V2 for OptILLM
+Majority Voting Plugin for OptILLM
 
-Enhanced version with:
-- Category-aware answer extraction
-- Adaptive temperature control
-- Improved answer normalization
-- Response quality filtering
-- Smart fallback strategies
+Generic implementation that generates multiple candidates and selects
+the most common response through simple voting.
 """
 
 import re
 import logging
 from typing import Tuple, Dict, Any, List, Optional
 from collections import Counter
-import json
 
 logger = logging.getLogger(__name__)
 
@@ -24,89 +19,58 @@
 DEFAULT_K = 8
 DEFAULT_TEMPERATURE = 0.6  # Unified temperature for consistency
 
-def detect_category(query: str) -> str:
+
+def normalize_response(response: str) -> str:
     """
-    Try to detect the problem category from the query.
-    
-    Returns:
-        Category string or 'default' if unknown
+    Basic normalization for comparing responses.
+    Removes extra whitespace, punctuation at ends, and lowercases.
     """
-    query_lower = query.lower()
+    if not response:
+        return ""
     
-    # GSM8K patterns
-    if "###" in query or ("calculate" in query_lower and any(word in query_lower for word in ["total", "sum", "difference", "product"])):
-        return "gsm8k"
+    # Remove thinking blocks if present
+    response = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)
    
-    # MMLU patterns (multiple choice)
-    if re.search(r'\b[A-E]\s*[:\)]\s*', query) or "which of the following" in query_lower:
-        return "mmlu_math"
+    # Basic normalization
+    response = response.strip()
+    response = response.lower()
    
-    # BoolQ patterns
-    if query_lower.strip().endswith("?") and any(word in query_lower for word in ["is", "are", "was", "were", "does", "do", "did", "can", "could", "will", "would"]):
-        return "boolq"
+    # Remove trailing punctuation
+    response = response.rstrip('.,;:!?')
    
-    # AQUA-RAT patterns
-    if re.search(r'options?:\s*[A-E]', query, re.IGNORECASE):
-        return "aqua_rat"
+    # Normalize whitespace
+    response = ' '.join(response.split())
    
-    return "default"
+    return response
 
-
-
-def extract_answer_simple(response: str, category: str) -> Optional[str]:
+def extract_final_answer(response: str) -> str:
    """
-    Extract answer using same logic as evaluation script for consistency.
+    Try to extract just the final answer from a response.
+    This is generic and looks for common patterns.
""" if not response: - return None + return response - # Remove thinking blocks if present + # Remove thinking blocks response = re.sub(r'.*?', '', response, flags=re.DOTALL).strip() - if category == "gsm8k": - # Extract number after ### - match = re.search(r'###\s*(-?\d*\.?\d+)', response) + # Look for common answer patterns + patterns = [ + r'(?:final answer|answer):\s*(.+?)(?:\n|$)', + r'(?:the answer is|answer is)\s*(.+?)(?:\n|$)', + r'###\s*(.+?)(?:\n|$)', # Common in math problems + r'^([A-E])\b', # Single letter at start + r'\b([A-E])\b\s*$', # Single letter at end + ] + + for pattern in patterns: + match = re.search(pattern, response, re.IGNORECASE | re.MULTILINE) if match: - return match.group(1) + return match.group(1).strip() - elif category == "aqua_rat": - # For AQUA-RAT, be more flexible in extraction - response_upper = response.upper() - - # Try to find letter choices (A-E) - patterns = [ - r'\b([A-E])\b(?!\w)', # Single letter not part of word - r'(?:answer|choice|option)\s*:?\s*([A-E])\b', - r'\(([A-E])\)', # Letter in parentheses - r'^([A-E])$', # Just the letter - ] - - for pattern in patterns: - match = re.search(pattern, response_upper, re.IGNORECASE | re.MULTILINE) - if match: - return match.group(1) - - # If no letter found, check for common wrong patterns - # Map true/false/yes/no/numbers to letters (this is a heuristic) - if re.search(r'\b(true|yes|1)\b', response.lower()): - return "A" # Default mapping - elif re.search(r'\b(false|no|0)\b', response.lower()): - return "B" # Default mapping - - elif category == "boolq": - response_lower = response.lower() - if 'yes' in response_lower: - return 'yes' - elif 'no' in response_lower: - return 'no' - - elif category == "mmlu_math": - # For MMLU, just return the cleaned response - return response.strip() - - # Default: return cleaned response - return response.strip() + # If no pattern found, return the whole response + return response def run( @@ -117,20 +81,16 @@ def run( request_config: Dict[str, Any] = None ) -> Tuple[str, int]: """ - Simplified majority voting using consistent evaluation logic. + Generic majority voting implementation. 
""" logger.info("Starting majority voting process") - # Detect category - category = detect_category(initial_query) - logger.info(f"Detected category: {category}") - # Extract parameters k = request_config.get('k', DEFAULT_K) if request_config else DEFAULT_K temperature = request_config.get('temperature', DEFAULT_TEMPERATURE) if request_config else DEFAULT_TEMPERATURE max_tokens = request_config.get('max_tokens', 4096) if request_config else 4096 - logger.info(f"Generating {k} candidates with temperature={temperature} for category={category}") + logger.info(f"Generating {k} candidates with temperature={temperature}") # Prepare messages messages = [ @@ -175,40 +135,36 @@ def run( if not candidates: return "Error: Could not generate any candidates", 0 - # Extract answers and count votes + # Extract and normalize answers for voting answer_votes = Counter() answer_to_responses = {} for i, candidate in enumerate(candidates): - answer = extract_answer_simple(candidate, category) - if answer: - # Normalize answer for voting - if category == "aqua_rat": - answer = answer.upper() # Ensure letters are uppercase - elif category == "boolq": - answer = answer.lower() # Ensure yes/no are lowercase - elif category == "gsm8k": - # Try to normalize numbers - try: - answer = str(float(answer)) - except: - pass + # Try to extract just the answer part + answer = extract_final_answer(candidate) + + # Normalize for comparison + normalized = normalize_response(answer) + + if normalized: + answer_votes[normalized] += 1 + + # Keep track of original responses for each normalized answer + if normalized not in answer_to_responses: + answer_to_responses[normalized] = [] + answer_to_responses[normalized].append(candidate) - answer_votes[answer] += 1 - if answer not in answer_to_responses: - answer_to_responses[answer] = [] - answer_to_responses[answer].append(candidate) - logger.debug(f"Candidate {i+1}: extracted '{answer}'") + logger.debug(f"Candidate {i+1}: '{answer}' -> '{normalized}'") else: - logger.warning(f"Could not extract answer from candidate {i+1}") + logger.warning(f"Could not extract/normalize answer from candidate {i+1}") # Select the most voted answer if answer_votes: - most_common_answer, count = answer_votes.most_common(1)[0] - logger.info(f"Most common answer: '{most_common_answer}' with {count}/{k} votes") + most_common_normalized, count = answer_votes.most_common(1)[0] + logger.info(f"Most common answer: '{most_common_normalized}' with {count}/{k} votes") - # Return the first response that gave this answer - winning_responses = answer_to_responses[most_common_answer] + # Return the first original response that mapped to this answer + winning_responses = answer_to_responses[most_common_normalized] return winning_responses[0], total_tokens else: # If no answers could be extracted, return the first candidate From 7980b4671f4917504966f965a3b41b96979efb04 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Thu, 24 Jul 2025 10:33:13 +0800 Subject: [PATCH 10/10] Update test.yml --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1e967441..62fc70a8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.10', '3.11', '3.12'] + python-version: ['3.12'] steps: - uses: actions/checkout@v4 @@ -59,7 +59,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.11' + 
python-version: '3.12' - name: Install dependencies run: |