diff --git a/.gitignore b/.gitignore
index 991806bf..70e8202d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -169,3 +169,4 @@
 cython_debug/
 .vscode/
 scripts/results/
+results/
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 00000000..63bb9902
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,133 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+OptILLM is an OpenAI API-compatible optimizing inference proxy that implements state-of-the-art techniques to improve the accuracy and performance of LLMs. It focuses on reasoning improvements for coding, logical, and mathematical queries through inference-time compute optimization.
+
+## Core Architecture
+
+### Main Components
+
+1. **Entry Points**:
+   - `optillm.py` - Main Flask server with inference routing
+   - `optillm/inference.py` - Local inference engine with transformer models
+   - Packaging via `pyproject.toml` with console script `optillm = "optillm:main"`
+
+2. **Optimization Techniques** (`optillm/`):
+   - **Reasoning**: `cot_reflection.py`, `plansearch.py`, `leap.py`, `reread.py`
+   - **Sampling**: `bon.py` (Best of N), `moa.py` (Mixture of Agents), `self_consistency.py`
+   - **Search**: `mcts.py` (Monte Carlo Tree Search), `rstar.py` (R* algorithm)
+   - **Verification**: `pvg.py` (Prover-Verifier Game), `z3_solver.py`
+   - **Advanced**: `cepo/` (Cerebras Planning & Optimization), `rto.py` (Round Trip Optimization)
+
+3. **Decoding Techniques**:
+   - `cot_decoding.py` - Chain-of-thought decoding without explicit prompting
+   - `entropy_decoding.py` - Adaptive sampling based on token uncertainty
+   - `thinkdeeper.py` - Reasoning effort scaling
+   - `autothink/` - Query complexity classification with steering vectors
+
+4. **Plugin System** (`optillm/plugins/`):
+   - `spl/` - System Prompt Learning (a third paradigm of LLM learning)
+   - `deepthink/` - Gemini-like deep thinking with inference-time scaling
+   - `longcepo/` - Long-context processing with divide-and-conquer
+   - `mcp_plugin.py` - Model Context Protocol client
+   - `memory_plugin.py` - Short-term memory for unbounded context
+   - `privacy_plugin.py` - PII anonymization/deanonymization
+   - `executecode_plugin.py` - Code interpreter integration
+   - `json_plugin.py` - Structured outputs with the outlines library
+
+## Development Commands
+
+### Installation & Setup
+```bash
+# Development setup
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+
+# Package installation
+pip install optillm
+```
+
+### Running the Server
+```bash
+# Basic server (auto approach detection)
+python optillm.py
+
+# With specific approach
+python optillm.py --approach moa --model gpt-4o-mini
+
+# With external endpoint
+python optillm.py --base_url http://localhost:8080/v1
+
+# Docker
+docker compose up -d
+```
+
+### Testing
+```bash
+# Run all approach tests
+python test.py
+
+# Test specific approaches
+python test.py --approaches moa bon mcts
+
+# Test with specific model/endpoint
+python test.py --model gpt-4o-mini --base-url http://localhost:8080/v1
+
+# Single test case
+python test.py --single-test "specific_test_name"
+```
+
+### Evaluation Scripts
+```bash
+# Math benchmark evaluation
+python scripts/eval_math500_benchmark.py
+
+# AIME benchmark
+python scripts/eval_aime_benchmark.py
+
+# Arena Hard Auto evaluation
+python scripts/eval_arena_hard_auto_rtc.py
+
+# FRAMES benchmark
+python scripts/eval_frames_benchmark.py
+
+# OptILLM benchmark generation/evaluation
+python scripts/gen_optillmbench.py
+python scripts/eval_optillmbench.py
+```
+
+## Usage Patterns
+
+### Approach Selection (Priority Order)
+1. **Model prefix**: `moa-gpt-4o-mini` (approach slug + model name)
+2. **extra_body field**: `{"optillm_approach": "bon|moa|mcts"}` (see the sketch below)
+3. **Prompt tags**: e.g. `<optillm_approach>re2</optillm_approach>` in the system or user prompt
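+
+A minimal client-side sketch of the first two selection methods, assuming the proxy is running locally on its default port 8000; the model name, question, and API key handling are illustrative:
+
+```python
+from openai import OpenAI
+
+# Point the standard OpenAI client at the OptILLM proxy
+client = OpenAI(api_key="sk-...", base_url="http://localhost:8000/v1")
+
+# 1. Approach via model prefix: approach slug + "-" + model name
+response = client.chat.completions.create(
+    model="moa-gpt-4o-mini",
+    messages=[{"role": "user", "content": "How many r's are in strawberry?"}],
+)
+
+# 2. Approach via extra_body, leaving the model name untouched
+response = client.chat.completions.create(
+    model="gpt-4o-mini",
+    messages=[{"role": "user", "content": "How many r's are in strawberry?"}],
+    extra_body={"optillm_approach": "moa"},
+)
+print(response.choices[0].message.content)
+```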
+
+### Approach Combinations
+- **Pipeline** (`&`): `cot_reflection&moa` - sequential processing
+- **Parallel** (`|`): `bon|moa|mcts` - multiple responses returned as a list
+
+### Local Inference
+- Set `OPTILLM_API_KEY=optillm` to enable built-in transformer inference
+- Supports HuggingFace models with LoRA adapters: `model+lora1+lora2`
+- Advanced decoding: `{"decoding": "cot_decoding", "k": 10}` (see the sketch below)
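+
+A sketch of advanced decoding against the built-in inference engine; the HuggingFace model id is an example only, and the `decoding`/`k` fields follow the pattern above:
+
+```python
+from openai import OpenAI
+
+# With OPTILLM_API_KEY=optillm set server-side, requests are served by the
+# local transformers engine instead of being forwarded to a provider.
+client = OpenAI(api_key="optillm", base_url="http://localhost:8000/v1")
+
+response = client.chat.completions.create(
+    model="meta-llama/Llama-3.2-1B-Instruct",  # any supported HF model id
+    messages=[{"role": "user", "content": "What is 17 * 24?"}],
+    extra_body={"decoding": "cot_decoding", "k": 10},
+)
+print(response.choices[0].message.content)
+```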
+
+### Plugin Configuration
+- MCP: `~/.optillm/mcp_config.json` for Model Context Protocol servers
+- SPL: Built-in system prompt learning for solving strategies
+- Memory: Automatic unbounded context via chunking and retrieval
+
+## Key Concepts
+
+### Inference Optimization
+The proxy intercepts OpenAI API calls and applies optimization techniques before forwarding them to LLM providers (OpenAI, Cerebras, Azure, LiteLLM). Each technique implements a specific reasoning or sampling improvement.
+
+### Plugin Architecture
+Plugins extend functionality via standardized interfaces. They can modify requests, process responses, add tools, or provide entirely new capabilities such as code execution or structured outputs.
+
+### Multi-Provider Support
+The proxy automatically detects and routes to the appropriate LLM provider based on environment variables (`OPENAI_API_KEY`, `CEREBRAS_API_KEY`, etc.), with fallback to LiteLLM for broader model support.
\ No newline at end of file
diff --git a/MANIFEST.in b/MANIFEST.in
index 6cae0ef9..e50b514b 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1 +1,3 @@
 include optillm/plugins/*.py
+include optillm/cepo/*.py
+include optillm/cepo/configs/*.yaml
diff --git a/optillm/__init__.py b/optillm/__init__.py
index 24870a61..b94bfb95 100644
--- a/optillm/__init__.py
+++ b/optillm/__init__.py
@@ -2,7 +2,7 @@
 import os
 
 # Version information
-__version__ = "0.1.20"
+__version__ = "0.1.21"
 
 # Get the path to the root optillm.py
 spec = util.spec_from_file_location(
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..c4055c22
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,71 @@
+[build-system]
+requires = ["setuptools>=64", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "optillm"
+version = "0.1.21"
+description = "An optimizing inference proxy for LLMs."
+readme = "README.md"
+license = "Apache-2.0"
+authors = [
+    {name = "codelion", email = "codelion@okyasoft.com"}
+]
+requires-python = ">=3.10"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "Operating System :: OS Independent",
+]
+dependencies = [
+    "numpy",
+    "networkx",
+    "openai",
+    "z3-solver",
+    "aiohttp",
+    "flask",
+    "torch",
+    "transformers",
+    "azure-identity",
+    "tiktoken",
+    "scikit-learn",
+    "litellm",
+    "requests",
+    "beautifulsoup4",
+    "lxml",
+    "presidio_analyzer",
+    "presidio_anonymizer",
+    "nbconvert",
+    "nbformat",
+    "ipython",
+    "ipykernel",
+    "peft",
+    "bitsandbytes",
+    "gradio<5.16.0",
+    # Constrain spacy version to avoid blis build issues on ARM64
+    "spacy<3.8.0",
+    "cerebras_cloud_sdk",
+    "outlines[transformers]",
+    "sentencepiece",
+    "mcp",
+    "adaptive-classifier",
+    # MLX support for Apple Silicon optimization
+    'mlx-lm>=0.24.0; platform_machine=="arm64" and sys_platform=="darwin"',
+]
+
+[project.urls]
+Homepage = "https://github.com/codelion/optillm"
+Repository = "https://github.com/codelion/optillm"
+Issues = "https://github.com/codelion/optillm/issues"
+
+[project.scripts]
+optillm = "optillm:main"
+
+[tool.setuptools.packages.find]
+include = ["optillm*"]
+
+[tool.setuptools.package-data]
+optillm = [
+    "plugins/*.py",
+    "cepo/*.py",
+    "cepo/configs/*.yaml",
+]
\ No newline at end of file
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 73a48e32..00000000
--- a/setup.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import os
-from setuptools import setup, find_packages
-
-setup(
-    name="optillm",
-    version="0.1.20",
-    packages=find_packages(include=['optillm', 'optillm.*']),  # This ensures all subpackages are included
-    py_modules=['optillm'],
-    package_data={
-        'optillm': [
-            'plugins/*.py',  # Include plugin files
-            'cepo/*.py',  # Include cepo module Python files
-            'cepo/configs/*.yaml',  # Include yaml files
-        ],
-    },
-    include_package_data=True,  # This is important
-    install_requires=[
-        "numpy",
-        "networkx",
-        "openai",
-        "z3-solver",
-        "aiohttp",
-        "flask",
-        "torch",
-        "transformers",
-        "azure-identity",
-        "tiktoken",
-        "scikit-learn",
-        "litellm",
-        "requests",
-        "beautifulsoup4",
-        "lxml",
-        "presidio_analyzer",
-        "presidio_anonymizer",
-        "nbconvert",
-        "nbformat",
-        "ipython",
-        "ipykernel",
-        "peft",
-        "bitsandbytes",
-        "gradio<5.16.0",
-        # Constrain spacy version to avoid blis build issues on ARM64
-        "spacy<3.8.0",
-        "cerebras_cloud_sdk",
-        "outlines[transformers]",
-        "sentencepiece",
-        "mcp",
-        "adaptive-classifier",
-        # MLX support for Apple Silicon optimization
-        'mlx-lm>=0.24.0; platform_machine=="arm64" and sys_platform=="darwin"',
-    ],
-    entry_points={
-        'console_scripts': [
-            'optillm=optillm:main',  # Points directly to the main function in optillm.py
-        ],
-    },
-    author="codelion",
-    author_email="codelion@okyasoft.com",
-    description="An optimizing inference proxy for LLMs.",
-    long_description=open("README.md").read(),
-    long_description_content_type="text/markdown",
-    url="https://github.com/codelion/optillm",
-    classifiers=[
-        "Programming Language :: Python :: 3",
-        "License :: OSI Approved :: Apache Software License",
-        "Operating System :: OS Independent",
-    ],
-    python_requires=">=3.10",
-)
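
A quick post-install sanity check for the `setup.py` → `pyproject.toml` migration (a suggested verification, not part of the diff; assumes `pip install .` was run from this tree):

```python
from importlib.metadata import version
from importlib.resources import files

# Version should match [project] in pyproject.toml
assert version("optillm") == "0.1.21"

# The cepo YAML configs declared under [tool.setuptools.package-data]
# should ship with the installed package
configs = files("optillm") / "cepo" / "configs"
print([p.name for p in configs.iterdir() if p.name.endswith(".yaml")])
```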