diff --git a/.gitignore b/.gitignore
index 991806bf..70e8202d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -169,3 +169,4 @@
 cython_debug/
 .vscode/
 scripts/results/
+results/
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 00000000..63bb9902
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,133 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+OptILLM is an OpenAI API-compatible optimizing inference proxy that implements state-of-the-art techniques to improve the accuracy and performance of LLMs. It focuses on reasoning improvements for coding, logical, and mathematical queries through inference-time compute optimization.
+
+## Core Architecture
+
+### Main Components
+
+1. **Entry Points**:
+   - `optillm.py` - Main Flask server with inference routing
+   - `optillm/inference.py` - Local inference engine with transformer models
+   - Packaging via `pyproject.toml` with console script `optillm = "optillm:main"`
+
+2. **Optimization Techniques** (`optillm/`):
+   - **Reasoning**: `cot_reflection.py`, `plansearch.py`, `leap.py`, `reread.py`
+   - **Sampling**: `bon.py` (Best of N), `moa.py` (Mixture of Agents), `self_consistency.py`
+   - **Search**: `mcts.py` (Monte Carlo Tree Search), `rstar.py` (R* algorithm)
+   - **Verification**: `pvg.py` (Prover-Verifier Game), `z3_solver.py`
+   - **Advanced**: `cepo/` (Cerebras Planning & Optimization), `rto.py` (Round Trip Optimization)
+
+3. **Decoding Techniques**:
+   - `cot_decoding.py` - Chain-of-thought decoding without explicit prompting
+   - `entropy_decoding.py` - Adaptive sampling based on token uncertainty
+   - `thinkdeeper.py` - Reasoning effort scaling
+   - `autothink/` - Query complexity classification with steering vectors
+
+4. **Plugin System** (`optillm/plugins/`):
+   - `spl/` - System Prompt Learning (a third paradigm of LLM learning)
+   - `deepthink/` - Gemini-like deep thinking with inference-time scaling
+   - `longcepo/` - Long-context processing with divide-and-conquer
+   - `mcp_plugin.py` - Model Context Protocol client
+   - `memory_plugin.py` - Short-term memory for unbounded context
+   - `privacy_plugin.py` - PII anonymization/deanonymization
+   - `executecode_plugin.py` - Code interpreter integration
+   - `json_plugin.py` - Structured outputs with the outlines library
+
+## Development Commands
+
+### Installation & Setup
+```bash
+# Development setup
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+
+# Package installation
+pip install optillm
+```
+
+### Running the Server
+```bash
+# Basic server (auto approach detection)
+python optillm.py
+
+# With specific approach
+python optillm.py --approach moa --model gpt-4o-mini
+
+# With external endpoint
+python optillm.py --base_url http://localhost:8080/v1
+
+# Docker
+docker compose up -d
+```
+
+### Testing
+```bash
+# Run all approach tests
+python test.py
+
+# Test specific approaches
+python test.py --approaches moa bon mcts
+
+# Test with specific model/endpoint
+python test.py --model gpt-4o-mini --base-url http://localhost:8080/v1
+
+# Single test case
+python test.py --single-test "specific_test_name"
+```
+
+### Evaluation Scripts
+```bash
+# Math benchmark evaluation
+python scripts/eval_math500_benchmark.py
+
+# AIME benchmark
+python scripts/eval_aime_benchmark.py
+
+# Arena Hard Auto evaluation
+python scripts/eval_arena_hard_auto_rtc.py
+
+# FRAMES benchmark
+python scripts/eval_frames_benchmark.py
+
+# OptILLM benchmark generation/evaluation
+python scripts/gen_optillmbench.py
+python scripts/eval_optillmbench.py
+```
+
+## Usage Patterns
+
+### Approach Selection (Priority Order)
+1. **Model prefix**: `moa-gpt-4o-mini` (approach slug + model name)
+2. **extra_body field**: `{"optillm_approach": "bon|moa|mcts"}` (see the sketch below)
+3. **Prompt tags**: e.g. `<optillm_approach>re2</optillm_approach>` in the system or user prompt
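+
+A minimal client-side sketch of the first two selection methods, assuming the proxy is running locally on its default port 8000; the model name, question, and API key handling are illustrative:
+
+```python
+from openai import OpenAI
+
+# Point the standard OpenAI client at the OptILLM proxy
+client = OpenAI(api_key="sk-...", base_url="http://localhost:8000/v1")
+
+# 1. Approach via model prefix: approach slug + "-" + model name
+response = client.chat.completions.create(
+    model="moa-gpt-4o-mini",
+    messages=[{"role": "user", "content": "How many r's are in strawberry?"}],
+)
+
+# 2. Approach via extra_body, leaving the model name untouched
+response = client.chat.completions.create(
+    model="gpt-4o-mini",
+    messages=[{"role": "user", "content": "How many r's are in strawberry?"}],
+    extra_body={"optillm_approach": "moa"},
+)
+print(response.choices[0].message.content)
+```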
+
+### Approach Combinations
+- **Pipeline** (`&`): `cot_reflection&moa` - sequential processing
+- **Parallel** (`|`): `bon|moa|mcts` - multiple responses returned as a list
+
+### Local Inference
+- Set `OPTILLM_API_KEY=optillm` to enable built-in transformer inference
+- Supports HuggingFace models with LoRA adapters: `model+lora1+lora2`
+- Advanced decoding: `{"decoding": "cot_decoding", "k": 10}` (see the sketch below)
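+
+A sketch of advanced decoding against the built-in inference engine; the HuggingFace model id is an example only, and the `decoding`/`k` fields follow the pattern above:
+
+```python
+from openai import OpenAI
+
+# With OPTILLM_API_KEY=optillm set server-side, requests are served by the
+# local transformers engine instead of being forwarded to a provider.
+client = OpenAI(api_key="optillm", base_url="http://localhost:8000/v1")
+
+response = client.chat.completions.create(
+    model="meta-llama/Llama-3.2-1B-Instruct",  # any supported HF model id
+    messages=[{"role": "user", "content": "What is 17 * 24?"}],
+    extra_body={"decoding": "cot_decoding", "k": 10},
+)
+print(response.choices[0].message.content)
+```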
+
+### Plugin Configuration
+- MCP: `~/.optillm/mcp_config.json` for Model Context Protocol servers
+- SPL: Built-in system prompt learning for solving strategies
+- Memory: Automatic unbounded context via chunking and retrieval
+
+## Key Concepts
+
+### Inference Optimization
+The proxy intercepts OpenAI API calls and applies optimization techniques before forwarding them to LLM providers (OpenAI, Cerebras, Azure, LiteLLM). Each technique implements a specific reasoning or sampling improvement.
+
+### Plugin Architecture
+Plugins extend functionality via standardized interfaces. They can modify requests, process responses, add tools, or provide entirely new capabilities such as code execution or structured outputs.
+
+### Multi-Provider Support
+The proxy automatically detects and routes to the appropriate LLM provider based on environment variables (`OPENAI_API_KEY`, `CEREBRAS_API_KEY`, etc.), with fallback to LiteLLM for broader model support.
\ No newline at end of file
diff --git a/MANIFEST.in b/MANIFEST.in
index 6cae0ef9..e50b514b 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1 +1,3 @@
 include optillm/plugins/*.py
+include optillm/cepo/*.py
+include optillm/cepo/configs/*.yaml
diff --git a/optillm/__init__.py b/optillm/__init__.py
index 24870a61..b94bfb95 100644
--- a/optillm/__init__.py
+++ b/optillm/__init__.py
@@ -2,7 +2,7 @@
 import os
 
 # Version information
-__version__ = "0.1.20"
+__version__ = "0.1.21"
 
 # Get the path to the root optillm.py
 spec = util.spec_from_file_location(
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..c4055c22
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,71 @@
+[build-system]
+requires = ["setuptools>=64", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "optillm"
+version = "0.1.21"
+description = "An optimizing inference proxy for LLMs."
+readme = "README.md"
+license = "Apache-2.0"
+authors = [
+    {name = "codelion", email = "codelion@okyasoft.com"}
+]
+requires-python = ">=3.10"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "Operating System :: OS Independent",
+]
+dependencies = [
+    "numpy",
+    "networkx",
+    "openai",
+    "z3-solver",
+    "aiohttp",
+    "flask",
+    "torch",
+    "transformers",
+    "azure-identity",
+    "tiktoken",
+    "scikit-learn",
+    "litellm",
+    "requests",
+    "beautifulsoup4",
+    "lxml",
+    "presidio_analyzer",
+    "presidio_anonymizer",
+    "nbconvert",
+    "nbformat",
+    "ipython",
+    "ipykernel",
+    "peft",
+    "bitsandbytes",
+    "gradio<5.16.0",
+    # Constrain spacy version to avoid blis build issues on ARM64
+    "spacy<3.8.0",
+    "cerebras_cloud_sdk",
+    "outlines[transformers]",
+    "sentencepiece",
+    "mcp",
+    "adaptive-classifier",
+    # MLX support for Apple Silicon optimization
+    'mlx-lm>=0.24.0; platform_machine=="arm64" and sys_platform=="darwin"',
+]
+
+[project.urls]
+Homepage = "https://github.com/codelion/optillm"
+Repository = "https://github.com/codelion/optillm"
+Issues = "https://github.com/codelion/optillm/issues"
+
+[project.scripts]
+optillm = "optillm:main"
+
+[tool.setuptools.packages.find]
+include = ["optillm*"]
+
+[tool.setuptools.package-data]
+optillm = [
+    "plugins/*.py",
+    "cepo/*.py",
+    "cepo/configs/*.yaml",
+]
\ No newline at end of file
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 73a48e32..00000000
--- a/setup.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import os
-from setuptools import setup, find_packages
-
-setup(
-    name="optillm",
-    version="0.1.20",
-    packages=find_packages(include=['optillm', 'optillm.*']),  # This ensures all subpackages are included
-    py_modules=['optillm'],
-    package_data={
-        'optillm': [
-            'plugins/*.py',  # Include plugin files
-            'cepo/*.py',  # Include cepo module Python files
-            'cepo/configs/*.yaml',  # Include yaml files
-        ],
-    },
-    include_package_data=True,  # This is important
-    install_requires=[
-        "numpy",
-        "networkx",
-        "openai",
-        "z3-solver",
-        "aiohttp",
-        "flask",
-        "torch",
-        "transformers",
-        "azure-identity",
-        "tiktoken",
-        "scikit-learn",
-        "litellm",
-        "requests",
-        "beautifulsoup4",
-        "lxml",
-        "presidio_analyzer",
-        "presidio_anonymizer",
-        "nbconvert",
-        "nbformat",
-        "ipython",
-        "ipykernel",
-        "peft",
-        "bitsandbytes",
-        "gradio<5.16.0",
-        # Constrain spacy version to avoid blis build issues on ARM64
-        "spacy<3.8.0",
-        "cerebras_cloud_sdk",
-        "outlines[transformers]",
-        "sentencepiece",
-        "mcp",
-        "adaptive-classifier",
-        # MLX support for Apple Silicon optimization
-        'mlx-lm>=0.24.0; platform_machine=="arm64" and sys_platform=="darwin"',
-    ],
-    entry_points={
-        'console_scripts': [
-            'optillm=optillm:main',  # Points directly to the main function in optillm.py
-        ],
-    },
-    author="codelion",
-    author_email="codelion@okyasoft.com",
-    description="An optimizing inference proxy for LLMs.",
-    long_description=open("README.md").read(),
-    long_description_content_type="text/markdown",
-    url="https://github.com/codelion/optillm",
-    classifiers=[
-        "Programming Language :: Python :: 3",
-        "License :: OSI Approved :: Apache Software License",
-        "Operating System :: OS Independent",
-    ],
-    python_requires=">=3.10",
-)
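
A quick post-install sanity check for the `setup.py` → `pyproject.toml` migration (a suggested verification, not part of the diff; assumes `pip install .` was run from this tree):

```python
from importlib.metadata import version
from importlib.resources import files

# Version should match [project] in pyproject.toml
assert version("optillm") == "0.1.21"

# The cepo YAML configs declared under [tool.setuptools.package-data]
# should ship with the installed package
configs = files("optillm") / "cepo" / "configs"
print([p.name for p in configs.iterdir() if p.name.endswith(".yaml")])
```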