66 changes: 12 additions & 54 deletions .github/workflows/publish.yml
@@ -79,48 +79,26 @@ jobs:
type=semver,pattern={{major}}.{{minor}}
type=raw,value=latest

# Build and push proxy AMD64
- name: Build and push proxy_only Docker image AMD64
# Build and push proxy_only multi-arch
- name: Build and push proxy_only Docker image (multi-arch)
uses: docker/build-push-action@v5
with:
context: .
file: Dockerfile.proxy_only
push: true
platforms: linux/amd64
platforms: linux/amd64,linux/arm64
tags: ${{ steps.meta-proxy.outputs.tags }}
labels: ${{ steps.meta-proxy.outputs.labels }}
cache-from: type=gha,scope=proxy-amd64
cache-to: type=gha,scope=proxy-amd64,mode=max
cache-from: type=gha
cache-to: type=gha,mode=max
outputs: type=registry,compression=zstd,compression-level=5

# Cleanup after AMD64 build
- name: Cleanup after AMD64 build
# Cleanup after proxy build
- name: Cleanup after proxy build
run: |
docker system prune -af
docker builder prune -af
df -h

# Build proxy ARM64
- name: Build and push proxy_only Docker image ARM64
uses: docker/build-push-action@v5
with:
context: .
file: Dockerfile.proxy_only
push: true
platforms: linux/arm64
tags: ${{ steps.meta-proxy.outputs.tags }}
labels: ${{ steps.meta-proxy.outputs.labels }}
cache-from: type=gha,scope=proxy-arm64
cache-to: type=gha,scope=proxy-arm64,mode=max
outputs: type=registry,compression=zstd,compression-level=5

# Cleanup after proxy builds
- name: Cleanup after proxy builds
run: |
docker system prune -af
docker builder prune -af
find /tmp -type f -user $(id -u) -exec rm -f {} + 2>/dev/null || true
df -h

# Extract metadata for full image
- name: Extract metadata for Docker
@@ -133,35 +111,15 @@ jobs:
type=semver,pattern={{major}}.{{minor}}
latest

# Build full image AMD64
- name: Build and push Docker image AMD64
uses: docker/build-push-action@v5
with:
context: .
push: true
platforms: linux/amd64
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha,scope=full-amd64
cache-to: type=gha,scope=full-amd64,mode=max
outputs: type=registry,compression=zstd,compression-level=5

# Cleanup between architectures
- name: Cleanup between architectures
run: |
docker system prune -af
docker builder prune -af
df -h

# Build full image ARM64
- name: Build and push Docker image ARM64
# Build full image multi-arch
- name: Build and push Docker image (multi-arch)
uses: docker/build-push-action@v5
with:
context: .
push: true
platforms: linux/arm64
platforms: linux/amd64,linux/arm64
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha,scope=full-arm64
cache-to: type=gha,scope=full-arm64,mode=max
cache-from: type=gha
cache-to: type=gha,mode=max
outputs: type=registry,compression=zstd,compression-level=5
57 changes: 57 additions & 0 deletions optillm.py
@@ -93,6 +93,52 @@ def get_config():
default_client = LiteLLMWrapper()
return default_client, API_KEY

def count_reasoning_tokens(text: str, tokenizer=None) -> int:
"""
Count tokens within <think>...</think> tags in the given text.

Args:
text: The text to analyze
tokenizer: Optional tokenizer instance for precise counting

Returns:
Number of reasoning tokens (0 if no think tags found)
"""
if not text or not isinstance(text, str):
return 0

# Extract all content within <think>...</think> tags
# Handle both complete and truncated think blocks

# First, find all complete <think>...</think> blocks
complete_pattern = r'<think>(.*?)</think>'
complete_matches = re.findall(complete_pattern, text, re.DOTALL)

# Then check for unclosed <think> tag (truncated response)
# This finds <think> that doesn't have a matching </think> after it
truncated_pattern = r'<think>(?!.*</think>)(.*)$'
truncated_match = re.search(truncated_pattern, text, re.DOTALL)

# Combine all thinking content
thinking_content = ''.join(complete_matches)
if truncated_match:
thinking_content += truncated_match.group(1)

if not thinking_content:
return 0

if tokenizer and hasattr(tokenizer, 'encode'):
# Use tokenizer for precise counting
try:
tokens = tokenizer.encode(thinking_content)
return len(tokens)
except Exception as e:
logger.warning(f"Failed to count tokens with tokenizer: {e}")

# Fallback: rough estimation (4 chars per token on average, minimum 1 token for non-empty content)
content_length = len(thinking_content.strip())
return max(1, content_length // 4) if content_length > 0 else 0

# Server configuration
server_config = {
'approach': 'none',
@@ -678,11 +724,22 @@ def proxy():
if stream:
return Response(generate_streaming_response(response, model), content_type='text/event-stream')
else:
# Calculate reasoning tokens from the response
reasoning_tokens = 0
if isinstance(response, str):
reasoning_tokens = count_reasoning_tokens(response)
elif isinstance(response, list) and response:
# For multiple responses, sum up reasoning tokens from all
reasoning_tokens = sum(count_reasoning_tokens(resp) for resp in response if isinstance(resp, str))

response_data = {
'model': model,
'choices': [],
'usage': {
'completion_tokens': completion_tokens,
'completion_tokens_details': {
'reasoning_tokens': reasoning_tokens
}
}
}

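The count_reasoning_tokens helper added above extracts everything inside <think>...</think> blocks (including a trailing block that was never closed), then counts tokens with the supplied tokenizer or falls back to a rough 4-characters-per-token estimate. A minimal usage sketch, assuming the optillm package with this change is installed; the sample strings and the commented-out tokenizer line are illustrative only:

# Illustrative sketch; assumes the optillm package from this PR is importable.
from optillm import count_reasoning_tokens

# A complete <think> block: only the inner content is counted.
full = "<think>Restate the problem, then try the small cases first.</think>The answer is 42."
print(count_reasoning_tokens(full))   # rough 4-chars-per-token estimate without a tokenizer

# An unclosed <think> block (truncated generation) is still counted.
cut = "<think>Partial chain of thought that was cut o"
print(count_reasoning_tokens(cut))    # > 0

# No think tags at all -> 0.
print(count_reasoning_tokens("Plain answer, no reasoning tags."))   # 0

# Passing a tokenizer with an encode() method (e.g. a HuggingFace tokenizer)
# switches to exact token counts:
# print(count_reasoning_tokens(full, tokenizer=my_tokenizer))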
4 changes: 3 additions & 1 deletion optillm/__init__.py
@@ -2,7 +2,7 @@
import os

# Version information
__version__ = "0.1.22"
__version__ = "0.1.26"

# Get the path to the root optillm.py
spec = util.spec_from_file_location(
@@ -27,6 +27,7 @@
extract_optillm_approach = module.extract_optillm_approach
get_config = module.get_config
load_plugins = module.load_plugins
count_reasoning_tokens = module.count_reasoning_tokens

# Export execution functions
execute_single_approach = module.execute_single_approach
@@ -48,6 +49,7 @@
'extract_optillm_approach',
'get_config',
'load_plugins',
'count_reasoning_tokens',
'execute_single_approach',
'execute_combined_approaches',
'execute_parallel_approaches',
67 changes: 62 additions & 5 deletions optillm/inference.py
@@ -18,6 +18,7 @@
import traceback
import platform
import sys
import re

from optillm.cot_decoding import cot_decode
from optillm.entropy_decoding import entropy_decode
@@ -29,6 +30,52 @@
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def count_reasoning_tokens(text: str, tokenizer=None) -> int:
"""
Count tokens within <think>...</think> tags in the given text.

Args:
text: The text to analyze
tokenizer: Optional tokenizer instance for precise counting

Returns:
Number of reasoning tokens (0 if no think tags found)
"""
if not text or not isinstance(text, str):
return 0

# Extract all content within <think>...</think> tags
# Handle both complete and truncated think blocks

# First, find all complete <think>...</think> blocks
complete_pattern = r'<think>(.*?)</think>'
complete_matches = re.findall(complete_pattern, text, re.DOTALL)

# Then check for unclosed <think> tag (truncated response)
# This finds <think> that doesn't have a matching </think> after it
truncated_pattern = r'<think>(?!.*</think>)(.*)$'
truncated_match = re.search(truncated_pattern, text, re.DOTALL)

# Combine all thinking content
thinking_content = ''.join(complete_matches)
if truncated_match:
thinking_content += truncated_match.group(1)

if not thinking_content:
return 0

if tokenizer and hasattr(tokenizer, 'encode'):
# Use tokenizer for precise counting
try:
tokens = tokenizer.encode(thinking_content)
return len(tokens)
except Exception as e:
logger.warning(f"Failed to count tokens with tokenizer: {e}")

# Fallback: rough estimation (4 chars per token on average, minimum 1 token for non-empty content)
content_length = len(thinking_content.strip())
return max(1, content_length // 4) if content_length > 0 else 0

# MLX Support for Apple Silicon
try:
import mlx.core as mx
@@ -1502,10 +1549,11 @@ def __init__(
self.message.logprobs = logprobs

class ChatCompletionUsage:
def __init__(self, prompt_tokens: int, completion_tokens: int, total_tokens: int):
def __init__(self, prompt_tokens: int, completion_tokens: int, total_tokens: int, reasoning_tokens: int = 0):
self.prompt_tokens = prompt_tokens
self.completion_tokens = completion_tokens
self.total_tokens = total_tokens
self.reasoning_tokens = reasoning_tokens

class ChatCompletion:
def __init__(self, response_dict: Dict):
@@ -1547,7 +1595,10 @@ def model_dump(self) -> Dict:
"usage": {
"prompt_tokens": self.usage.prompt_tokens,
"completion_tokens": self.usage.completion_tokens,
"total_tokens": self.usage.total_tokens
"total_tokens": self.usage.total_tokens,
"completion_tokens_details": {
"reasoning_tokens": getattr(self.usage, 'reasoning_tokens', 0)
}
}
}

@@ -1766,15 +1817,15 @@ def create(

logger.debug(f"ThinkDeeper tokens: user={user_max_tokens}, thinking={max_thinking_tokens}, adjusted={adjusted_max_tokens}")

result = thinkdeeper_decode_mlx(
result, reasoning_tokens = thinkdeeper_decode_mlx(
pipeline.model,
pipeline.tokenizer,
messages,
thinkdeeper_config_with_tokens
)
else:
logger.info("Using PyTorch ThinkDeeper implementation")
result = thinkdeeper_decode(
result, reasoning_tokens = thinkdeeper_decode(
pipeline.current_model,
pipeline.tokenizer,
messages,
@@ -1850,6 +1901,11 @@ def create(
prompt_tokens = len(pipeline.tokenizer.encode(prompt))
completion_tokens = sum(token_counts)

# Calculate reasoning tokens from all responses
total_reasoning_tokens = 0
for response in responses:
total_reasoning_tokens += count_reasoning_tokens(response, pipeline.tokenizer)

# Create OpenAI-compatible response format
response_dict = {
"id": f"chatcmpl-{int(time.time()*1000)}",
@@ -1871,7 +1927,8 @@
"usage": {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": completion_tokens + prompt_tokens
"total_tokens": completion_tokens + prompt_tokens,
"reasoning_tokens": total_reasoning_tokens
}
}

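With the changes above, the local inference path reports reasoning tokens in its OpenAI-style usage block, nested under completion_tokens_details by model_dump(). A sketch of how a client might read the new field; the response dict and its numbers are hypothetical, shaped like the model_dump output in this diff:

# Hypothetical response in the shape produced by ChatCompletion.model_dump() above;
# the field values are made up for illustration.
response = {
    "model": "local-model",
    "choices": [
        {"message": {"role": "assistant", "content": "<think>...</think>42"}}
    ],
    "usage": {
        "prompt_tokens": 120,
        "completion_tokens": 350,
        "total_tokens": 470,
        "completion_tokens_details": {"reasoning_tokens": 280},
    },
}

details = response["usage"].get("completion_tokens_details", {})
print(details.get("reasoning_tokens", 0))   # tokens spent inside <think> blocks (280 here)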
2 changes: 1 addition & 1 deletion optillm/plugins/deep_research/research_engine.py
@@ -375,7 +375,7 @@ def decompose_query(self, system_prompt: str, initial_query: str) -> List[str]:
for line in content.split('\n'):
line = line.strip()
if re.match(r'^\d+\.', line):
query = re.sub(r'^\d+\.\s*', '', line).strip()
query = re.sub(r'^\d+\.\s*\[?(.*?)\]?$', r'\1', line).strip()
if query:
queries.append(query)

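The updated regex in decompose_query strips an optional pair of square brackets around each numbered sub-query in addition to the leading number, which the old pattern left in place. A quick before/after check on a made-up model output line:

import re

line = "2. [How does zstd compression affect image pull times?]"   # illustrative model output

old = re.sub(r'^\d+\.\s*', '', line).strip()
new = re.sub(r'^\d+\.\s*\[?(.*?)\]?$', r'\1', line).strip()

print(old)   # [How does zstd compression affect image pull times?]  (brackets kept)
print(new)   # How does zstd compression affect image pull times?    (brackets stripped)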
7 changes: 6 additions & 1 deletion optillm/plugins/deepthink/__init__.py
@@ -3,4 +3,9 @@

A plugin that combines SELF-DISCOVER framework with uncertainty-routed
chain-of-thought for enhanced reasoning capabilities.
"""
"""

from .self_discover import SelfDiscover
from .uncertainty_cot import UncertaintyRoutedCoT

__all__ = ['SelfDiscover', 'UncertaintyRoutedCoT']
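The deepthink plugin package now re-exports its two building blocks at the package level. A minimal import check, assuming optillm and the plugin's own dependencies are installed; no constructor arguments are shown because the class signatures are not part of this diff:

# Assumes the optillm package from this PR (and the plugin's dependencies) is installed.
from optillm.plugins.deepthink import SelfDiscover, UncertaintyRoutedCoT

print(SelfDiscover.__name__, UncertaintyRoutedCoT.__name__)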