Commit af093a3

Merge pull request #221 from codelion/fix-loading-bugs
Fix loading bugs
2 parents 050a14c + 23a16bf commit af093a3

26 files changed: +1926 −186 lines

.github/workflows/publish.yml

Lines changed: 12 additions & 54 deletions
@@ -79,48 +79,26 @@ jobs:
             type=semver,pattern={{major}}.{{minor}}
             type=raw,value=latest

-      # Build and push proxy AMD64
-      - name: Build and push proxy_only Docker image AMD64
+      # Build and push proxy_only multi-arch
+      - name: Build and push proxy_only Docker image (multi-arch)
         uses: docker/build-push-action@v5
         with:
           context: .
           file: Dockerfile.proxy_only
           push: true
-          platforms: linux/amd64
+          platforms: linux/amd64,linux/arm64
           tags: ${{ steps.meta-proxy.outputs.tags }}
           labels: ${{ steps.meta-proxy.outputs.labels }}
-          cache-from: type=gha,scope=proxy-amd64
-          cache-to: type=gha,scope=proxy-amd64,mode=max
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
           outputs: type=registry,compression=zstd,compression-level=5

-      # Cleanup after AMD64 build
-      - name: Cleanup after AMD64 build
+      # Cleanup after proxy build
+      - name: Cleanup after proxy build
         run: |
           docker system prune -af
           docker builder prune -af
           df -h
-
-      # Build proxy ARM64
-      - name: Build and push proxy_only Docker image ARM64
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          file: Dockerfile.proxy_only
-          push: true
-          platforms: linux/arm64
-          tags: ${{ steps.meta-proxy.outputs.tags }}
-          labels: ${{ steps.meta-proxy.outputs.labels }}
-          cache-from: type=gha,scope=proxy-arm64
-          cache-to: type=gha,scope=proxy-arm64,mode=max
-          outputs: type=registry,compression=zstd,compression-level=5
-
-      # Cleanup after proxy builds
-      - name: Cleanup after proxy builds
-        run: |
-          docker system prune -af
-          docker builder prune -af
-          find /tmp -type f -user $(id -u) -exec rm -f {} + 2>/dev/null || true
-          df -h

       # Extract metadata for full image
       - name: Extract metadata for Docker
@@ -133,35 +111,15 @@ jobs:
             type=semver,pattern={{major}}.{{minor}}
             latest

-      # Build full image AMD64
-      - name: Build and push Docker image AMD64
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          push: true
-          platforms: linux/amd64
-          tags: ${{ steps.meta.outputs.tags }}
-          labels: ${{ steps.meta.outputs.labels }}
-          cache-from: type=gha,scope=full-amd64
-          cache-to: type=gha,scope=full-amd64,mode=max
-          outputs: type=registry,compression=zstd,compression-level=5
-
-      # Cleanup between architectures
-      - name: Cleanup between architectures
-        run: |
-          docker system prune -af
-          docker builder prune -af
-          df -h
-
-      # Build full image ARM64
-      - name: Build and push Docker image ARM64
+      # Build full image multi-arch
+      - name: Build and push Docker image (multi-arch)
         uses: docker/build-push-action@v5
         with:
           context: .
           push: true
-          platforms: linux/arm64
+          platforms: linux/amd64,linux/arm64
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
-          cache-from: type=gha,scope=full-arm64
-          cache-to: type=gha,scope=full-arm64,mode=max
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
           outputs: type=registry,compression=zstd,compression-level=5

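The net effect of the workflow change is that each image is built and pushed once for both platforms instead of once per architecture. A quick way to sanity-check the published image is to read its registry manifest list; the Python sketch below does this via the docker CLI (a sketch only: it assumes the docker CLI is installed and authenticated, and the image tag shown is a hypothetical placeholder, not taken from this workflow):

# Sketch: confirm a pushed multi-arch image covers both platforms.
# Assumes the docker CLI is available and logged in; the tag below is hypothetical.
import json
import subprocess

def image_platforms(image: str) -> set:
    """Return the os/arch pairs listed in the image's manifest list."""
    out = subprocess.run(
        ["docker", "manifest", "inspect", image],
        capture_output=True, text=True, check=True,
    ).stdout
    return {
        f"{m['platform']['os']}/{m['platform']['architecture']}"
        for m in json.loads(out).get("manifests", [])
    }

# Example (hypothetical tag):
# assert {"linux/amd64", "linux/arm64"} <= image_platforms("ghcr.io/codelion/optillm:latest")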
optillm.py

Lines changed: 57 additions & 0 deletions
@@ -93,6 +93,52 @@ def get_config():
         default_client = LiteLLMWrapper()
     return default_client, API_KEY

+def count_reasoning_tokens(text: str, tokenizer=None) -> int:
+    """
+    Count tokens within <think>...</think> tags in the given text.
+
+    Args:
+        text: The text to analyze
+        tokenizer: Optional tokenizer instance for precise counting
+
+    Returns:
+        Number of reasoning tokens (0 if no think tags found)
+    """
+    if not text or not isinstance(text, str):
+        return 0
+
+    # Extract all content within <think>...</think> tags
+    # Handle both complete and truncated think blocks
+
+    # First, find all complete <think>...</think> blocks
+    complete_pattern = r'<think>(.*?)</think>'
+    complete_matches = re.findall(complete_pattern, text, re.DOTALL)
+
+    # Then check for unclosed <think> tag (truncated response)
+    # This finds <think> that doesn't have a matching </think> after it
+    truncated_pattern = r'<think>(?!.*</think>)(.*)$'
+    truncated_match = re.search(truncated_pattern, text, re.DOTALL)
+
+    # Combine all thinking content
+    thinking_content = ''.join(complete_matches)
+    if truncated_match:
+        thinking_content += truncated_match.group(1)
+
+    if not thinking_content:
+        return 0
+
+    if tokenizer and hasattr(tokenizer, 'encode'):
+        # Use tokenizer for precise counting
+        try:
+            tokens = tokenizer.encode(thinking_content)
+            return len(tokens)
+        except Exception as e:
+            logger.warning(f"Failed to count tokens with tokenizer: {e}")
+
+    # Fallback: rough estimation (4 chars per token on average, minimum 1 token for non-empty content)
+    content_length = len(thinking_content.strip())
+    return max(1, content_length // 4) if content_length > 0 else 0
+
 # Server configuration
 server_config = {
     'approach': 'none',
@@ -678,11 +724,22 @@ def proxy():
     if stream:
         return Response(generate_streaming_response(response, model), content_type='text/event-stream')
     else:
+        # Calculate reasoning tokens from the response
+        reasoning_tokens = 0
+        if isinstance(response, str):
+            reasoning_tokens = count_reasoning_tokens(response)
+        elif isinstance(response, list) and response:
+            # For multiple responses, sum up reasoning tokens from all
+            reasoning_tokens = sum(count_reasoning_tokens(resp) for resp in response if isinstance(resp, str))
+
         response_data = {
             'model': model,
             'choices': [],
             'usage': {
                 'completion_tokens': completion_tokens,
+                'completion_tokens_details': {
+                    'reasoning_tokens': reasoning_tokens
+                }
             }
         }

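The new count_reasoning_tokens helper handles three cases: complete <think>...</think> blocks, a truncated block with no closing tag, and text with no tags at all. A minimal usage sketch (importing via the package-level re-export added below; the sample strings are made up, and the counts shown come from the ~4-characters-per-token fallback since no tokenizer is passed):

# Minimal sketch of count_reasoning_tokens; inputs are illustrative.
from optillm import count_reasoning_tokens

# Complete block: only the contents of the tags are counted.
print(count_reasoning_tokens("<think>add 2 and 2</think>The answer is 4."))  # 2 (11 chars // 4)

# Truncated block (response cut off before </think>) still counts.
print(count_reasoning_tokens("<think>reasoning that was cut off"))  # 6 (26 chars // 4)

# No think tags at all.
print(count_reasoning_tokens("plain answer"))  # 0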
optillm/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -2,7 +2,7 @@
 import os

 # Version information
-__version__ = "0.1.22"
+__version__ = "0.1.26"

 # Get the path to the root optillm.py
 spec = util.spec_from_file_location(
@@ -27,6 +27,7 @@
 extract_optillm_approach = module.extract_optillm_approach
 get_config = module.get_config
 load_plugins = module.load_plugins
+count_reasoning_tokens = module.count_reasoning_tokens

 # Export execution functions
 execute_single_approach = module.execute_single_approach
@@ -48,6 +49,7 @@
     'extract_optillm_approach',
     'get_config',
     'load_plugins',
+    'count_reasoning_tokens',
     'execute_single_approach',
     'execute_combined_approaches',
     'execute_parallel_approaches',

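With the re-export in place, the helper sits on the package root next to the bumped version string; a two-line check (the expected outputs follow directly from the diff above):

import optillm

print(optillm.__version__)                                     # "0.1.26"
print(optillm.count_reasoning_tokens("<think>abcd</think>x"))  # 1 (4 chars // 4)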
optillm/inference.py

Lines changed: 62 additions & 5 deletions
@@ -18,6 +18,7 @@
 import traceback
 import platform
 import sys
+import re

 from optillm.cot_decoding import cot_decode
 from optillm.entropy_decoding import entropy_decode
@@ -29,6 +30,52 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

+def count_reasoning_tokens(text: str, tokenizer=None) -> int:
+    """
+    Count tokens within <think>...</think> tags in the given text.
+
+    Args:
+        text: The text to analyze
+        tokenizer: Optional tokenizer instance for precise counting
+
+    Returns:
+        Number of reasoning tokens (0 if no think tags found)
+    """
+    if not text or not isinstance(text, str):
+        return 0
+
+    # Extract all content within <think>...</think> tags
+    # Handle both complete and truncated think blocks
+
+    # First, find all complete <think>...</think> blocks
+    complete_pattern = r'<think>(.*?)</think>'
+    complete_matches = re.findall(complete_pattern, text, re.DOTALL)
+
+    # Then check for unclosed <think> tag (truncated response)
+    # This finds <think> that doesn't have a matching </think> after it
+    truncated_pattern = r'<think>(?!.*</think>)(.*)$'
+    truncated_match = re.search(truncated_pattern, text, re.DOTALL)
+
+    # Combine all thinking content
+    thinking_content = ''.join(complete_matches)
+    if truncated_match:
+        thinking_content += truncated_match.group(1)
+
+    if not thinking_content:
+        return 0
+
+    if tokenizer and hasattr(tokenizer, 'encode'):
+        # Use tokenizer for precise counting
+        try:
+            tokens = tokenizer.encode(thinking_content)
+            return len(tokens)
+        except Exception as e:
+            logger.warning(f"Failed to count tokens with tokenizer: {e}")
+
+    # Fallback: rough estimation (4 chars per token on average, minimum 1 token for non-empty content)
+    content_length = len(thinking_content.strip())
+    return max(1, content_length // 4) if content_length > 0 else 0
+
 # MLX Support for Apple Silicon
 try:
     import mlx.core as mx
@@ -1502,10 +1549,11 @@ def __init__(
         self.message.logprobs = logprobs

 class ChatCompletionUsage:
-    def __init__(self, prompt_tokens: int, completion_tokens: int, total_tokens: int):
+    def __init__(self, prompt_tokens: int, completion_tokens: int, total_tokens: int, reasoning_tokens: int = 0):
         self.prompt_tokens = prompt_tokens
         self.completion_tokens = completion_tokens
         self.total_tokens = total_tokens
+        self.reasoning_tokens = reasoning_tokens

 class ChatCompletion:
     def __init__(self, response_dict: Dict):
@@ -1547,7 +1595,10 @@ def model_dump(self) -> Dict:
             "usage": {
                 "prompt_tokens": self.usage.prompt_tokens,
                 "completion_tokens": self.usage.completion_tokens,
-                "total_tokens": self.usage.total_tokens
+                "total_tokens": self.usage.total_tokens,
+                "completion_tokens_details": {
+                    "reasoning_tokens": getattr(self.usage, 'reasoning_tokens', 0)
+                }
             }
         }

@@ -1766,15 +1817,15 @@ def create(

             logger.debug(f"ThinkDeeper tokens: user={user_max_tokens}, thinking={max_thinking_tokens}, adjusted={adjusted_max_tokens}")

-            result = thinkdeeper_decode_mlx(
+            result, reasoning_tokens = thinkdeeper_decode_mlx(
                 pipeline.model,
                 pipeline.tokenizer,
                 messages,
                 thinkdeeper_config_with_tokens
             )
         else:
             logger.info("Using PyTorch ThinkDeeper implementation")
-            result = thinkdeeper_decode(
+            result, reasoning_tokens = thinkdeeper_decode(
                 pipeline.current_model,
                 pipeline.tokenizer,
                 messages,
@@ -1850,6 +1901,11 @@ def create(
         prompt_tokens = len(pipeline.tokenizer.encode(prompt))
         completion_tokens = sum(token_counts)

+        # Calculate reasoning tokens from all responses
+        total_reasoning_tokens = 0
+        for response in responses:
+            total_reasoning_tokens += count_reasoning_tokens(response, pipeline.tokenizer)
+
         # Create OpenAI-compatible response format
         response_dict = {
             "id": f"chatcmpl-{int(time.time()*1000)}",
@@ -1871,7 +1927,8 @@
             "usage": {
                 "prompt_tokens": prompt_tokens,
                 "completion_tokens": completion_tokens,
-                "total_tokens": completion_tokens + prompt_tokens
+                "total_tokens": completion_tokens + prompt_tokens,
+                "reasoning_tokens": total_reasoning_tokens
             }
         }

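Taken together, these hunks thread reasoning-token counts from the decoders into the OpenAI-style usage payload. A small sketch of the extended ChatCompletionUsage (the class and its new keyword come from this diff; the numbers are made up for illustration):

# Sketch of the extended usage object; values are illustrative only.
from optillm.inference import ChatCompletionUsage

usage = ChatCompletionUsage(
    prompt_tokens=12,
    completion_tokens=100,
    total_tokens=112,
    reasoning_tokens=40,  # new keyword; defaults to 0 so older callers still work
)
print(usage.reasoning_tokens)  # 40
# ChatCompletion.model_dump() then nests this value as
# "completion_tokens_details": {"reasoning_tokens": 40} inside "usage".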
optillm/plugins/deep_research/research_engine.py

Lines changed: 1 addition & 1 deletion
@@ -375,7 +375,7 @@ def decompose_query(self, system_prompt: str, initial_query: str) -> List[str]:
         for line in content.split('\n'):
             line = line.strip()
             if re.match(r'^\d+\.', line):
-                query = re.sub(r'^\d+\.\s*', '', line).strip()
+                query = re.sub(r'^\d+\.\s*\[?(.*?)\]?$', r'\1', line).strip()
                 if query:
                     queries.append(query)

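The widened pattern keeps the old behavior for plain numbered lines while also unwrapping queries the model emits in square brackets; a quick standalone check (the example lines are made up):

import re

# The updated substitution from research_engine.py, applied to two sample lines.
for line in ["1. What is test-time compute?", "2. [How does CoT scaling work?]"]:
    print(re.sub(r'^\d+\.\s*\[?(.*?)\]?$', r'\1', line).strip())
# -> What is test-time compute?
# -> How does CoT scaling work?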
optillm/plugins/deepthink/__init__.py

Lines changed: 6 additions & 1 deletion
@@ -3,4 +3,9 @@

 A plugin that combines SELF-DISCOVER framework with uncertainty-routed
 chain-of-thought for enhanced reasoning capabilities.
-"""
+"""
+
+from .self_discover import SelfDiscover
+from .uncertainty_cot import UncertaintyRoutedCoT
+
+__all__ = ['SelfDiscover', 'UncertaintyRoutedCoT']

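These re-exports make the plugin's classes importable from the package itself, presumably one of the loading bugs this PR fixes:

# Before this change, importing the classes from the package would raise
# ImportError, since __init__.py defined no names; now both resolve directly.
from optillm.plugins.deepthink import SelfDiscover, UncertaintyRoutedCoT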