66 changes: 12 additions & 54 deletions .github/workflows/publish.yml
@@ -79,48 +79,26 @@ jobs:
type=semver,pattern={{major}}.{{minor}}
type=raw,value=latest

# Build and push proxy AMD64
- name: Build and push proxy_only Docker image AMD64
# Build and push proxy_only multi-arch
- name: Build and push proxy_only Docker image (multi-arch)
uses: docker/build-push-action@v5
with:
context: .
file: Dockerfile.proxy_only
push: true
platforms: linux/amd64
platforms: linux/amd64,linux/arm64
tags: ${{ steps.meta-proxy.outputs.tags }}
labels: ${{ steps.meta-proxy.outputs.labels }}
cache-from: type=gha,scope=proxy-amd64
cache-to: type=gha,scope=proxy-amd64,mode=max
cache-from: type=gha
cache-to: type=gha,mode=max
outputs: type=registry,compression=zstd,compression-level=5

# Cleanup after AMD64 build
- name: Cleanup after AMD64 build
# Cleanup after proxy build
- name: Cleanup after proxy build
run: |
docker system prune -af
docker builder prune -af
df -h

# Build proxy ARM64
- name: Build and push proxy_only Docker image ARM64
uses: docker/build-push-action@v5
with:
context: .
file: Dockerfile.proxy_only
push: true
platforms: linux/arm64
tags: ${{ steps.meta-proxy.outputs.tags }}
labels: ${{ steps.meta-proxy.outputs.labels }}
cache-from: type=gha,scope=proxy-arm64
cache-to: type=gha,scope=proxy-arm64,mode=max
outputs: type=registry,compression=zstd,compression-level=5

# Cleanup after proxy builds
- name: Cleanup after proxy builds
run: |
docker system prune -af
docker builder prune -af
find /tmp -type f -user $(id -u) -exec rm -f {} + 2>/dev/null || true
df -h

# Extract metadata for full image
- name: Extract metadata for Docker
@@ -133,35 +111,15 @@ jobs:
type=semver,pattern={{major}}.{{minor}}
latest

# Build full image AMD64
- name: Build and push Docker image AMD64
uses: docker/build-push-action@v5
with:
context: .
push: true
platforms: linux/amd64
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha,scope=full-amd64
cache-to: type=gha,scope=full-amd64,mode=max
outputs: type=registry,compression=zstd,compression-level=5

# Cleanup between architectures
- name: Cleanup between architectures
run: |
docker system prune -af
docker builder prune -af
df -h

# Build full image ARM64
- name: Build and push Docker image ARM64
# Build full image multi-arch
- name: Build and push Docker image (multi-arch)
uses: docker/build-push-action@v5
with:
context: .
push: true
platforms: linux/arm64
platforms: linux/amd64,linux/arm64
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha,scope=full-arm64
cache-to: type=gha,scope=full-arm64,mode=max
cache-from: type=gha
cache-to: type=gha,mode=max
outputs: type=registry,compression=zstd,compression-level=5
57 changes: 57 additions & 0 deletions optillm.py
@@ -93,6 +93,52 @@ def get_config():
default_client = LiteLLMWrapper()
return default_client, API_KEY

def count_reasoning_tokens(text: str, tokenizer=None) -> int:
"""
Count tokens within <think>...</think> tags in the given text.

Args:
text: The text to analyze
tokenizer: Optional tokenizer instance for precise counting

Returns:
Number of reasoning tokens (0 if no think tags found)
"""
if not text or not isinstance(text, str):
return 0

# Extract all content within <think>...</think> tags
# Handle both complete and truncated think blocks

# First, find all complete <think>...</think> blocks
complete_pattern = r'<think>(.*?)</think>'
complete_matches = re.findall(complete_pattern, text, re.DOTALL)

# Then check for unclosed <think> tag (truncated response)
# This finds <think> that doesn't have a matching </think> after it
truncated_pattern = r'<think>(?!.*</think>)(.*)$'
truncated_match = re.search(truncated_pattern, text, re.DOTALL)

# Combine all thinking content
thinking_content = ''.join(complete_matches)
if truncated_match:
thinking_content += truncated_match.group(1)

if not thinking_content:
return 0

if tokenizer and hasattr(tokenizer, 'encode'):
# Use tokenizer for precise counting
try:
tokens = tokenizer.encode(thinking_content)
return len(tokens)
except Exception as e:
logger.warning(f"Failed to count tokens with tokenizer: {e}")

# Fallback: rough estimation (4 chars per token on average, minimum 1 token for non-empty content)
content_length = len(thinking_content.strip())
return max(1, content_length // 4) if content_length > 0 else 0

# Server configuration
server_config = {
'approach': 'none',
@@ -678,11 +724,22 @@ def proxy():
if stream:
return Response(generate_streaming_response(response, model), content_type='text/event-stream')
else:
# Calculate reasoning tokens from the response
reasoning_tokens = 0
if isinstance(response, str):
reasoning_tokens = count_reasoning_tokens(response)
elif isinstance(response, list) and response:
# For multiple responses, sum up reasoning tokens from all
reasoning_tokens = sum(count_reasoning_tokens(resp) for resp in response if isinstance(resp, str))

response_data = {
'model': model,
'choices': [],
'usage': {
'completion_tokens': completion_tokens,
'completion_tokens_details': {
'reasoning_tokens': reasoning_tokens
}
}
}

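The count_reasoning_tokens helper added above extracts everything inside <think>...</think> blocks (including a trailing block that was never closed), then counts tokens with the supplied tokenizer or falls back to a rough 4-characters-per-token estimate. A minimal usage sketch, assuming the optillm package with this change is installed; the sample strings and the commented-out tokenizer line are illustrative only:

# Illustrative sketch; assumes the optillm package from this PR is importable.
from optillm import count_reasoning_tokens

# A complete <think> block: only the inner content is counted.
full = "<think>Restate the problem, then try the small cases first.</think>The answer is 42."
print(count_reasoning_tokens(full))   # rough 4-chars-per-token estimate without a tokenizer

# An unclosed <think> block (truncated generation) is still counted.
cut = "<think>Partial chain of thought that was cut o"
print(count_reasoning_tokens(cut))    # > 0

# No think tags at all -> 0.
print(count_reasoning_tokens("Plain answer, no reasoning tags."))   # 0

# Passing a tokenizer with an encode() method (e.g. a HuggingFace tokenizer)
# switches to exact token counts:
# print(count_reasoning_tokens(full, tokenizer=my_tokenizer))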
4 changes: 3 additions & 1 deletion optillm/__init__.py
@@ -2,7 +2,7 @@
import os

# Version information
__version__ = "0.1.22"
__version__ = "0.1.26"

# Get the path to the root optillm.py
spec = util.spec_from_file_location(
@@ -27,6 +27,7 @@
extract_optillm_approach = module.extract_optillm_approach
get_config = module.get_config
load_plugins = module.load_plugins
count_reasoning_tokens = module.count_reasoning_tokens

# Export execution functions
execute_single_approach = module.execute_single_approach
@@ -48,6 +49,7 @@
'extract_optillm_approach',
'get_config',
'load_plugins',
'count_reasoning_tokens',
'execute_single_approach',
'execute_combined_approaches',
'execute_parallel_approaches',
67 changes: 62 additions & 5 deletions optillm/inference.py
@@ -18,6 +18,7 @@
import traceback
import platform
import sys
import re

from optillm.cot_decoding import cot_decode
from optillm.entropy_decoding import entropy_decode
@@ -29,6 +30,52 @@
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def count_reasoning_tokens(text: str, tokenizer=None) -> int:
"""
Count tokens within <think>...</think> tags in the given text.

Args:
text: The text to analyze
tokenizer: Optional tokenizer instance for precise counting

Returns:
Number of reasoning tokens (0 if no think tags found)
"""
if not text or not isinstance(text, str):
return 0

# Extract all content within <think>...</think> tags
# Handle both complete and truncated think blocks

# First, find all complete <think>...</think> blocks
complete_pattern = r'<think>(.*?)</think>'
complete_matches = re.findall(complete_pattern, text, re.DOTALL)

# Then check for unclosed <think> tag (truncated response)
# This finds <think> that doesn't have a matching </think> after it
truncated_pattern = r'<think>(?!.*</think>)(.*)$'
truncated_match = re.search(truncated_pattern, text, re.DOTALL)

# Combine all thinking content
thinking_content = ''.join(complete_matches)
if truncated_match:
thinking_content += truncated_match.group(1)

if not thinking_content:
return 0

if tokenizer and hasattr(tokenizer, 'encode'):
# Use tokenizer for precise counting
try:
tokens = tokenizer.encode(thinking_content)
return len(tokens)
except Exception as e:
logger.warning(f"Failed to count tokens with tokenizer: {e}")

# Fallback: rough estimation (4 chars per token on average, minimum 1 token for non-empty content)
content_length = len(thinking_content.strip())
return max(1, content_length // 4) if content_length > 0 else 0

# MLX Support for Apple Silicon
try:
import mlx.core as mx
@@ -1502,10 +1549,11 @@ def __init__(
self.message.logprobs = logprobs

class ChatCompletionUsage:
def __init__(self, prompt_tokens: int, completion_tokens: int, total_tokens: int):
def __init__(self, prompt_tokens: int, completion_tokens: int, total_tokens: int, reasoning_tokens: int = 0):
self.prompt_tokens = prompt_tokens
self.completion_tokens = completion_tokens
self.total_tokens = total_tokens
self.reasoning_tokens = reasoning_tokens

class ChatCompletion:
def __init__(self, response_dict: Dict):
@@ -1547,7 +1595,10 @@ def model_dump(self) -> Dict:
"usage": {
"prompt_tokens": self.usage.prompt_tokens,
"completion_tokens": self.usage.completion_tokens,
"total_tokens": self.usage.total_tokens
"total_tokens": self.usage.total_tokens,
"completion_tokens_details": {
"reasoning_tokens": getattr(self.usage, 'reasoning_tokens', 0)
}
}
}

@@ -1766,15 +1817,15 @@ def create(

logger.debug(f"ThinkDeeper tokens: user={user_max_tokens}, thinking={max_thinking_tokens}, adjusted={adjusted_max_tokens}")

result = thinkdeeper_decode_mlx(
result, reasoning_tokens = thinkdeeper_decode_mlx(
pipeline.model,
pipeline.tokenizer,
messages,
thinkdeeper_config_with_tokens
)
else:
logger.info("Using PyTorch ThinkDeeper implementation")
result = thinkdeeper_decode(
result, reasoning_tokens = thinkdeeper_decode(
pipeline.current_model,
pipeline.tokenizer,
messages,
@@ -1850,6 +1901,11 @@ def create(
prompt_tokens = len(pipeline.tokenizer.encode(prompt))
completion_tokens = sum(token_counts)

# Calculate reasoning tokens from all responses
total_reasoning_tokens = 0
for response in responses:
total_reasoning_tokens += count_reasoning_tokens(response, pipeline.tokenizer)

# Create OpenAI-compatible response format
response_dict = {
"id": f"chatcmpl-{int(time.time()*1000)}",
@@ -1871,7 +1927,8 @@
"usage": {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": completion_tokens + prompt_tokens
"total_tokens": completion_tokens + prompt_tokens,
"reasoning_tokens": total_reasoning_tokens
}
}

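With the changes above, the local inference path reports reasoning tokens in its OpenAI-style usage block, nested under completion_tokens_details by model_dump(). A sketch of how a client might read the new field; the response dict and its numbers are hypothetical, shaped like the model_dump output in this diff:

# Hypothetical response in the shape produced by ChatCompletion.model_dump() above;
# the field values are made up for illustration.
response = {
    "model": "local-model",
    "choices": [
        {"message": {"role": "assistant", "content": "<think>...</think>42"}}
    ],
    "usage": {
        "prompt_tokens": 120,
        "completion_tokens": 350,
        "total_tokens": 470,
        "completion_tokens_details": {"reasoning_tokens": 280},
    },
}

details = response["usage"].get("completion_tokens_details", {})
print(details.get("reasoning_tokens", 0))   # tokens spent inside <think> blocks (280 here)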
2 changes: 1 addition & 1 deletion optillm/plugins/deep_research/research_engine.py
@@ -375,7 +375,7 @@ def decompose_query(self, system_prompt: str, initial_query: str) -> List[str]:
for line in content.split('\n'):
line = line.strip()
if re.match(r'^\d+\.', line):
query = re.sub(r'^\d+\.\s*', '', line).strip()
query = re.sub(r'^\d+\.\s*\[?(.*?)\]?$', r'\1', line).strip()
if query:
queries.append(query)

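The updated regex in decompose_query strips an optional pair of square brackets around each numbered sub-query in addition to the leading number, which the old pattern left in place. A quick before/after check on a made-up model output line:

import re

line = "2. [How does zstd compression affect image pull times?]"   # illustrative model output

old = re.sub(r'^\d+\.\s*', '', line).strip()
new = re.sub(r'^\d+\.\s*\[?(.*?)\]?$', r'\1', line).strip()

print(old)   # [How does zstd compression affect image pull times?]  (brackets kept)
print(new)   # How does zstd compression affect image pull times?    (brackets stripped)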
7 changes: 6 additions & 1 deletion optillm/plugins/deepthink/__init__.py
@@ -3,4 +3,9 @@

A plugin that combines SELF-DISCOVER framework with uncertainty-routed
chain-of-thought for enhanced reasoning capabilities.
"""
"""

from .self_discover import SelfDiscover
from .uncertainty_cot import UncertaintyRoutedCoT

__all__ = ['SelfDiscover', 'UncertaintyRoutedCoT']
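The deepthink plugin package now re-exports its two building blocks at the package level. A minimal import check, assuming optillm and the plugin's own dependencies are installed; no constructor arguments are shown because the class signatures are not part of this diff:

# Assumes the optillm package from this PR (and the plugin's dependencies) is installed.
from optillm.plugins.deepthink import SelfDiscover, UncertaintyRoutedCoT

print(SelfDiscover.__name__, UncertaintyRoutedCoT.__name__)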