Commit 0465bf5

Merge pull request #245 from codelion/fix-support-per-provider-concurrency
Fix support per provider concurrency
2 parents 2f99f9b + 01ed43c commit 0465bf5

File tree

- optillm/__init__.py
- optillm/plugins/proxy/README.md
- optillm/plugins/proxy/client.py
- optillm/plugins/proxy/config.py
- optillm/plugins/proxy/google_client.py
- pyproject.toml

6 files changed: +186, -2 lines

optillm/__init__.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,5 +1,5 @@
 # Version information
-__version__ = "0.2.6"
+__version__ = "0.2.7"
 
 # Import from server module
 from .server import (
```

optillm/plugins/proxy/README.md

Lines changed: 35 additions & 0 deletions
````diff
@@ -33,13 +33,15 @@ providers:
     base_url: https://api.openai.com/v1
     api_key: ${OPENAI_API_KEY}
     weight: 2
+    max_concurrent: 5  # Optional: limit this provider to 5 concurrent requests
     model_map:
       gpt-4: gpt-4-turbo-preview  # Optional: map model names
 
   - name: backup
     base_url: https://api.openai.com/v1
     api_key: ${OPENAI_API_KEY_BACKUP}
     weight: 1
+    max_concurrent: 2  # Optional: limit this provider to 2 concurrent requests
 
 routing:
   strategy: weighted  # Options: weighted, round_robin, failover
@@ -189,6 +191,39 @@
 - **Automatic Failover**: When a provider times out, it's marked unhealthy and the request automatically fails over to the next available provider.
 - **Protection**: Prevents slow backends from causing queue buildup that can crash the proxy server.
 
+### Per-Provider Concurrency Limits
+
+Control the maximum number of concurrent requests each provider can handle:
+
+```yaml
+providers:
+  - name: slow_server
+    base_url: http://192.168.1.100:8080/v1
+    api_key: dummy
+    max_concurrent: 1  # This server can only handle 1 request at a time
+
+  - name: fast_server
+    base_url: https://api.fast.com/v1
+    api_key: ${API_KEY}
+    max_concurrent: 10  # This server can handle 10 concurrent requests
+
+  - name: unlimited_server
+    base_url: https://api.unlimited.com/v1
+    api_key: ${API_KEY}
+    # No max_concurrent means no limit for this provider
+```
+
+**Use Cases:**
+- **Hardware-limited servers**: Set `max_concurrent: 1` for servers that can't handle parallel requests
+- **Rate limiting**: Prevent overwhelming providers with too many concurrent requests
+- **Resource management**: Balance load across providers with different capacities
+- **Cost control**: Limit expensive providers while allowing more requests to cheaper ones
+
+**Behavior:**
+- If a provider is at max capacity, the proxy tries the next available provider
+- Requests wait briefly (0.5s) for a slot before moving to the next provider
+- Works with all routing strategies (weighted, round_robin, failover)
+
 ### Environment Variables
 
 The configuration supports flexible environment variable interpolation:
````

optillm/plugins/proxy/client.py

Lines changed: 49 additions & 0 deletions
```diff
@@ -27,6 +27,14 @@ def __init__(self, config: Dict):
         self.last_error = None
         self.latencies = []  # Track recent latencies
 
+        # Per-provider concurrency control
+        self.max_concurrent = config.get('max_concurrent', None)  # None means no limit
+        if self.max_concurrent is not None:
+            self._semaphore = threading.Semaphore(self.max_concurrent)
+            logger.info(f"Provider {self.name} limited to {self.max_concurrent} concurrent requests")
+        else:
+            self._semaphore = None
+
     @property
     def client(self):
         """Lazy initialization of OpenAI client"""
@@ -39,6 +47,13 @@ def client(self):
                 api_version="2024-02-01",
                 max_retries=0  # Disable client retries - we handle them
             )
+        elif 'generativelanguage.googleapis.com' in self.base_url:
+            # Google AI client - create custom client to avoid "models/" prefix
+            from optillm.plugins.proxy.google_client import GoogleAIClient
+            self._client = GoogleAIClient(
+                api_key=self.api_key,
+                base_url=self.base_url
+            )
         else:
             # Standard OpenAI-compatible client
             self._client = OpenAI(
@@ -63,6 +78,28 @@ def avg_latency(self) -> float:
         if not self.latencies:
             return 0
         return sum(self.latencies) / len(self.latencies)
+
+    def acquire_slot(self, timeout: Optional[float] = None) -> bool:
+        """
+        Try to acquire a slot for this provider.
+        Returns True if acquired, False if timeout or no limit.
+        """
+        if self._semaphore is None:
+            return True  # No limit, always available
+
+        return self._semaphore.acquire(blocking=True, timeout=timeout)
+
+    def release_slot(self):
+        """Release a slot for this provider."""
+        if self._semaphore is not None:
+            self._semaphore.release()
+
+    def available_slots(self) -> Optional[int]:
+        """Get number of available slots, None if unlimited."""
+        if self._semaphore is None:
+            return None
+        # Note: _value is internal but there's no public method to check availability
+        return self._semaphore._value
 
 class ProxyClient:
     """OpenAI-compatible client that proxies to multiple providers"""
@@ -185,6 +222,13 @@ def create(self, **kwargs):
 
             attempted_providers.add(provider)
 
+            # Try to acquire a slot for this provider (with reasonable timeout for queueing)
+            slot_timeout = 10.0  # Wait up to 10 seconds for provider to become available
+            if not provider.acquire_slot(timeout=slot_timeout):
+                logger.debug(f"Provider {provider.name} at max capacity, trying next provider")
+                errors.append((provider.name, "At max concurrent requests"))
+                continue
+
             try:
                 # Map model name if needed and filter out OptiLLM-specific parameters
                 request_kwargs = self._filter_kwargs(kwargs.copy())
@@ -225,6 +269,11 @@ def create(self, **kwargs):
                 if self.proxy_client.track_errors:
                     provider.is_healthy = False
                     provider.last_error = str(e)
+
+            finally:
+                # Always release the provider slot
+                provider.release_slot()
+                logger.debug(f"Released slot for provider {provider.name}")
 
         # All providers failed, try fallback client
         if self.proxy_client.fallback_client:
```
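For orientation, here is a minimal, self-contained sketch of the pattern the diff above introduces: a per-provider `threading.Semaphore` acquired with a timeout before each request and always released in a `finally` block. The `_ToyProvider` class, timings, and names are illustrative only and are not part of the optillm codebase.

```python
import threading
import time
from typing import Optional


class _ToyProvider:
    """Hypothetical stand-in for the Provider class above, reduced to slot handling."""

    def __init__(self, name: str, max_concurrent: Optional[int] = None):
        self.name = name
        # A semaphore only exists when a limit is configured; None means unlimited
        self._semaphore = threading.Semaphore(max_concurrent) if max_concurrent else None

    def acquire_slot(self, timeout: Optional[float] = None) -> bool:
        if self._semaphore is None:
            return True  # unlimited provider: always available
        return self._semaphore.acquire(blocking=True, timeout=timeout)

    def release_slot(self) -> None:
        if self._semaphore is not None:
            self._semaphore.release()


def worker(provider: _ToyProvider, request_id: int) -> None:
    # Mirror the create() flow: acquire a slot (or skip), do work, always release
    if not provider.acquire_slot(timeout=1.0):
        print(f"request {request_id}: {provider.name} at max capacity, would try next provider")
        return
    try:
        time.sleep(0.2)  # simulate a slow backend call
        print(f"request {request_id}: served by {provider.name}")
    finally:
        provider.release_slot()


if __name__ == "__main__":
    slow = _ToyProvider("slow_server", max_concurrent=1)
    threads = [threading.Thread(target=worker, args=(slow, i)) for i in range(3)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
```

With `max_concurrent=1`, the three requests are served one at a time; a request that cannot get a slot within the timeout is skipped, which in the real `create()` loop means falling through to the next provider.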

optillm/plugins/proxy/config.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -172,6 +172,8 @@ def _apply_defaults(config: Dict) -> Dict:
         provider.setdefault('weight', 1)
         provider.setdefault('fallback_only', False)
         provider.setdefault('model_map', {})
+        # Per-provider concurrency limit (None means no limit)
+        provider.setdefault('max_concurrent', None)
 
     return config
 
@@ -200,6 +202,12 @@ def _validate_config(config: Dict) -> Dict:
         if provider['weight'] <= 0:
             logger.warning(f"Provider {provider['name']} has invalid weight {provider['weight']}, setting to 1")
             provider['weight'] = 1
+
+        # Validate max_concurrent if specified
+        if provider.get('max_concurrent') is not None:
+            if not isinstance(provider['max_concurrent'], int) or provider['max_concurrent'] <= 0:
+                logger.warning(f"Provider {provider['name']} has invalid max_concurrent {provider['max_concurrent']}, removing limit")
+                provider['max_concurrent'] = None
 
     # Validate routing strategy
     valid_strategies = ['weighted', 'round_robin', 'failover']
```
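To illustrate the validation added in `_validate_config` above, a config fragment like the following (provider names and URLs are hypothetical) would keep the first limit unchanged, while the second would be reset to no limit with a warning because it is not a positive integer:

```yaml
providers:
  - name: local_gpu            # hypothetical provider
    base_url: http://localhost:8080/v1
    api_key: dummy
    max_concurrent: 4          # valid: positive integer, kept as-is

  - name: misconfigured        # hypothetical provider
    base_url: http://localhost:8081/v1
    api_key: dummy
    max_concurrent: -1         # invalid: reset to None (no limit) with a warning
```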
optillm/plugins/proxy/google_client.py

Lines changed: 92 additions & 0 deletions

```diff
@@ -0,0 +1,92 @@
+"""
+Custom Google AI client that doesn't add "models/" prefix to model names
+"""
+import requests
+import json
+from typing import Dict, List, Any
+
+
+class GoogleAIClient:
+    """Custom client for Google AI that bypasses OpenAI client's model name prefix behavior"""
+
+    def __init__(self, api_key: str, base_url: str):
+        self.api_key = api_key
+        self.base_url = base_url.rstrip('/')
+        self.chat = self.Chat(self)
+        self.models = self.Models(self)
+
+    class Chat:
+        def __init__(self, client):
+            self.client = client
+            self.completions = self.Completions(client)
+
+        class Completions:
+            def __init__(self, client):
+                self.client = client
+
+            def create(self, model: str, messages: List[Dict[str, str]], **kwargs) -> Any:
+                """Create chat completion without adding models/ prefix to model name"""
+                url = f"{self.client.base_url}/chat/completions"
+
+                headers = {
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {self.client.api_key}"
+                }
+
+                # Build request data - use model name directly without "models/" prefix
+                data = {
+                    "model": model,  # Use exactly as provided - no prefix!
+                    "messages": messages,
+                    **kwargs
+                }
+
+                # Make direct HTTP request to bypass OpenAI client behavior
+                response = requests.post(url, headers=headers, json=data, timeout=kwargs.get('timeout', 30))
+
+                if response.status_code != 200:
+                    error_text = response.text
+                    raise Exception(f"HTTP {response.status_code}: {error_text}")
+
+                # Parse response and return OpenAI-compatible object
+                result = response.json()
+
+                # Create a simple object that has the attributes expected by the proxy
+                class CompletionResponse:
+                    def __init__(self, data):
+                        self._data = data
+                        self.choices = data.get('choices', [])
+                        self.usage = data.get('usage', {})
+                        self.model = data.get('model', model)
+
+                    def model_dump(self):
+                        return self._data
+
+                    def __getitem__(self, key):
+                        return self._data[key]
+
+                    def get(self, key, default=None):
+                        return self._data.get(key, default)
+
+                return CompletionResponse(result)
+
+    class Models:
+        def __init__(self, client):
+            self.client = client
+
+        def list(self):
+            """Simple models list for health checking"""
+            url = f"{self.client.base_url}/models"
+            headers = {
+                "Authorization": f"Bearer {self.client.api_key}"
+            }
+
+            try:
+                response = requests.get(url, headers=headers, timeout=5)
+                if response.status_code == 200:
+                    return response.json()
+                else:
+                    # Return a mock response if health check fails
+                    return {"data": [{"id": "gemma-3-4b-it"}]}
+            except:
+                # Return a mock response if health check fails
+                return {"data": [{"id": "gemma-3-4b-it"}]}
```

pyproject.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "optillm"
-version = "0.2.6"
+version = "0.2.7"
 description = "An optimizing inference proxy for LLMs."
 readme = "README.md"
 license = "Apache-2.0"
```
