Skip to content

Commit 2f99f9b

Browse files
authored
Merge pull request #244 from codelion/fix-proxy-timeout
Fix proxy timeout
2 parents 1d77d3e + e5a0dcc commit 2f99f9b

File tree

6 files changed

+277
-68
lines changed

6 files changed

+277
-68
lines changed

optillm/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Version information
2-
__version__ = "0.2.5"
2+
__version__ = "0.2.6"
33

44
# Import from server module
55
from .server import (

optillm/plugins/proxy/README.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,14 @@ providers:
4343

4444
routing:
4545
strategy: weighted # Options: weighted, round_robin, failover
46+
47+
timeouts:
48+
request: 30 # Maximum seconds to wait for a provider response
49+
connect: 5 # Maximum seconds to wait for connection
50+
51+
queue:
52+
max_concurrent: 100 # Maximum concurrent requests to prevent overload
53+
timeout: 60 # Maximum seconds a request can wait in queue
4654
```
4755
4856
### 2. Start OptiLLM Server
@@ -161,6 +169,26 @@ routing:
161169
timeout: 5 # Timeout for health check requests
162170
```
163171
172+
### Timeout and Queue Management
173+
174+
Prevent request queue buildup and cope with slow or unresponsive backends:
175+
176+
```yaml
177+
timeouts:
178+
request: 30 # Maximum seconds to wait for provider response (default: 30)
179+
connect: 5 # Maximum seconds for initial connection (default: 5)
180+
181+
queue:
182+
max_concurrent: 100 # Maximum concurrent requests (default: 100)
183+
timeout: 60 # Maximum seconds in queue before rejection (default: 60)
184+
```
185+
186+
**How it works:**
187+
- **Request Timeout**: Each request to a provider has a maximum time limit. If exceeded, the request is cancelled and the next provider is tried.
188+
- **Queue Management**: Limits the number of concurrent requests to prevent memory exhaustion. When the limit is reached, new requests wait up to `queue.timeout` seconds for a free slot before being rejected.
189+
- **Automatic Failover**: When a provider times out, it is marked unhealthy (when `monitoring.track_errors` is enabled) and the request automatically fails over to the next available provider.
190+
- **Protection**: Prevents slow backends from causing queue buildup that can crash the proxy server.
191+
164192
### Environment Variables
165193

166194
The configuration supports flexible environment variable interpolation:

optillm/plugins/proxy/client.py

Lines changed: 116 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import logging
66
import random
77
from typing import Dict, List, Any, Optional
8+
import concurrent.futures
9+
import threading
810
from openai import OpenAI, AzureOpenAI
911
from optillm.plugins.proxy.routing import RouterFactory
1012
from optillm.plugins.proxy.health import HealthChecker
@@ -34,13 +36,15 @@ def client(self):
3436
self._client = AzureOpenAI(
3537
api_key=self.api_key,
3638
azure_endpoint=self.base_url,
37-
api_version="2024-02-01"
39+
api_version="2024-02-01",
40+
max_retries=0 # Disable client retries - we handle them
3841
)
3942
else:
4043
# Standard OpenAI-compatible client
4144
self._client = OpenAI(
4245
api_key=self.api_key,
43-
base_url=self.base_url
46+
base_url=self.base_url,
47+
max_retries=0 # Disable client retries - we handle them
4448
)
4549
return self._client
4650

@@ -97,6 +101,17 @@ def __init__(self, config: Dict, fallback_client=None):
97101
# Start health checking
98102
self.health_checker.start()
99103

104+
# Timeout settings
105+
timeout_config = config.get('timeouts', {})
106+
self.request_timeout = timeout_config.get('request', 30) # Default 30 seconds
107+
self.connect_timeout = timeout_config.get('connect', 5) # Default 5 seconds
108+
109+
# Queue management settings
110+
queue_config = config.get('queue', {})
111+
self.max_concurrent_requests = queue_config.get('max_concurrent', 100)
112+
self.queue_timeout = queue_config.get('timeout', 60) # Max time in queue
113+
self._request_semaphore = threading.Semaphore(self.max_concurrent_requests)
114+
100115
# Monitoring settings
101116
monitoring = config.get('monitoring', {})
102117
self.track_latency = monitoring.get('track_latency', True)
@@ -123,74 +138,109 @@ def _filter_kwargs(self, kwargs: dict) -> dict:
123138
}
124139
return {k: v for k, v in kwargs.items() if k not in optillm_params}
125140

141+
def _make_request_with_timeout(self, provider, request_kwargs):
142+
"""Make a request with timeout handling"""
143+
# The OpenAI client now supports timeout natively
144+
try:
145+
response = provider.client.chat.completions.create(**request_kwargs)
146+
return response
147+
except Exception as e:
148+
# Check if it's a timeout error
149+
if "timeout" in str(e).lower() or "timed out" in str(e).lower():
150+
raise TimeoutError(f"Request to {provider.name} timed out after {self.proxy_client.request_timeout}s")
151+
raise e
152+
126153
def create(self, **kwargs):
127-
"""Create completion with load balancing and failover"""
128-
model = kwargs.get('model', 'unknown')
129-
attempted_providers = set()
130-
errors = []
131-
132-
# Get healthy providers
133-
healthy_providers = [
134-
p for p in self.proxy_client.active_providers
135-
if p.is_healthy
136-
]
154+
"""Create completion with load balancing, failover, and timeout handling"""
155+
# Check queue capacity
156+
if not self.proxy_client._request_semaphore.acquire(blocking=True, timeout=self.proxy_client.queue_timeout):
157+
raise TimeoutError(f"Request queue timeout after {self.proxy_client.queue_timeout}s - server overloaded")
137158

138-
if not healthy_providers:
139-
logger.warning("No healthy providers, trying fallback providers")
140-
healthy_providers = self.proxy_client.fallback_providers
141-
142-
# Try routing through healthy providers
143-
while healthy_providers:
144-
available_providers = [p for p in healthy_providers if p not in attempted_providers]
145-
if not available_providers:
146-
break
147-
148-
provider = self.proxy_client.router.select(available_providers)
149-
logger.info(f"Router selected provider: {provider.name if provider else 'None'}")
159+
try:
160+
model = kwargs.get('model', 'unknown')
161+
attempted_providers = set()
162+
errors = []
150163

151-
if not provider:
152-
break
153-
154-
attempted_providers.add(provider)
164+
# Get healthy providers
165+
healthy_providers = [
166+
p for p in self.proxy_client.active_providers
167+
if p.is_healthy
168+
]
155169

156-
try:
157-
# Map model name if needed and filter out OptiLLM-specific parameters
158-
request_kwargs = self._filter_kwargs(kwargs.copy())
159-
request_kwargs['model'] = provider.map_model(model)
160-
161-
# Track timing
162-
start_time = time.time()
163-
164-
# Make request
165-
logger.debug(f"Routing to {provider.name}")
166-
response = provider.client.chat.completions.create(**request_kwargs)
167-
168-
# Track success
169-
latency = time.time() - start_time
170-
if self.proxy_client.track_latency:
171-
provider.track_latency(latency)
172-
173-
logger.info(f"Request succeeded via {provider.name} in {latency:.2f}s")
174-
return response
170+
if not healthy_providers:
171+
logger.warning("No healthy providers, trying fallback providers")
172+
healthy_providers = self.proxy_client.fallback_providers
173+
174+
# Try routing through healthy providers
175+
while healthy_providers:
176+
available_providers = [p for p in healthy_providers if p not in attempted_providers]
177+
if not available_providers:
178+
break
179+
180+
provider = self.proxy_client.router.select(available_providers)
181+
logger.info(f"Router selected provider: {provider.name if provider else 'None'}")
175182

176-
except Exception as e:
177-
logger.error(f"Provider {provider.name} failed: {e}")
178-
errors.append((provider.name, str(e)))
183+
if not provider:
184+
break
185+
186+
attempted_providers.add(provider)
179187

180-
# Mark provider as unhealthy
181-
if self.proxy_client.track_errors:
182-
provider.is_healthy = False
183-
provider.last_error = str(e)
188+
try:
189+
# Map model name if needed and filter out OptiLLM-specific parameters
190+
request_kwargs = self._filter_kwargs(kwargs.copy())
191+
request_kwargs['model'] = provider.map_model(model)
192+
193+
# Add timeout to client if supported
194+
request_kwargs['timeout'] = self.proxy_client.request_timeout
195+
196+
# Track timing
197+
start_time = time.time()
198+
199+
# Make request with timeout
200+
logger.debug(f"Routing to {provider.name} with {self.proxy_client.request_timeout}s timeout")
201+
response = self._make_request_with_timeout(provider, request_kwargs)
202+
203+
# Track success
204+
latency = time.time() - start_time
205+
if self.proxy_client.track_latency:
206+
provider.track_latency(latency)
207+
208+
logger.info(f"Request succeeded via {provider.name} in {latency:.2f}s")
209+
return response
210+
211+
except TimeoutError as e:
212+
logger.error(f"Provider {provider.name} timed out: {e}")
213+
errors.append((provider.name, str(e)))
214+
215+
# Mark provider as unhealthy on timeout
216+
if self.proxy_client.track_errors:
217+
provider.is_healthy = False
218+
provider.last_error = f"Timeout: {str(e)}"
219+
220+
except Exception as e:
221+
logger.error(f"Provider {provider.name} failed: {e}")
222+
errors.append((provider.name, str(e)))
223+
224+
# Mark provider as unhealthy
225+
if self.proxy_client.track_errors:
226+
provider.is_healthy = False
227+
provider.last_error = str(e)
184228

185-
# All providers failed, try fallback client
186-
if self.proxy_client.fallback_client:
187-
logger.warning("All proxy providers failed, using fallback client")
188-
try:
189-
return self.proxy_client.fallback_client.chat.completions.create(**self._filter_kwargs(kwargs))
190-
except Exception as e:
191-
errors.append(("fallback_client", str(e)))
192-
193-
# Complete failure
194-
error_msg = f"All providers failed. Errors: {errors}"
195-
logger.error(error_msg)
196-
raise Exception(error_msg)
229+
# All providers failed, try fallback client
230+
if self.proxy_client.fallback_client:
231+
logger.warning("All proxy providers failed, using fallback client")
232+
try:
233+
fallback_kwargs = self._filter_kwargs(kwargs)
234+
fallback_kwargs['timeout'] = self.proxy_client.request_timeout
235+
return self.proxy_client.fallback_client.chat.completions.create(**fallback_kwargs)
236+
except Exception as e:
237+
errors.append(("fallback_client", str(e)))
238+
239+
# Complete failure
240+
error_msg = f"All providers failed. Errors: {errors}"
241+
logger.error(error_msg)
242+
raise Exception(error_msg)
243+
244+
finally:
245+
# Release semaphore to allow next request
246+
self.proxy_client._request_semaphore.release()

optillm/plugins/proxy/config.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,8 @@ def _apply_defaults(config: Dict) -> Dict:
137137
config.setdefault('providers', [])
138138
config.setdefault('routing', {})
139139
config.setdefault('monitoring', {})
140+
config.setdefault('timeouts', {})
141+
config.setdefault('queue', {})
140142

141143
# Routing defaults
142144
routing = config['routing']
@@ -154,6 +156,16 @@ def _apply_defaults(config: Dict) -> Dict:
154156
monitoring.setdefault('track_latency', True)
155157
monitoring.setdefault('track_errors', True)
156158

159+
# Timeout defaults
160+
timeouts = config['timeouts']
161+
timeouts.setdefault('request', 30) # 30 seconds for requests
162+
timeouts.setdefault('connect', 5) # 5 seconds for connection
163+
164+
# Queue management defaults
165+
queue = config['queue']
166+
queue.setdefault('max_concurrent', 100) # Max concurrent requests
167+
queue.setdefault('timeout', 60) # Max time waiting in queue
168+
157169
# Provider defaults
158170
for i, provider in enumerate(config['providers']):
159171
provider.setdefault('name', f"provider_{i}")
@@ -224,6 +236,14 @@ def _create_default(path: Path):
224236
interval: 30 # seconds
225237
timeout: 5 # seconds
226238
239+
timeouts:
240+
request: 30 # Maximum time for a request (seconds)
241+
connect: 5 # Maximum time for connection (seconds)
242+
243+
queue:
244+
max_concurrent: 100 # Maximum concurrent requests
245+
timeout: 60 # Maximum time in queue (seconds)
246+
227247
monitoring:
228248
log_level: INFO
229249
track_latency: true
@@ -244,6 +264,14 @@ def _get_minimal_config() -> Dict:
244264
'strategy': 'round_robin',
245265
'health_check': {'enabled': False}
246266
},
267+
'timeouts': {
268+
'request': 30,
269+
'connect': 5
270+
},
271+
'queue': {
272+
'max_concurrent': 100,
273+
'timeout': 60
274+
},
247275
'monitoring': {
248276
'log_level': 'INFO',
249277
'track_latency': False,

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "optillm"
7-
version = "0.2.5"
7+
version = "0.2.6"
88
description = "An optimizing inference proxy for LLMs."
99
readme = "README.md"
1010
license = "Apache-2.0"

0 commit comments

Comments
 (0)