Skip to content

Commit b4862a0

Browse files
committed
feat: add cache all and limit cache point in AnthropicModel
1 parent 1b576dd commit b4862a0

File tree

3 files changed

+309
-4
lines changed

3 files changed

+309
-4
lines changed

docs/models/anthropic.md

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,18 +80,29 @@ agent = Agent(model)
8080

8181
## Prompt Caching
8282

83-
Anthropic supports [prompt caching](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching) to reduce costs by caching parts of your prompts. Pydantic AI provides three ways to use prompt caching:
83+
Anthropic supports [prompt caching](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching) to reduce costs by caching parts of your prompts. Pydantic AI provides four ways to use prompt caching:
8484

8585
1. **Cache User Messages with [`CachePoint`][pydantic_ai.messages.CachePoint]**: Insert a `CachePoint` marker in your user messages to cache everything before it
8686
2. **Cache System Instructions**: Set [`AnthropicModelSettings.anthropic_cache_instructions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_instructions] to `True` (uses 5m TTL by default) or specify `'5m'` / `'1h'` directly
8787
3. **Cache Tool Definitions**: Set [`AnthropicModelSettings.anthropic_cache_tool_definitions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_tool_definitions] to `True` (uses 5m TTL by default) or specify `'5m'` / `'1h'` directly
88+
4. **Cache All (Convenience)**: Set [`AnthropicModelSettings.anthropic_cache_all`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_all] to `True` to automatically cache the system instructions, the tool definitions (when tools are present), and the last user message
8889

89-
You can combine all three strategies for maximum savings:
90+
You can combine multiple strategies for maximum savings:
9091

9192
```python {test="skip"}
9293
from pydantic_ai import Agent, CachePoint, RunContext
9394
from pydantic_ai.models.anthropic import AnthropicModelSettings
9495

96+
# Option 1: Use anthropic_cache_all for convenience (caches system + last message)
97+
agent = Agent(
98+
'anthropic:claude-sonnet-4-5',
99+
system_prompt='Detailed instructions...',
100+
model_settings=AnthropicModelSettings(
101+
anthropic_cache_all=True, # Caches both system prompt and last message
102+
),
103+
)
104+
105+
# Option 2: Fine-grained control with individual settings
95106
agent = Agent(
96107
'anthropic:claude-sonnet-4-5',
97108
system_prompt='Detailed instructions...',
@@ -145,3 +156,37 @@ async def main():
145156
print(f'Cache write tokens: {usage.cache_write_tokens}')
146157
print(f'Cache read tokens: {usage.cache_read_tokens}')
147158
```
159+
160+
### Cache Point Limits
161+
162+
Anthropic enforces a maximum of 4 cache points per request. Pydantic AI automatically manages this limit:
163+
164+
- **`anthropic_cache_all`**: Uses up to 3 cache points (system instructions + last message, plus tool definitions when tools are present)
165+
- **`anthropic_cache_instructions`**: Uses 1 cache point
166+
- **`anthropic_cache_tool_definitions`**: Uses 1 cache point
167+
- **`CachePoint` markers**: Use remaining available cache points
168+
169+
When the total exceeds 4 cache points, Pydantic AI automatically removes cache points from **older messages** (keeping the most recent ones), ensuring your requests always comply with Anthropic's limits without errors.
170+
171+
```python {test="skip"}
172+
from pydantic_ai import Agent, CachePoint
173+
from pydantic_ai.models.anthropic import AnthropicModelSettings
174+
175+
agent = Agent(
176+
'anthropic:claude-sonnet-4-5',
177+
system_prompt='Instructions...',
178+
model_settings=AnthropicModelSettings(
179+
anthropic_cache_all=True, # Uses 2 cache points
180+
),
181+
)
182+
183+
async def main():
184+
# Even with multiple CachePoint markers, only 2 more will be kept
185+
# (4 total limit - 2 from cache_all = 2 available)
186+
result = await agent.run([
187+
'Context 1', CachePoint(), # Will be kept
188+
'Context 2', CachePoint(), # Will be kept
189+
'Context 3', CachePoint(), # Automatically removed (oldest)
190+
'Question'
191+
])
192+
```

pydantic_ai_slim/pydantic_ai/models/anthropic.py

Lines changed: 102 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,22 @@ class AnthropicModelSettings(ModelSettings, total=False):
169169
See https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching for more information.
170170
"""
171171

172+
anthropic_cache_all: bool | Literal['5m', '1h']
173+
"""Convenience setting to enable caching for both system instructions and the last user message.
174+
175+
When enabled, this automatically adds cache points to:
176+
1. The last system prompt block (system instructions)
177+
2. The last content block in the final user message
178+
179+
This is equivalent to setting both `anthropic_cache_instructions` and adding a cache point
180+
to the last message, but more convenient for common use cases.
181+
If `True`, uses TTL='5m'. You can also specify '5m' or '1h' directly.
182+
183+
Note: Uses 2 of Anthropic's 4 available cache points per request. Any additional CachePoint
184+
markers in messages will be automatically limited to respect the 4-cache-point maximum.
185+
See https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching for more information.
186+
"""
187+
172188

173189
@dataclass(init=False)
174190
class AnthropicModel(Model):
@@ -478,7 +494,10 @@ def _get_tools(
478494
]
479495

480496
# Add cache_control to the last tool if enabled
481-
if tools and (cache_tool_defs := model_settings.get('anthropic_cache_tool_definitions')):
497+
if tools and (
498+
cache_tool_defs := model_settings.get('anthropic_cache_tool_definitions')
499+
or model_settings.get('anthropic_cache_all')
500+
):
482501
# If True, use '5m'; otherwise use the specified ttl value
483502
ttl: Literal['5m', '1h'] = '5m' if cache_tool_defs is True else cache_tool_defs
484503
last_tool = tools[-1]
@@ -747,8 +766,32 @@ async def _map_message( # noqa: C901
747766
system_prompt_parts.insert(0, instructions)
748767
system_prompt = '\n\n'.join(system_prompt_parts)
749768

769+
# Add cache_control to the last message content if anthropic_cache_all is enabled
770+
if anthropic_messages and (cache_all := model_settings.get('anthropic_cache_all')):
771+
ttl: Literal['5m', '1h'] = '5m' if cache_all is True else cache_all
772+
m = anthropic_messages[-1]
773+
content = m['content']
774+
if isinstance(content, str):
775+
# Convert string content to list format with cache_control
776+
m['content'] = [
777+
{
778+
'text': content,
779+
'type': 'text',
780+
'cache_control': BetaCacheControlEphemeralParam(type='ephemeral', ttl=ttl),
781+
}
782+
]
783+
else:
784+
# Add cache_control to the last content block
785+
content = cast(list[BetaContentBlockParam], content)
786+
self._add_cache_control_to_last_param(content, ttl)
787+
788+
# Ensure total cache points don't exceed Anthropic's limit of 4
789+
self._limit_cache_points(anthropic_messages, model_settings)
750790
# If anthropic_cache_instructions is enabled, return system prompt as a list with cache_control
751-
if system_prompt and (cache_instructions := model_settings.get('anthropic_cache_instructions')):
791+
if system_prompt and (
792+
cache_instructions := model_settings.get('anthropic_cache_instructions')
793+
or model_settings.get('anthropic_cache_all')
794+
):
752795
# If True, use '5m'; otherwise use the specified ttl value
753796
ttl: Literal['5m', '1h'] = '5m' if cache_instructions is True else cache_instructions
754797
system_prompt_blocks = [
@@ -762,6 +805,63 @@ async def _map_message( # noqa: C901
762805

763806
return system_prompt, anthropic_messages
764807

808+
@staticmethod
def _limit_cache_points(messages: list[BetaMessageParam], model_settings: AnthropicModelSettings) -> None:
    """Strip excess `cache_control` markers so the request honors Anthropic's cache-point limit.

    Anthropic allows at most 4 cache breakpoints per request. Breakpoints claimed by
    request-level settings (`anthropic_cache_all`, `anthropic_cache_instructions`,
    `anthropic_cache_tool_definitions`) are reserved first; whatever budget remains is
    granted to message-level cache points, newest first, and any `cache_control` entries
    beyond that budget are deleted in place from the older blocks.

    Args:
        messages: Message parameters to scan; mutated in place.
        model_settings: Model settings describing which request-level caches are enabled.
    """
    # Anthropic rejects requests carrying more than 4 cache breakpoints.
    budget = 4

    # Reserve the breakpoints consumed by request-level settings before handing
    # the remainder to message content.
    if model_settings.get('anthropic_cache_all'):
        # NOTE(review): `anthropic_cache_all` marks the system prompt and the last
        # message (2 points), and it also enables tool-definition caching in
        # `_get_tools`, which this reservation does not count — confirm the
        # combined total cannot exceed the limit when tools are present.
        budget -= 2
    else:
        if model_settings.get('anthropic_cache_instructions'):
            budget -= 1
        if model_settings.get('anthropic_cache_tool_definitions'):
            # Reserved even if no tools end up in the request (conservative).
            budget -= 1

    # Walk messages newest-to-oldest, and blocks newest-first within each message,
    # keeping cache points while the budget lasts and stripping the rest.
    for message in reversed(messages):
        blocks = message['content']
        if isinstance(blocks, str):
            # Plain-string content carries no cache_control blocks.
            continue
        for raw_block in reversed(cast(list[BetaContentBlockParam], blocks)):
            # TypedDict blocks are plain dicts at runtime; mutate directly.
            block = cast(dict[str, Any], raw_block)
            if 'cache_control' not in block:
                continue
            if budget > 0:
                budget -= 1
            else:
                # Over the limit: drop the cache point from this older block.
                del block['cache_control']
864+
765865
@staticmethod
766866
def _add_cache_control_to_last_param(params: list[BetaContentBlockParam], ttl: Literal['5m', '1h'] = '5m') -> None:
767867
"""Add cache control to the last content block param.

tests/models/test_anthropic.py

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -588,6 +588,166 @@ def my_tool(value: str) -> str: # pragma: no cover
588588
assert system[0]['cache_control'] == snapshot({'type': 'ephemeral', 'ttl': '5m'})
589589

590590

591+
async def test_anthropic_cache_all(allow_model_requests: None):
    """anthropic_cache_all should add cache_control to both the system prompt and the last message."""
    response = completion_message(
        [BetaTextBlock(text='Response', type='text')],
        usage=BetaUsage(input_tokens=10, output_tokens=5),
    )
    client = MockAnthropic.create_mock(response)
    model = AnthropicModel('claude-haiku-4-5', provider=AnthropicProvider(anthropic_client=client))
    agent = Agent(
        model,
        system_prompt='System instructions to cache.',
        model_settings=AnthropicModelSettings(anthropic_cache_all=True),
    )

    await agent.run('User message')

    kwargs = get_mock_chat_completion_kwargs(client)[0]

    # The system prompt is sent as a single cached text block with the default 5m TTL.
    assert kwargs['system'] == snapshot(
        [{'type': 'text', 'text': 'System instructions to cache.', 'cache_control': {'type': 'ephemeral', 'ttl': '5m'}}]
    )

    # The final content block of the last message is cached as well.
    assert kwargs['messages'][-1]['content'][-1] == snapshot(
        {'type': 'text', 'text': 'User message', 'cache_control': {'type': 'ephemeral', 'ttl': '5m'}}
    )
623+
624+
625+
async def test_anthropic_cache_all_with_custom_ttl(allow_model_requests: None):
    """anthropic_cache_all should accept an explicit TTL value instead of True."""
    response = completion_message(
        [BetaTextBlock(text='Response', type='text')],
        usage=BetaUsage(input_tokens=10, output_tokens=5),
    )
    client = MockAnthropic.create_mock(response)
    model = AnthropicModel('claude-haiku-4-5', provider=AnthropicProvider(anthropic_client=client))
    agent = Agent(
        model,
        system_prompt='System instructions.',
        # '1h' should propagate to every cache point cache_all creates.
        model_settings=AnthropicModelSettings(anthropic_cache_all='1h'),
    )

    await agent.run('User message')

    kwargs = get_mock_chat_completion_kwargs(client)[0]

    # Both the system prompt and the last message block carry the 1h TTL.
    assert kwargs['system'][0]['cache_control'] == snapshot({'type': 'ephemeral', 'ttl': '1h'})
    assert kwargs['messages'][-1]['content'][-1]['cache_control'] == snapshot({'type': 'ephemeral', 'ttl': '1h'})
650+
651+
652+
async def test_limit_cache_points_with_cache_all(allow_model_requests: None):
653+
"""Test that cache points are limited when using cache_all + CachePoint markers."""
654+
c = completion_message(
655+
[BetaTextBlock(text='Response', type='text')],
656+
usage=BetaUsage(input_tokens=10, output_tokens=5),
657+
)
658+
mock_client = MockAnthropic.create_mock(c)
659+
m = AnthropicModel('claude-haiku-4-5', provider=AnthropicProvider(anthropic_client=mock_client))
660+
agent = Agent(
661+
m,
662+
system_prompt='System instructions.',
663+
model_settings=AnthropicModelSettings(
664+
anthropic_cache_all=True, # Uses 2 cache points
665+
),
666+
)
667+
668+
# Add 3 CachePoint markers (total would be 5: 2 from cache_all + 3 from markers)
669+
# Only 2 CachePoint markers should be kept (newest ones)
670+
await agent.run(
671+
[
672+
'Context 1',
673+
CachePoint(), # Oldest, should be removed
674+
'Context 2',
675+
CachePoint(), # Should be kept
676+
'Context 3',
677+
CachePoint(), # Should be kept
678+
'Question',
679+
]
680+
)
681+
682+
completion_kwargs = get_mock_chat_completion_kwargs(mock_client)[0]
683+
messages = completion_kwargs['messages']
684+
685+
# Count cache_control occurrences in messages
686+
cache_count = 0
687+
for msg in messages:
688+
for block in msg['content']:
689+
if 'cache_control' in block:
690+
cache_count += 1
691+
692+
# anthropic_cache_all uses 2 cache points (system + last message)
693+
# With 3 CachePoint markers, we'd have 5 total
694+
# Limit is 4, so 1 oldest CachePoint should be removed
695+
# Result: 2 cache points in messages (from the 2 newest CachePoints)
696+
# The cache_all's last message cache is applied after limiting
697+
assert cache_count == 2
698+
699+
700+
async def test_limit_cache_points_all_settings(allow_model_requests: None):
    """Cache-point limiting accounts for instruction and tool-definition caching."""
    response = completion_message(
        [BetaTextBlock(text='Response', type='text')],
        usage=BetaUsage(input_tokens=10, output_tokens=5),
    )
    client = MockAnthropic.create_mock(response)
    model = AnthropicModel('claude-haiku-4-5', provider=AnthropicProvider(anthropic_client=client))

    agent = Agent(
        model,
        system_prompt='System instructions.',
        model_settings=AnthropicModelSettings(
            anthropic_cache_instructions=True,  # 1 cache point
            anthropic_cache_tool_definitions=True,  # 1 cache point
        ),
    )

    @agent.tool_plain
    def my_tool() -> str:  # pragma: no cover
        return 'result'

    # Settings reserve 2 cache points, so of the 3 CachePoint markers only the
    # 2 newest may keep their cache_control.
    await agent.run(
        [
            'Context 1',
            CachePoint(),  # oldest — stripped by the limiter
            'Context 2',
            CachePoint(),  # kept
            'Context 3',
            CachePoint(),  # kept
            'Question',
        ]
    )

    kwargs = get_mock_chat_completion_kwargs(client)[0]

    # Count surviving cache_control entries in message content only (system and
    # tool cache points live outside `messages`).
    cache_count = sum(
        1 for msg in kwargs['messages'] for block in msg['content'] if 'cache_control' in block
    )

    # 4 total - 1 (instructions) - 1 (tool definitions) = 2 left for messages.
    assert cache_count == 2
749+
750+
591751
async def test_async_request_text_response(allow_model_requests: None):
592752
c = completion_message(
593753
[BetaTextBlock(text='world', type='text')],

0 commit comments

Comments
 (0)