Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions pkg/epp/config/loader/configloader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ func TestLoadRawConfiguration(t *testing.T) {
},
{
Type: test2Type,
Parameters: json.RawMessage("{\"hashBlockSize\":32}"),
Parameters: json.RawMessage("{\"blockSize\":32}"),
},
{
Name: "testPicker",
Expand Down Expand Up @@ -175,7 +175,7 @@ func TestLoadRawConfigurationWithDefaults(t *testing.T) {
{
Name: test2Type,
Type: test2Type,
Parameters: json.RawMessage("{\"hashBlockSize\":32}"),
Parameters: json.RawMessage("{\"blockSize\":32}"),
},
{
Name: "testPicker",
Expand Down Expand Up @@ -464,7 +464,7 @@ plugins:
type: test-profile-handler
- type: test-two
parameters:
hashBlockSize: 32
blockSize: 32
- name: testPicker
type: test-picker
schedulingProfiles:
Expand Down Expand Up @@ -767,7 +767,7 @@ plugins:
- name: prefixCacheScorer
type: prefix-cache-scorer
parameters:
hashBlockSize: 32
blockSize: 32
- name: maxScorePicker
type: max-score-picker
- name: profileHandler
Expand All @@ -792,7 +792,7 @@ plugins:
- name: prefixCacheScorer
type: prefix-cache-scorer
parameters:
hashBlockSize: 32
blockSize: 32
schedulingProfiles:
- name: default
plugins:
Expand Down Expand Up @@ -826,7 +826,7 @@ plugins:
- name: prefixCacheScorer
type: prefix-cache-scorer
parameters:
hashBlockSize: asdf
blockSize: asdf
schedulingProfiles:
- name: default
plugins:
Expand Down
14 changes: 7 additions & 7 deletions pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ import (

const (
// vLLM default token block size is 16, and a good guess of average characters per token is 4.
DefaultHashBlockSize = 64
DefaultBlockSize = 64
// The maximum number of blocks to match. Two long requests with the same prefix up to this
// limit will be indistinguishable.
// This parameter provides a trade-off between cache size, prefix matching speed and matching
Expand All @@ -58,15 +58,15 @@ const (
)

var DefaultConfig = Config{
HashBlockSize: DefaultHashBlockSize,
BlockSize: DefaultBlockSize,
MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
LRUCapacityPerServer: DefaultLRUCapacityPerServer,
}

type Config struct {
// The input prompt is broken into sizes of HashBlockSize to calculate block hashes . Requests
// The input prompt is broken into blocks of size BlockSize to calculate block hashes. Requests
// with length shorter than the block size will be ignored.
HashBlockSize int `json:"hashBlockSize"`
BlockSize int `json:"blockSize"`
// MaxPrefixBlocksToMatch is the maximum number of prefix blocks to match. Input beyond this limit will
// be ignored.
MaxPrefixBlocksToMatch int `json:"maxPrefixBlocksToMatch"`
Expand Down Expand Up @@ -133,7 +133,7 @@ var (
// PrefixCachePluginFactory defines the factory function for Prefix plugin.
func PrefixCachePluginFactory(name string, rawParameters json.RawMessage, handle plugins.Handle) (plugins.Plugin, error) {
parameters := Config{
HashBlockSize: DefaultHashBlockSize,
BlockSize: DefaultBlockSize,
MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
LRUCapacityPerServer: DefaultLRUCapacityPerServer,
}
Expand Down Expand Up @@ -180,7 +180,7 @@ func (p *Plugin) WithName(name string) *Plugin {
// Score returns the scoring result for the given list of pods based on context.
func (p *Plugin) Score(ctx context.Context, cycleState *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
// pre score step, hashing prompt and find longest prefix match.
hashes := hashPrompt(ctx, request, p.config.HashBlockSize, p.config.MaxPrefixBlocksToMatch)
hashes := hashPrompt(ctx, request, p.config.BlockSize, p.config.MaxPrefixBlocksToMatch)
state := &SchedulingContextState{
PrefixHashes: hashes,
PrefixCacheServers: p.matchLongestPrefix(ctx, hashes),
Expand Down Expand Up @@ -231,7 +231,7 @@ func (p *Plugin) PreRequest(ctx context.Context, request *types.LLMRequest, sche

total := len(state.PrefixHashes)
matchLen := state.PrefixCacheServers[ServerID(targetPod.NamespacedName)]
metrics.RecordPrefixCacheMatch(matchLen*p.config.HashBlockSize, total*p.config.HashBlockSize)
metrics.RecordPrefixCacheMatch(matchLen*p.config.BlockSize, total*p.config.BlockSize)
}

// matchLongestPrefix returns a map of servers and length of prefix that each server caches.
Expand Down
10 changes: 5 additions & 5 deletions pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ import (

func TestPrefixPluginCompletion(t *testing.T) {
config := Config{
HashBlockSize: 4,
BlockSize: 4,
MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
LRUCapacityPerServer: DefaultLRUCapacityPerServer,
}
Expand Down Expand Up @@ -201,7 +201,7 @@ func TestPrefixPluginCompletion(t *testing.T) {

func TestPrefixPluginChatCompletions(t *testing.T) {
config := Config{
HashBlockSize: 4,
BlockSize: 4,
MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
LRUCapacityPerServer: DefaultLRUCapacityPerServer,
}
Expand Down Expand Up @@ -235,7 +235,7 @@ func TestPrefixPluginChatCompletions(t *testing.T) {

func TestPrefixPluginChatCompletionsGrowth(t *testing.T) {
config := Config{
HashBlockSize: 8, // Use larger block size for more predictable JSON marshaling
BlockSize: 8, // Use larger block size for more predictable JSON marshaling
MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks,
LRUCapacityPerServer: DefaultLRUCapacityPerServer,
}
Expand Down Expand Up @@ -347,7 +347,7 @@ func BenchmarkPrefixPluginStress(b *testing.B) {
blockSize := 4
maxPrefixBlocks := 50000
config := Config{
HashBlockSize: blockSize,
BlockSize: blockSize,
MaxPrefixBlocksToMatch: maxPrefixBlocks,
LRUCapacityPerServer: DefaultLRUCapacityPerServer,
}
Expand Down Expand Up @@ -416,7 +416,7 @@ func BenchmarkPrefixPluginChatCompletionsStress(b *testing.B) {
blockSize := 8
maxPrefixBlocks := 50000
config := Config{
HashBlockSize: blockSize,
BlockSize: blockSize,
MaxPrefixBlocksToMatch: maxPrefixBlocks,
LRUCapacityPerServer: DefaultLRUCapacityPerServer,
}
Expand Down
8 changes: 4 additions & 4 deletions site-src/guides/epp-configuration/config-text.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ kind: EndpointPickerConfig
plugins:
- type: prefix-cache-scorer
parameters:
hashBlockSize: 5
blockSize: 5
maxPrefixBlocksToMatch: 256
lruCapacityPerServer: 31250
schedulingProfiles:
Expand Down Expand Up @@ -158,7 +158,7 @@ spec:
plugins:
- type: prefix-cache-scorer
parameters:
hashBlockSize: 5
blockSize: 5
maxPrefixBlocksToMatch: 256
lruCapacityPerServer: 31250
schedulingProfiles:
Expand All @@ -177,7 +177,7 @@ kind: EndpointPickerConfig
plugins:
- type: prefix-cache-scorer
parameters:
hashBlockSize: 5
blockSize: 5
maxPrefixBlocksToMatch: 256
lruCapacityPerServer: 31250
- type: single-profile-handler
Expand Down Expand Up @@ -207,7 +207,7 @@ Scores pods based on the amount of the prompt is believed to be in the pod's KvC

- *Type*: prefix-cache-scorer
- *Parameters*:
- `hashBlockSize` specified the size of the blocks to break up the input prompt when
- `blockSize` specifies the size of the blocks into which the input prompt is split when
calculating the block hashes. If not specified, defaults to `64`
- `maxPrefixBlocksToMatch` specifies the maximum number of prefix blocks to match. If
not specified defaults to `256`
Expand Down
2 changes: 1 addition & 1 deletion site-src/guides/epp-configuration/prefix-aware.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Like any other plugins, the prefix cache aware plugin can be enabled/disabled vi

The prefix cache plugin exposes the following advanced configuration parameters:

* `hashBlockSize`: The plugin matches prefixes in the unit of blocks. This is the size
* `blockSize`: The plugin matches prefixes in units of blocks. This is the size
of each block in bytes. vLLM's default block size is 16 tokens; assuming 4 characters per token, the default
is set to 64 in EPP. The default is recommended unless performance is critical for use cases with
extremely long inputs.
Expand Down
2 changes: 1 addition & 1 deletion test/testdata/configloader_1_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ plugins:
type: test-profile-handler
- type: test-two
parameters:
hashBlockSize: 32
blockSize: 32
- name: testPicker
type: test-picker

Expand Down