diff --git a/pkg/epp/config/loader/configloader_test.go b/pkg/epp/config/loader/configloader_test.go index ff7b65256..b0b0741c1 100644 --- a/pkg/epp/config/loader/configloader_test.go +++ b/pkg/epp/config/loader/configloader_test.go @@ -73,7 +73,7 @@ func TestLoadRawConfiguration(t *testing.T) { }, { Type: test2Type, - Parameters: json.RawMessage("{\"hashBlockSize\":32}"), + Parameters: json.RawMessage("{\"blockSize\":32}"), }, { Name: "testPicker", @@ -175,7 +175,7 @@ func TestLoadRawConfigurationWithDefaults(t *testing.T) { { Name: test2Type, Type: test2Type, - Parameters: json.RawMessage("{\"hashBlockSize\":32}"), + Parameters: json.RawMessage("{\"blockSize\":32}"), }, { Name: "testPicker", @@ -464,7 +464,7 @@ plugins: type: test-profile-handler - type: test-two parameters: - hashBlockSize: 32 + blockSize: 32 - name: testPicker type: test-picker schedulingProfiles: @@ -767,7 +767,7 @@ plugins: - name: prefixCacheScorer type: prefix-cache-scorer parameters: - hashBlockSize: 32 + blockSize: 32 - name: maxScorePicker type: max-score-picker - name: profileHandler @@ -792,7 +792,7 @@ plugins: - name: prefixCacheScorer type: prefix-cache-scorer parameters: - hashBlockSize: 32 + blockSize: 32 schedulingProfiles: - name: default plugins: @@ -826,7 +826,7 @@ plugins: - name: prefixCacheScorer type: prefix-cache-scorer parameters: - hashBlockSize: asdf + blockSize: asdf schedulingProfiles: - name: default plugins: diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go index 40e88062a..6bc81a44a 100644 --- a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go +++ b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin.go @@ -37,7 +37,7 @@ import ( const ( // vLLM default token block size is 16, and a good guess of average characters per token is 4. - DefaultHashBlockSize = 64 + DefaultBlockSize = 64 // The maximum number of blocks to match. 
Two long requests with the same prefix up to this // limit will be indistinguishable. // This parameter provides a trade-off between cache size, prefix matching speed and matching @@ -58,15 +58,15 @@ const ( ) var DefaultConfig = Config{ - HashBlockSize: DefaultHashBlockSize, + BlockSize: DefaultBlockSize, MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks, LRUCapacityPerServer: DefaultLRUCapacityPerServer, } type Config struct { - // The input prompt is broken into sizes of HashBlockSize to calculate block hashes . Requests + // The input prompt is broken into sizes of BlockSize to calculate block hashes. Requests // with length shorter than the block size will be ignored. - HashBlockSize int `json:"hashBlockSize"` + BlockSize int `json:"blockSize"` // MaxPrefixBlocksToMatch is the maximum number of prefix blocks to match. Input beyond this limit will // be ignored. MaxPrefixBlocksToMatch int `json:"maxPrefixBlocksToMatch"` @@ -133,7 +133,7 @@ var ( // PrefixCachePluginFactory defines the factory function for Prefix plugin. func PrefixCachePluginFactory(name string, rawParameters json.RawMessage, handle plugins.Handle) (plugins.Plugin, error) { parameters := Config{ - HashBlockSize: DefaultHashBlockSize, + BlockSize: DefaultBlockSize, MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks, LRUCapacityPerServer: DefaultLRUCapacityPerServer, } @@ -180,7 +180,7 @@ func (p *Plugin) WithName(name string) *Plugin { // Score returns the scoring result for the given list of pods based on context. func (p *Plugin) Score(ctx context.Context, cycleState *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 { // pre score step, hashing prompt and find longest prefix match. 
- hashes := hashPrompt(ctx, request, p.config.HashBlockSize, p.config.MaxPrefixBlocksToMatch) + hashes := hashPrompt(ctx, request, p.config.BlockSize, p.config.MaxPrefixBlocksToMatch) state := &SchedulingContextState{ PrefixHashes: hashes, PrefixCacheServers: p.matchLongestPrefix(ctx, hashes), @@ -231,7 +231,7 @@ func (p *Plugin) PreRequest(ctx context.Context, request *types.LLMRequest, sche total := len(state.PrefixHashes) matchLen := state.PrefixCacheServers[ServerID(targetPod.NamespacedName)] - metrics.RecordPrefixCacheMatch(matchLen*p.config.HashBlockSize, total*p.config.HashBlockSize) + metrics.RecordPrefixCacheMatch(matchLen*p.config.BlockSize, total*p.config.BlockSize) } // matchLongestPrefix returns a map of servers and length of prefix that each server caches. diff --git a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go index 9f9893ba8..cfcf99d0f 100644 --- a/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go +++ b/pkg/epp/scheduling/framework/plugins/multi/prefix/plugin_test.go @@ -35,7 +35,7 @@ import ( func TestPrefixPluginCompletion(t *testing.T) { config := Config{ - HashBlockSize: 4, + BlockSize: 4, MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks, LRUCapacityPerServer: DefaultLRUCapacityPerServer, } @@ -201,7 +201,7 @@ func TestPrefixPluginCompletion(t *testing.T) { func TestPrefixPluginChatCompletions(t *testing.T) { config := Config{ - HashBlockSize: 4, + BlockSize: 4, MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks, LRUCapacityPerServer: DefaultLRUCapacityPerServer, } @@ -235,7 +235,7 @@ func TestPrefixPluginChatCompletions(t *testing.T) { func TestPrefixPluginChatCompletionsGrowth(t *testing.T) { config := Config{ - HashBlockSize: 8, // Use larger block size for more predictable JSON marshaling + BlockSize: 8, // Use larger block size for more predictable JSON marshaling MaxPrefixBlocksToMatch: DefaultMaxPrefixBlocks, LRUCapacityPerServer: 
DefaultLRUCapacityPerServer, } @@ -347,7 +347,7 @@ func BenchmarkPrefixPluginStress(b *testing.B) { blockSize := 4 maxPrefixBlocks := 50000 config := Config{ - HashBlockSize: blockSize, + BlockSize: blockSize, MaxPrefixBlocksToMatch: maxPrefixBlocks, LRUCapacityPerServer: DefaultLRUCapacityPerServer, } @@ -416,7 +416,7 @@ func BenchmarkPrefixPluginChatCompletionsStress(b *testing.B) { blockSize := 8 maxPrefixBlocks := 50000 config := Config{ - HashBlockSize: blockSize, + BlockSize: blockSize, MaxPrefixBlocksToMatch: maxPrefixBlocks, LRUCapacityPerServer: DefaultLRUCapacityPerServer, } diff --git a/site-src/guides/epp-configuration/config-text.md b/site-src/guides/epp-configuration/config-text.md index 6df19db80..16ecdbb95 100644 --- a/site-src/guides/epp-configuration/config-text.md +++ b/site-src/guides/epp-configuration/config-text.md @@ -91,7 +91,7 @@ kind: EndpointPickerConfig plugins: - type: prefix-cache-scorer parameters: - hashBlockSize: 5 + blockSize: 5 maxPrefixBlocksToMatch: 256 lruCapacityPerServer: 31250 schedulingProfiles: @@ -158,7 +158,7 @@ spec: plugins: - type: prefix-cache-scorer parameters: - hashBlockSize: 5 + blockSize: 5 maxPrefixBlocksToMatch: 256 lruCapacityPerServer: 31250 schedulingProfiles: @@ -177,7 +177,7 @@ kind: EndpointPickerConfig plugins: - type: prefix-cache-scorer parameters: - hashBlockSize: 5 + blockSize: 5 maxPrefixBlocksToMatch: 256 lruCapacityPerServer: 31250 - type: single-profile-handler @@ -207,7 +207,7 @@ Scores pods based on the amount of the prompt is believed to be in the pod's KvC - *Type*: prefix-cache-scorer - *Parameters*: - - `hashBlockSize` specified the size of the blocks to break up the input prompt when + - `blockSize` specifies the size of the blocks to break up the input prompt when calculating the block hashes. If not specified defaults to `64` - `maxPrefixBlocksToMatch` specifies the maximum number of prefix blocks to match. 
If not specified defaults to `256` diff --git a/site-src/guides/epp-configuration/prefix-aware.md b/site-src/guides/epp-configuration/prefix-aware.md index 9c1074be9..88573c466 100644 --- a/site-src/guides/epp-configuration/prefix-aware.md +++ b/site-src/guides/epp-configuration/prefix-aware.md @@ -14,7 +14,7 @@ Like any other plugins, the prefix cache aware plugin can be enabled/disabled vi The prefix cache plugin exposes the following advanced configuration parameters: -* `hashBlockSize`: The plugin matches prefixes in the unit of blocks. This is the size +* `blockSize`: The plugin matches prefixes in the unit of blocks. This is the size of each block in number of bytes. vLLM default block size is 16 tokens. Assume 4 characters per token, the default is set to 64 in EPP. The default is recommended unless performance is critical for use cases with extremely long inputs. diff --git a/test/testdata/configloader_1_test.yaml b/test/testdata/configloader_1_test.yaml index f1f167efb..db75a4265 100644 --- a/test/testdata/configloader_1_test.yaml +++ b/test/testdata/configloader_1_test.yaml @@ -9,7 +9,7 @@ plugins: type: test-profile-handler - type: test-two parameters: - hashBlockSize: 32 + blockSize: 32 - name: testPicker type: test-picker