diff --git a/api/inference/v1alpha1/config_types.go b/api/inference/v1alpha1/config_types.go
index de720569..a51a2314 100644
--- a/api/inference/v1alpha1/config_types.go
+++ b/api/inference/v1alpha1/config_types.go
@@ -42,7 +42,6 @@ type BackendRuntimeConfig struct {
 	// ConfigName represents the recommended configuration name for the backend,
 	// It will be inferred from the models in the runtime if not specified, e.g. default,
 	// speculative-decoding.
-	// +kubebuilder:default=default
 	ConfigName *string `json:"configName,omitempty"`
 	// Args defined here will "append" the args defined in the recommendedConfig,
 	// either explicitly configured in configName or inferred in the runtime.
diff --git a/chart/templates/backends/llamacpp.yaml b/chart/templates/backends/llamacpp.yaml
index fb6ae6d0..3fdada9c 100644
--- a/chart/templates/backends/llamacpp.yaml
+++ b/chart/templates/backends/llamacpp.yaml
@@ -49,6 +49,20 @@ spec:
         limits:
           cpu: 2
           memory: 4Gi
+    - name: speculative-decoding
+      args:
+        - -m
+        - "{{`{{ .ModelPath }}`}}"
+        - -md
+        - "{{`{{ .DraftModelPath }}`}}"
+        - --host
+        - "0.0.0.0"
+        - --port
+        - "8080"
+        - --draft-max
+        - "16"
+        - --draft-min
+        - "5"
   startupProbe:
     periodSeconds: 10
     failureThreshold: 30
diff --git a/config/crd/bases/inference.llmaz.io_playgrounds.yaml b/config/crd/bases/inference.llmaz.io_playgrounds.yaml
index e5854951..00ec2e20 100644
--- a/config/crd/bases/inference.llmaz.io_playgrounds.yaml
+++ b/config/crd/bases/inference.llmaz.io_playgrounds.yaml
@@ -59,7 +59,6 @@ spec:
                     the hood, e.g. vLLM.
                   type: string
                 configName:
-                  default: default
                   description: |-
                     ConfigName represents the recommended configuration name for the backend,
                     It will be inferred from the models in the runtime if not specified, e.g. default,
diff --git a/docs/examples/README.md b/docs/examples/README.md
index 6733d004..42a745b4 100644
--- a/docs/examples/README.md
+++ b/docs/examples/README.md
@@ -12,6 +12,7 @@ We provide a set of examples to help you serve large language models, by default
 - [Deploy models via TensorRT-LLM](#deploy-models-via-tensorrt-llm)
 - [Deploy models via text-generation-inference](#deploy-models-via-text-generation-inference)
 - [Deploy models via ollama](#deploy-models-via-ollama)
+- [Speculative Decoding with llama.cpp](#speculative-decoding-with-llamacpp)
 - [Speculative Decoding with vLLM](#speculative-decoding-with-vllm)
 - [Multi-Host Inference](#multi-host-inference)
 - [Deploy Host Models](#deploy-host-models)
@@ -59,6 +60,10 @@ By default, we use [vLLM](https://github.com/vllm-project/vllm) as the inference
 
 [ollama](https://github.com/ollama/ollama) based on llama.cpp, aims for local deploy. see [example](./ollama/) here.
 
+### Speculative Decoding with llama.cpp
+
+llama.cpp supports speculative decoding to significantly improve inference performance, see [example](./speculative-decoding/llamacpp/) here.
+
 ### Speculative Decoding with vLLM
 
 [Speculative Decoding](https://arxiv.org/abs/2211.17192) can improve inference performance efficiently, see [example](./speculative-decoding/vllm/) here.
diff --git a/docs/examples/speculative-decoding/llamacpp/playground.yaml b/docs/examples/speculative-decoding/llamacpp/playground.yaml
index a90b44b4..a02fccda 100644
--- a/docs/examples/speculative-decoding/llamacpp/playground.yaml
+++ b/docs/examples/speculative-decoding/llamacpp/playground.yaml
@@ -1,5 +1,5 @@
 # This is just an toy example, because it doesn't make any sense
-# in real world, drafting tokens for the model with similar size.
+# in real world, drafting tokens for the model with smaller size.
 apiVersion: llmaz.io/v1alpha1
 kind: OpenModel
 metadata:
@@ -38,10 +38,3 @@ spec:
     backendName: llamacpp
     args:
       - -fa # use flash attention
-    resources:
-      requests:
-        cpu: 4
-        memory: "8Gi"
-      limits:
-        cpu: 4
-        memory: "8Gi"
diff --git a/test/config/backends/llamacpp.yaml b/test/config/backends/llamacpp.yaml
index dfd81d8e..8e3eb9c5 100644
--- a/test/config/backends/llamacpp.yaml
+++ b/test/config/backends/llamacpp.yaml
@@ -29,21 +29,20 @@ spec:
         limits:
           cpu: 2
           memory: 4Gi
-    # TODO: not supported yet, see https://github.com/InftyAI/llmaz/issues/240.
-    # - name: speculative-decoding
-    #   args:
-    #     - -m
-    #     - "{{ .ModelPath }}"
-    #     - -md
-    #     - "{{ .DraftModelPath }}"
-    #     - --host
-    #     - "0.0.0.0"
-    #     - --port
-    #     - "8080"
-    #     - --draft-max
-    #     - "16"
-    #     - --draft-min
-    #     - "5"
+    - name: speculative-decoding
+      args:
+        - -m
+        - "{{ .ModelPath }}"
+        - -md
+        - "{{ .DraftModelPath }}"
+        - --host
+        - "0.0.0.0"
+        - --port
+        - "8080"
+        - --draft-max
+        - "16"
+        - --draft-min
+        - "5"
   startupProbe:
     periodSeconds: 10
     failureThreshold: 30
diff --git a/test/e2e/playground_test.go b/test/e2e/playground_test.go
index 9e73f434..d82bd708 100644
--- a/test/e2e/playground_test.go
+++ b/test/e2e/playground_test.go
@@ -142,32 +142,29 @@ var _ = ginkgo.Describe("playground e2e tests", func() {
 		hpa := &autoscalingv2.HorizontalPodAutoscaler{}
 		gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: playground.Name, Namespace: playground.Namespace}, hpa)).To(gomega.Succeed())
 	})
-	// TODO: add e2e tests.
-	// ginkgo.It("SpeculativeDecoding with llama.cpp", func() {
-	// 	targetModel := wrapper.MakeModel("llama2-7b-q8-gguf").FamilyName("llama2").ModelSourceWithModelHub("Huggingface").ModelSourceWithModelID("TheBloke/Llama-2-7B-GGUF", "llama-2-7b.Q8_0.gguf", "", nil, nil).Obj()
-	// 	gomega.Expect(k8sClient.Create(ctx, targetModel)).To(gomega.Succeed())
-	// 	defer func() {
-	// 		gomega.Expect(k8sClient.Delete(ctx, targetModel)).To(gomega.Succeed())
-	// 	}()
-	// 	draftModel := wrapper.MakeModel("llama2-7b-q2-k-gguf").FamilyName("llama2").ModelSourceWithModelHub("Huggingface").ModelSourceWithModelID("TheBloke/Llama-2-7B-GGUF", "llama-2-7b.Q2_K.gguf", "", nil, nil).Obj()
-	// 	gomega.Expect(k8sClient.Create(ctx, draftModel)).To(gomega.Succeed())
-	// 	defer func() {
-	// 		gomega.Expect(k8sClient.Delete(ctx, draftModel)).To(gomega.Succeed())
-	// 	}()
-
-	// 	playground := wrapper.MakePlayground("llamacpp-speculator", ns.Name).
-	// 		MultiModelsClaim([]string{"llama2-7b-q8-gguf", "llama2-7b-q2-k-gguf"}, coreapi.SpeculativeDecoding).
-	// 		BackendRuntime("llamacpp").BackendLimit("cpu", "4").BackendRequest("memory", "8Gi").
-	// 		Replicas(1).
-	// 		Obj()
-	// 	gomega.Expect(k8sClient.Create(ctx, playground)).To(gomega.Succeed())
-	// 	validation.ValidatePlayground(ctx, k8sClient, playground)
-	// 	validation.ValidatePlaygroundStatusEqualTo(ctx, k8sClient, playground, inferenceapi.PlaygroundAvailable, "PlaygroundReady", metav1.ConditionTrue)
-
-	// 	service := &inferenceapi.Service{}
-	// 	gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: playground.Name, Namespace: playground.Namespace}, service)).To(gomega.Succeed())
-	// 	validation.ValidateService(ctx, k8sClient, service)
-	// 	validation.ValidateServiceStatusEqualTo(ctx, k8sClient, service, inferenceapi.ServiceAvailable, "ServiceReady", metav1.ConditionTrue)
-	// 	validation.ValidateServicePods(ctx, k8sClient, service)
-	// })
+	ginkgo.It("SpeculativeDecoding with llama.cpp", func() {
+		targetModel := wrapper.MakeModel("llama2-7b-q8-gguf").FamilyName("llama2").ModelSourceWithModelHub("Huggingface").ModelSourceWithModelID("TheBloke/Llama-2-7B-GGUF", "llama-2-7b.Q8_0.gguf", "", nil, nil).Obj()
+		gomega.Expect(k8sClient.Create(ctx, targetModel)).To(gomega.Succeed())
+		defer func() {
+			gomega.Expect(k8sClient.Delete(ctx, targetModel)).To(gomega.Succeed())
+		}()
+		draftModel := wrapper.MakeModel("llama2-7b-q2-k-gguf").FamilyName("llama2").ModelSourceWithModelHub("Huggingface").ModelSourceWithModelID("TheBloke/Llama-2-7B-GGUF", "llama-2-7b.Q2_K.gguf", "", nil, nil).Obj()
+		gomega.Expect(k8sClient.Create(ctx, draftModel)).To(gomega.Succeed())
+		defer func() {
+			gomega.Expect(k8sClient.Delete(ctx, draftModel)).To(gomega.Succeed())
+		}()
+
+		playground := wrapper.MakePlayground("llamacpp-speculator", ns.Name).
+			ModelClaims([]string{"llama2-7b-q8-gguf", "llama2-7b-q2-k-gguf"}, []string{"main", "draft"}).
+			BackendRuntime("llamacpp").Replicas(1).Obj()
+		gomega.Expect(k8sClient.Create(ctx, playground)).To(gomega.Succeed())
+		validation.ValidatePlayground(ctx, k8sClient, playground)
+		validation.ValidatePlaygroundStatusEqualTo(ctx, k8sClient, playground, inferenceapi.PlaygroundAvailable, "PlaygroundReady", metav1.ConditionTrue)
+
+		service := &inferenceapi.Service{}
+		gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: playground.Name, Namespace: playground.Namespace}, service)).To(gomega.Succeed())
+		validation.ValidateService(ctx, k8sClient, service)
+		validation.ValidateServiceStatusEqualTo(ctx, k8sClient, service, inferenceapi.ServiceAvailable, "ServiceReady", metav1.ConditionTrue)
+		validation.ValidateServicePods(ctx, k8sClient, service)
+	})
 })
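For reference, a minimal sketch of a Playground that exercises the new speculative-decoding config: since the patch drops the `configName` default and lets the config be inferred from the claimed models, claiming a main and a draft model against the llamacpp backend is enough. The `modelClaims` field layout below is an assumption inferred from the `ModelClaims([]string{...}, []string{"main", "draft"})` wrapper call in the e2e test; the committed example at docs/examples/speculative-decoding/llamacpp/playground.yaml is authoritative.

```yaml
# Sketch only, under the assumptions stated above; not part of the patch.
apiVersion: inference.llmaz.io/v1alpha1
kind: Playground
metadata:
  name: llamacpp-speculator
spec:
  replicas: 1
  modelClaims:
    models:
      - name: llama2-7b-q8-gguf   # target model, rendered into {{ .ModelPath }}
        role: main
      - name: llama2-7b-q2-k-gguf # draft model, rendered into {{ .DraftModelPath }}
        role: draft
  backendRuntimeConfig:
    backendName: llamacpp
    args:
      - -fa # use flash attention, matching the committed example
```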