From 5fabf10a4ba37599b4e8f8b103d8f5ed6870dbdd Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Tue, 3 Jun 2025 21:16:13 -0400 Subject: [PATCH 01/14] replace calls in sampleTokenFromLogits with GPU kernels --- src/llm_chat.ts | 365 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 310 insertions(+), 55 deletions(-) diff --git a/src/llm_chat.ts b/src/llm_chat.ts index 5f8ecf00..8f052fab 100644 --- a/src/llm_chat.ts +++ b/src/llm_chat.ts @@ -50,6 +50,12 @@ export class LLMChatPipeline { private image_embed: tvmjs.PackedFunc | undefined; private embed: tvmjs.PackedFunc; private fapplyBitmask: tvmjs.PackedFunc; + private fapplyPenalty: tvmjs.PackedFunc; + private fapplyLogitBias: tvmjs.PackedFunc; + private fsoftmaxWithTemperature: tvmjs.PackedFunc; + // private frenormalizeByTopP: tvmjs.PackedFunc; //BatchRenormalizeProbsByTopP + // private //BatchSampleTokensImpl, ChunkSampleTokensImpl + // Functions related to PagedKVCache private fclearKVCaches: tvmjs.PackedFunc; private fKVCacheAddSequence: tvmjs.PackedFunc; @@ -62,6 +68,7 @@ export class LLMChatPipeline { private params: tvmjs.TVMObject; private kvCache: tvmjs.TVMObject; private logitsOnCPU?: tvmjs.NDArray = undefined; + private logitsOnCPUCopy?: tvmjs.NDArray = undefined; private filledKVCacheLength = 0; // meta data @@ -190,6 +197,15 @@ export class LLMChatPipeline { this.fapplyBitmask = this.tvm.detachFromCurrentScope( this.vm.getFunction("apply_bitmask_inplace"), ); + this.fapplyPenalty = this.tvm.detachFromCurrentScope( + this.vm.getFunction("apply_penalty_inplace"), + ); + this.fapplyLogitBias = this.tvm.detachFromCurrentScope( + this.vm.getFunction("apply_logit_bias_inplace"), + ); + this.fsoftmaxWithTemperature = this.tvm.detachFromCurrentScope( + this.vm.getFunction("softmax_with_temperature"), + ); try { this.image_embed = this.tvm.detachFromCurrentScope( this.vm.getFunction("image_embed"), @@ -302,6 +318,7 @@ export class LLMChatPipeline { this.kvCache.dispose(); this.fclearKVCaches.dispose(); this.logitsOnCPU?.dispose(); + this.logitsOnCPUCopy?.dispose(); this.tvm.dispose(); this.tokenizer.dispose(); this.xgTokenizerInfo?.dispose(); @@ -957,6 +974,20 @@ export class LLMChatPipeline { return this.logitsOnCPU; } + private updateLogitsOnCPUCopy(logits: tvmjs.NDArray): tvmjs.NDArray { + if (this.logitsOnCPUCopy == undefined) { + this.logitsOnCPUCopy = this.tvm.detachFromCurrentScope( + this.tvm.empty(logits.shape, logits.dtype, this.tvm.cpu()), + ); + } else { + if (logits.shape[0] != this.logitsOnCPUCopy.shape[0]) { + throw Error("We expect the size of logits to remain unchanged"); + } + } + this.logitsOnCPUCopy.copyFrom(logits); + return this.logitsOnCPUCopy; + } + private async sampleTokenFromLogits( logitsOnGPU: tvmjs.NDArray, genConfig?: GenerationConfig, @@ -1091,68 +1122,221 @@ export class LLMChatPipeline { if (this.logitProcessor !== undefined) { logitsOnCPUArray = this.logitProcessor.processLogits(logitsOnCPUArray); } + if (_hasValue(logit_bias)) { - for (const tokenID in logit_bias) { - const curBias = logit_bias[tokenID]; - const curTokenID = parseInt(tokenID); - if (curTokenID > vocab_size) { - throw Error( - "Token " + - curTokenID + - " in logit_bias exceeds vocab_size " + - vocab_size, - ); - } - logitsOnCPUArray[curTokenID] += curBias; + this.tvm.beginScope(); + const numTokens = Object.keys(logit_bias ?? 
{}).length; + const pos2seq_id = new Int32Array(numTokens); + const tokenIds = new Int32Array(numTokens); + const tokenLogitBias = new Float32Array(numTokens); + + for (let index = 0; index < numTokens; index++) { + pos2seq_id[index] = 0; + tokenIds[index] = parseInt(Object.keys(logit_bias ?? {})[index]); + tokenLogitBias[index] = logit_bias![tokenIds[index]]; } + + const pos2seqIdsArray = this.tvm + .empty([numTokens], "int32", this.device) + .copyFrom(pos2seq_id); + + const tokenIdsArray = this.tvm + .empty([numTokens], "int32", this.device) + .copyFrom(tokenIds); + + const tokenLogitBiasArray = this.tvm + .empty([numTokens], "float32", this.device) + .copyFrom(tokenLogitBias); + + const logitsOnGPU = this.tvm + .empty([1, this.fullVocabSize], "float32", this.device) + .copyFrom(logitsOnCPUArray); + + this.fapplyLogitBias( + logitsOnGPU, + pos2seqIdsArray, + tokenIdsArray, + tokenLogitBiasArray, + ); + this.updateLogitsOnCPU(logitsOnGPU); + this.tvm.endScope(); } - this.logitsOnCPU.copyFrom(logitsOnCPUArray); + await this.device.sync(); + // console.log("After applying logit bias (GPU):", this.logitsOnCPUCopy?.toArray().slice(0, 20)); + + // if (_hasValue(logit_bias)) { + // for (const tokenID in logit_bias) { + // const curBias = logit_bias[tokenID]; + // const curTokenID = parseInt(tokenID); + // if (curTokenID > vocab_size) { + // throw Error( + // "Token " + + // curTokenID + + // " in logit_bias exceeds vocab_size " + + // vocab_size, + // ); + // } + // logitsOnCPUArray[curTokenID] += curBias; + // } + // } + // this.logitsOnCPU.copyFrom(logitsOnCPUArray); + // console.log("After applying logit bias (CPU):", this.logitsOnCPU.toArray().slice(0, 20)); } + // if (JSON.stringify(this.logitsOnCPUCopy?.toArray()) !== JSON.stringify(this.logitsOnCPU.toArray())) { + // throw new Error("Logits on CPU and GPU do not match"); + // } + + // console.log("Penalties:", { + // frequency_penalty, + // presence_penalty, + // repetition_penalty, + // }); + // 3. Apply penalties to logits - if (_hasValue(frequency_penalty) && _hasValue(presence_penalty)) { - // 3.1. Use frequency and presence penalty + if ( + frequency_penalty != 0.0 || + presence_penalty != 0.0 || + repetition_penalty != 1.0 + ) { this.tvm.beginScope(); - // Both `keys()` and `values()` are in insertion order. const appearedTokens = [...this.appearedTokensFreq.keys()]; const appearedTokensFreqs = [...this.appearedTokensFreq.values()]; - const appeared_tokens_ndarray = this.tvm.empty( - [1, appearedTokens.length], - "int32", - this.tvm.cpu(), - ); - const appeared_tokens_freqs_ndarray = this.tvm.empty( - [1, appearedTokensFreqs.length], - "int32", - this.tvm.cpu(), - ); - appeared_tokens_ndarray.copyFrom(appearedTokens); - appeared_tokens_freqs_ndarray.copyFrom(appearedTokensFreqs); - this.tvm.applyPresenceAndFrequencyPenalty( - this.logitsOnCPU, - appeared_tokens_ndarray, - appeared_tokens_freqs_ndarray, - presence_penalty!, - frequency_penalty!, - ); - this.tvm.endScope(); - } else if (repetition_penalty != 1.0) { - // 3.2. 
Use repetition penalty - this.tvm.beginScope(); - const appearedTokens = [...this.appearedTokensFreq.keys()]; - const appeared_tokens_ndarray = this.tvm.empty( - [1, appearedTokens.length], - "int32", - this.tvm.cpu(), - ); - appeared_tokens_ndarray.copyFrom(appearedTokens); - this.tvm.applyRepetitionPenalty( - this.logitsOnCPU, - appeared_tokens_ndarray, + + const numTokens = appearedTokens.length; + // const paddedNumTokens = Math.ceil(numTokens / 4) * 4; + + const seqIdsArray = this.tvm + .empty([1], "int32", this.device) + .copyFrom([0]); + + const pos2seq_id = new Int32Array(numTokens).fill(0); + const tokenIds = new Int32Array(numTokens).fill(0); + const tokenCnt = new Int32Array(numTokens).fill(0); + const penalties = new Float32Array([ + presence_penalty, + frequency_penalty, repetition_penalty, - ); + ]); + const paddedPenalties = new Float32Array(3); + paddedPenalties.set(penalties); + + for (let index = 0; index < numTokens; index++) { + pos2seq_id[index] = 0; + tokenIds[index] = appearedTokens[index]; + tokenCnt[index] = appearedTokensFreqs[index]; + } + + const pos2seqIdsArray = this.tvm + .empty([numTokens], "int32", this.device) + .copyFrom(pos2seq_id); + + const tokenIdsArray = this.tvm + .empty([numTokens], "int32", this.device) + .copyFrom(tokenIds); + + const tokenCntArray = this.tvm + .empty([numTokens], "int32", this.device) + .copyFrom(tokenCnt); + + const penaltiesArray = this.tvm + .empty([1, 3], "float32", this.device) + .copyFrom(paddedPenalties); + + const logitsOnGPU = this.tvm + .empty([1, this.fullVocabSize], "float32", this.device) + .copyFrom(this.logitsOnCPU.toArray()); + + // console.log("logitsOnGPU shape:", logitsOnGPU.shape); + // console.log("seqIdsArray shape:", seqIdsArray.shape); + // console.log("pos2seqIdsArray shape:", pos2seqIdsArray.shape); + // console.log("tokenIdsArray shape:", tokenIdsArray.shape); + // console.log("tokenCntArray shape:", tokenCntArray.shape); + // console.log("penaltiesArray shape:", penaltiesArray.shape); + + if (numTokens > 0) { + this.fapplyPenalty( + logitsOnGPU, + seqIdsArray, + pos2seqIdsArray, + tokenIdsArray, + tokenCntArray, + penaltiesArray, + ); + } + this.updateLogitsOnCPU(logitsOnGPU); this.tvm.endScope(); } + await this.device.sync(); + // console.log("After applying penalties (GPU):", this.logitsOnCPUCopy?.toArray().slice(0, 20)); + + // if (_hasValue(frequency_penalty) && _hasValue(presence_penalty)) { + // // 3.1. Use frequency and presence penalty + // this.tvm.beginScope(); + // // Both `keys()` and `values()` are in insertion order. 
+ // const appearedTokens = [...this.appearedTokensFreq.keys()]; + // const appearedTokensFreqs = [...this.appearedTokensFreq.values()]; + // const appeared_tokens_ndarray = this.tvm.empty( + // [1, appearedTokens.length], + // "int32", + // this.tvm.cpu(), + // ); + // const appeared_tokens_freqs_ndarray = this.tvm.empty( + // [1, appearedTokensFreqs.length], + // "int32", + // this.tvm.cpu(), + // ); + // appeared_tokens_ndarray.copyFrom(appearedTokens); + // appeared_tokens_freqs_ndarray.copyFrom(appearedTokensFreqs); + // // let logitsOnCPUBefore = this.logitsOnCPU.toArray(); + // this.tvm.applyPresenceAndFrequencyPenalty( + // this.logitsOnCPU, + // appeared_tokens_ndarray, + // appeared_tokens_freqs_ndarray, + // presence_penalty!, + // frequency_penalty!, + // ); + // // if ( + // // JSON.stringify(logitsOnCPUBefore) === + // // JSON.stringify(this.logitsOnCPU.toArray()) + // // ) { + // // console.log("No penalty applied"); + // // } + // this.tvm.endScope(); + // } else if (repetition_penalty != 1.0) { + // // 3.2. Use repetition penalty + // this.tvm.beginScope(); + // const appearedTokens = [...this.appearedTokensFreq.keys()]; + // const appeared_tokens_ndarray = this.tvm.empty( + // [1, appearedTokens.length], + // "int32", + // this.tvm.cpu(), + // ); + // appeared_tokens_ndarray.copyFrom(appearedTokens); + // this.tvm.applyRepetitionPenalty( + // this.logitsOnCPU, + // appeared_tokens_ndarray, + // repetition_penalty, + // ); + // this.tvm.endScope(); + // } + // // console.log("After applying penalties (CPU):", this.logitsOnCPU.toArray().slice(0, 20)); + + // if (this.logitsOnCPUCopy) { + // console.log("logitsOnCPUCopy shape:", this.logitsOnCPUCopy.shape); + // const logitsOnCPUArray = this.logitsOnCPU.toArray(); + // const logitsOnCPUCopyArray = this.logitsOnCPUCopy.toArray(); + // let flag = true; + // for (let i = 0; i < logitsOnCPUArray.length; i++) { + // if (Math.abs(logitsOnCPUArray[i] - logitsOnCPUCopyArray[i]) > 1e-6) { + // console.error(`Mismatch at index ${i}: CPU=${logitsOnCPUArray[i]}, GPU=${logitsOnCPUCopyArray[i]}`); + // flag = false; + // } + // } + // if (!flag) { + // throw new Error("Logits on CPU and GPU do not match within tolerance"); + // } + // } // 4. 
Sample token from logits // If logprobs, need the actual distribution via softmax, otherwise directly sample from logits @@ -1160,11 +1344,82 @@ export class LLMChatPipeline { if (logprobs) { // Inplace transform logitsOnCPU to a distribution temperature = Math.max(1e-6, temperature); // to prevent division by zero - this.tvm.applySoftmaxWithTemperature(this.logitsOnCPU, temperature); - sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU, top_p); - this.tokenLogprobArray.push( - this.getTokenLogprob(sampledToken, top_logprobs!), - ); + + if (this.logitsOnCPU.shape[2] !== this.fullVocabSize) { + throw new Error("Logits vocab size does not match full vocab size"); + } + + // this.tvm.beginScope(); + // const testArray = new Float32Array(this.fullVocabSize); + // for (let i = 0; i < this.fullVocabSize; i++) { + // testArray[i] = Math.random() * 10; + // } + // console.log("Test array:", testArray.slice(0, 20)); + // const detachedTestArrayOnGPU = this.tvm.detachFromCurrentScope( + // this.tvm.empty([1, 1, this.fullVocabSize], "float32", this.device).copyFrom(testArray) + // ); + // await this.device.sync(); + // const detachedTestArrayOnCPU = this.tvm.detachFromCurrentScope( + // this.tvm.empty([testArray.length], "float32", this.tvm.cpu()).copyFrom(testArray) + // ); + // this.tvm.endScope(); + + const numSeqs = 1; + const numTokens = this.appearedTokensFreq.size; + + if (numTokens > 0) { + const temperatures = new Float32Array([temperature]); + + this.tvm.beginScope(); + const temperaturesArray = this.tvm + .empty([numSeqs], "float32", this.device) + .copyFrom(temperatures); + + const logitsOnGPU = this.tvm + .empty([numSeqs, 1, this.fullVocabSize], "float32", this.device) + .copyFrom(this.logitsOnCPU.toArray()); + + // const detachedTestArrayOnGPUCopy = this.tvm + // .empty(detachedTestArrayOnGPU.shape, detachedTestArrayOnGPU.dtype, this.tvm.cpu()) + // .copyFrom(detachedTestArrayOnGPU); + + // await this.device.sync(); + + const probs = this.fsoftmaxWithTemperature( + logitsOnGPU, + temperaturesArray, + ); + this.updateLogitsOnCPU(probs); + this.tvm.endScope(); + await this.device.sync(); + + sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU, top_p); + this.tokenLogprobArray.push( + this.getTokenLogprob(sampledToken, top_logprobs!), + ); + } else { + this.tvm.applySoftmaxWithTemperature(this.logitsOnCPU, temperature); + sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU, top_p); + this.tokenLogprobArray.push( + this.getTokenLogprob(sampledToken, top_logprobs!), + ); + } + + // if (numTokens > 0 && this.logitsOnCPUCopy) { + // console.log("logitsOnCPUCopy shape:", this.logitsOnCPUCopy.shape); + // const logitsOnCPUArray = this.logitsOnCPU.toArray(); + // const logitsOnCPUCopyArray = this.logitsOnCPUCopy.toArray(); + // let flag = true; + // for (let i = 0; i < logitsOnCPUArray.length; i++) { + // if (Math.abs(logitsOnCPUArray[i] - logitsOnCPUCopyArray[i]) > 1e-6) { + // console.error(`Mismatch at index ${i}: CPU=${logitsOnCPUArray[i]}, GPU=${logitsOnCPUCopyArray[i]}`); + // flag = false; + // } + // } + // if (!flag) { + // throw new Error("Logits on CPU and GPU do not match within tolerance"); + // } + // } } else { // temperature being 0 is allowed here, equivalent to argmax sampledToken = this.tvm.sampleTopPFromLogits( From 2777057780b16610eb11cf24a6f9cb1bc0958d64 Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Tue, 3 Jun 2025 21:28:52 -0400 Subject: [PATCH 02/14] remove debugging code --- src/llm_chat.ts | 147 
------------------------------------------------ 1 file changed, 147 deletions(-) diff --git a/src/llm_chat.ts b/src/llm_chat.ts index 8f052fab..3b3be448 100644 --- a/src/llm_chat.ts +++ b/src/llm_chat.ts @@ -1162,37 +1162,8 @@ export class LLMChatPipeline { this.tvm.endScope(); } await this.device.sync(); - // console.log("After applying logit bias (GPU):", this.logitsOnCPUCopy?.toArray().slice(0, 20)); - - // if (_hasValue(logit_bias)) { - // for (const tokenID in logit_bias) { - // const curBias = logit_bias[tokenID]; - // const curTokenID = parseInt(tokenID); - // if (curTokenID > vocab_size) { - // throw Error( - // "Token " + - // curTokenID + - // " in logit_bias exceeds vocab_size " + - // vocab_size, - // ); - // } - // logitsOnCPUArray[curTokenID] += curBias; - // } - // } - // this.logitsOnCPU.copyFrom(logitsOnCPUArray); - // console.log("After applying logit bias (CPU):", this.logitsOnCPU.toArray().slice(0, 20)); } - // if (JSON.stringify(this.logitsOnCPUCopy?.toArray()) !== JSON.stringify(this.logitsOnCPU.toArray())) { - // throw new Error("Logits on CPU and GPU do not match"); - // } - - // console.log("Penalties:", { - // frequency_penalty, - // presence_penalty, - // repetition_penalty, - // }); - // 3. Apply penalties to logits if ( frequency_penalty != 0.0 || @@ -1204,7 +1175,6 @@ export class LLMChatPipeline { const appearedTokensFreqs = [...this.appearedTokensFreq.values()]; const numTokens = appearedTokens.length; - // const paddedNumTokens = Math.ceil(numTokens / 4) * 4; const seqIdsArray = this.tvm .empty([1], "int32", this.device) @@ -1247,13 +1217,6 @@ export class LLMChatPipeline { .empty([1, this.fullVocabSize], "float32", this.device) .copyFrom(this.logitsOnCPU.toArray()); - // console.log("logitsOnGPU shape:", logitsOnGPU.shape); - // console.log("seqIdsArray shape:", seqIdsArray.shape); - // console.log("pos2seqIdsArray shape:", pos2seqIdsArray.shape); - // console.log("tokenIdsArray shape:", tokenIdsArray.shape); - // console.log("tokenCntArray shape:", tokenCntArray.shape); - // console.log("penaltiesArray shape:", penaltiesArray.shape); - if (numTokens > 0) { this.fapplyPenalty( logitsOnGPU, @@ -1268,75 +1231,6 @@ export class LLMChatPipeline { this.tvm.endScope(); } await this.device.sync(); - // console.log("After applying penalties (GPU):", this.logitsOnCPUCopy?.toArray().slice(0, 20)); - - // if (_hasValue(frequency_penalty) && _hasValue(presence_penalty)) { - // // 3.1. Use frequency and presence penalty - // this.tvm.beginScope(); - // // Both `keys()` and `values()` are in insertion order. 
- // const appearedTokens = [...this.appearedTokensFreq.keys()]; - // const appearedTokensFreqs = [...this.appearedTokensFreq.values()]; - // const appeared_tokens_ndarray = this.tvm.empty( - // [1, appearedTokens.length], - // "int32", - // this.tvm.cpu(), - // ); - // const appeared_tokens_freqs_ndarray = this.tvm.empty( - // [1, appearedTokensFreqs.length], - // "int32", - // this.tvm.cpu(), - // ); - // appeared_tokens_ndarray.copyFrom(appearedTokens); - // appeared_tokens_freqs_ndarray.copyFrom(appearedTokensFreqs); - // // let logitsOnCPUBefore = this.logitsOnCPU.toArray(); - // this.tvm.applyPresenceAndFrequencyPenalty( - // this.logitsOnCPU, - // appeared_tokens_ndarray, - // appeared_tokens_freqs_ndarray, - // presence_penalty!, - // frequency_penalty!, - // ); - // // if ( - // // JSON.stringify(logitsOnCPUBefore) === - // // JSON.stringify(this.logitsOnCPU.toArray()) - // // ) { - // // console.log("No penalty applied"); - // // } - // this.tvm.endScope(); - // } else if (repetition_penalty != 1.0) { - // // 3.2. Use repetition penalty - // this.tvm.beginScope(); - // const appearedTokens = [...this.appearedTokensFreq.keys()]; - // const appeared_tokens_ndarray = this.tvm.empty( - // [1, appearedTokens.length], - // "int32", - // this.tvm.cpu(), - // ); - // appeared_tokens_ndarray.copyFrom(appearedTokens); - // this.tvm.applyRepetitionPenalty( - // this.logitsOnCPU, - // appeared_tokens_ndarray, - // repetition_penalty, - // ); - // this.tvm.endScope(); - // } - // // console.log("After applying penalties (CPU):", this.logitsOnCPU.toArray().slice(0, 20)); - - // if (this.logitsOnCPUCopy) { - // console.log("logitsOnCPUCopy shape:", this.logitsOnCPUCopy.shape); - // const logitsOnCPUArray = this.logitsOnCPU.toArray(); - // const logitsOnCPUCopyArray = this.logitsOnCPUCopy.toArray(); - // let flag = true; - // for (let i = 0; i < logitsOnCPUArray.length; i++) { - // if (Math.abs(logitsOnCPUArray[i] - logitsOnCPUCopyArray[i]) > 1e-6) { - // console.error(`Mismatch at index ${i}: CPU=${logitsOnCPUArray[i]}, GPU=${logitsOnCPUCopyArray[i]}`); - // flag = false; - // } - // } - // if (!flag) { - // throw new Error("Logits on CPU and GPU do not match within tolerance"); - // } - // } // 4. 
Sample token from logits // If logprobs, need the actual distribution via softmax, otherwise directly sample from logits @@ -1345,25 +1239,6 @@ export class LLMChatPipeline { // Inplace transform logitsOnCPU to a distribution temperature = Math.max(1e-6, temperature); // to prevent division by zero - if (this.logitsOnCPU.shape[2] !== this.fullVocabSize) { - throw new Error("Logits vocab size does not match full vocab size"); - } - - // this.tvm.beginScope(); - // const testArray = new Float32Array(this.fullVocabSize); - // for (let i = 0; i < this.fullVocabSize; i++) { - // testArray[i] = Math.random() * 10; - // } - // console.log("Test array:", testArray.slice(0, 20)); - // const detachedTestArrayOnGPU = this.tvm.detachFromCurrentScope( - // this.tvm.empty([1, 1, this.fullVocabSize], "float32", this.device).copyFrom(testArray) - // ); - // await this.device.sync(); - // const detachedTestArrayOnCPU = this.tvm.detachFromCurrentScope( - // this.tvm.empty([testArray.length], "float32", this.tvm.cpu()).copyFrom(testArray) - // ); - // this.tvm.endScope(); - const numSeqs = 1; const numTokens = this.appearedTokensFreq.size; @@ -1379,12 +1254,6 @@ export class LLMChatPipeline { .empty([numSeqs, 1, this.fullVocabSize], "float32", this.device) .copyFrom(this.logitsOnCPU.toArray()); - // const detachedTestArrayOnGPUCopy = this.tvm - // .empty(detachedTestArrayOnGPU.shape, detachedTestArrayOnGPU.dtype, this.tvm.cpu()) - // .copyFrom(detachedTestArrayOnGPU); - - // await this.device.sync(); - const probs = this.fsoftmaxWithTemperature( logitsOnGPU, temperaturesArray, @@ -1404,22 +1273,6 @@ export class LLMChatPipeline { this.getTokenLogprob(sampledToken, top_logprobs!), ); } - - // if (numTokens > 0 && this.logitsOnCPUCopy) { - // console.log("logitsOnCPUCopy shape:", this.logitsOnCPUCopy.shape); - // const logitsOnCPUArray = this.logitsOnCPU.toArray(); - // const logitsOnCPUCopyArray = this.logitsOnCPUCopy.toArray(); - // let flag = true; - // for (let i = 0; i < logitsOnCPUArray.length; i++) { - // if (Math.abs(logitsOnCPUArray[i] - logitsOnCPUCopyArray[i]) > 1e-6) { - // console.error(`Mismatch at index ${i}: CPU=${logitsOnCPUArray[i]}, GPU=${logitsOnCPUCopyArray[i]}`); - // flag = false; - // } - // } - // if (!flag) { - // throw new Error("Logits on CPU and GPU do not match within tolerance"); - // } - // } } else { // temperature being 0 is allowed here, equivalent to argmax sampledToken = this.tvm.sampleTopPFromLogits( From dee9d19e438afe290ca6d0a3a81584e42cdc84a7 Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Fri, 6 Jun 2025 22:59:04 -0400 Subject: [PATCH 03/14] remove unnecessary loops --- src/llm_chat.ts | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) diff --git a/src/llm_chat.ts b/src/llm_chat.ts index 3b3be448..2c7d62a6 100644 --- a/src/llm_chat.ts +++ b/src/llm_chat.ts @@ -53,8 +53,6 @@ export class LLMChatPipeline { private fapplyPenalty: tvmjs.PackedFunc; private fapplyLogitBias: tvmjs.PackedFunc; private fsoftmaxWithTemperature: tvmjs.PackedFunc; - // private frenormalizeByTopP: tvmjs.PackedFunc; //BatchRenormalizeProbsByTopP - // private //BatchSampleTokensImpl, ChunkSampleTokensImpl // Functions related to PagedKVCache private fclearKVCaches: tvmjs.PackedFunc; @@ -68,7 +66,6 @@ export class LLMChatPipeline { private params: tvmjs.TVMObject; private kvCache: tvmjs.TVMObject; private logitsOnCPU?: tvmjs.NDArray = undefined; - private logitsOnCPUCopy?: tvmjs.NDArray = undefined; private 
filledKVCacheLength = 0; // meta data @@ -318,7 +315,6 @@ export class LLMChatPipeline { this.kvCache.dispose(); this.fclearKVCaches.dispose(); this.logitsOnCPU?.dispose(); - this.logitsOnCPUCopy?.dispose(); this.tvm.dispose(); this.tokenizer.dispose(); this.xgTokenizerInfo?.dispose(); @@ -974,20 +970,6 @@ export class LLMChatPipeline { return this.logitsOnCPU; } - private updateLogitsOnCPUCopy(logits: tvmjs.NDArray): tvmjs.NDArray { - if (this.logitsOnCPUCopy == undefined) { - this.logitsOnCPUCopy = this.tvm.detachFromCurrentScope( - this.tvm.empty(logits.shape, logits.dtype, this.tvm.cpu()), - ); - } else { - if (logits.shape[0] != this.logitsOnCPUCopy.shape[0]) { - throw Error("We expect the size of logits to remain unchanged"); - } - } - this.logitsOnCPUCopy.copyFrom(logits); - return this.logitsOnCPUCopy; - } - private async sampleTokenFromLogits( logitsOnGPU: tvmjs.NDArray, genConfig?: GenerationConfig, @@ -1126,14 +1108,15 @@ export class LLMChatPipeline { if (_hasValue(logit_bias)) { this.tvm.beginScope(); const numTokens = Object.keys(logit_bias ?? {}).length; - const pos2seq_id = new Int32Array(numTokens); + const pos2seq_id = new Int32Array(numTokens).fill(0); const tokenIds = new Int32Array(numTokens); const tokenLogitBias = new Float32Array(numTokens); + const logitBiasKeys = Object.keys(logit_bias ?? {}); for (let index = 0; index < numTokens; index++) { - pos2seq_id[index] = 0; - tokenIds[index] = parseInt(Object.keys(logit_bias ?? {})[index]); - tokenLogitBias[index] = logit_bias![tokenIds[index]]; + const tokenId = parseInt(logitBiasKeys[index]); + tokenIds[index] = tokenId; + tokenLogitBias[index] = logit_bias![tokenId]; } const pos2seqIdsArray = this.tvm @@ -1191,11 +1174,8 @@ export class LLMChatPipeline { const paddedPenalties = new Float32Array(3); paddedPenalties.set(penalties); - for (let index = 0; index < numTokens; index++) { - pos2seq_id[index] = 0; - tokenIds[index] = appearedTokens[index]; - tokenCnt[index] = appearedTokensFreqs[index]; - } + tokenIds.set(appearedTokens); + tokenCnt.set(appearedTokensFreqs); const pos2seqIdsArray = this.tvm .empty([numTokens], "int32", this.device) From 10ed01079bc6fd7c91d9b4d69ce7af46fc2065cf Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Thu, 28 Aug 2025 17:26:13 -0400 Subject: [PATCH 04/14] Remove unnecessary GPU-CPU copies and fix scope issues --- src/llm_chat.ts | 190 ++++++++++++++++++++++++------------------------ 1 file changed, 95 insertions(+), 95 deletions(-) diff --git a/src/llm_chat.ts b/src/llm_chat.ts index 2c7d62a6..4272cb71 100644 --- a/src/llm_chat.ts +++ b/src/llm_chat.ts @@ -1085,66 +1085,60 @@ export class LLMChatPipeline { this.tvm.endScope(); } - // 1. Move logits to CPU - this.tvm.beginScope(); - this.updateLogitsOnCPU(logitsOnGPU); - this.tvm.endScope(); - await this.device.sync(); - - if (this.logitsOnCPU == undefined) { - throw Error("logits should be assigned"); - } + // 1. Post process logits via logitProcessor and/or logit_bias + if (this.logitProcessor !== undefined) { + // Move logits to CPU + this.tvm.beginScope(); + this.updateLogitsOnCPU(logitsOnGPU); + this.tvm.endScope(); + await this.device.sync(); - // 2. 
Post process logits via logitProcessor and/or logit_bias - if (this.logitProcessor !== undefined || _hasValue(logit_bias)) { + if (this.logitsOnCPU == undefined) { + throw Error("logits should be assigned"); + } let logitsOnCPUArray: Float32Array = ( this.logitsOnCPU.toArray() ); - const vocab_size = logitsOnCPUArray.length; - if (this.logitProcessor !== undefined) { - logitsOnCPUArray = this.logitProcessor.processLogits(logitsOnCPUArray); + logitsOnCPUArray = this.logitProcessor.processLogits(logitsOnCPUArray); + logitsOnGPU.copyFrom(logitsOnCPUArray); + this.logitsOnCPU.copyFrom(logitsOnCPUArray); + } + + if (_hasValue(logit_bias)) { + const numTokens = Object.keys(logit_bias ?? {}).length; + const pos2seq_id = new Int32Array(numTokens).fill(0); + const tokenIds = new Int32Array(numTokens); + const tokenLogitBias = new Float32Array(numTokens); + + const logitBiasKeys = Object.keys(logit_bias ?? {}); + for (let index = 0; index < numTokens; index++) { + const tokenId = parseInt(logitBiasKeys[index]); + tokenIds[index] = tokenId; + tokenLogitBias[index] = logit_bias![tokenId]; } - if (_hasValue(logit_bias)) { - this.tvm.beginScope(); - const numTokens = Object.keys(logit_bias ?? {}).length; - const pos2seq_id = new Int32Array(numTokens).fill(0); - const tokenIds = new Int32Array(numTokens); - const tokenLogitBias = new Float32Array(numTokens); - - const logitBiasKeys = Object.keys(logit_bias ?? {}); - for (let index = 0; index < numTokens; index++) { - const tokenId = parseInt(logitBiasKeys[index]); - tokenIds[index] = tokenId; - tokenLogitBias[index] = logit_bias![tokenId]; - } + this.tvm.beginScope(); - const pos2seqIdsArray = this.tvm - .empty([numTokens], "int32", this.device) - .copyFrom(pos2seq_id); + const pos2seqIdsArray = this.tvm + .empty([numTokens], "int32", this.device) + .copyFrom(pos2seq_id); - const tokenIdsArray = this.tvm - .empty([numTokens], "int32", this.device) - .copyFrom(tokenIds); + const tokenIdsArray = this.tvm + .empty([numTokens], "int32", this.device) + .copyFrom(tokenIds); - const tokenLogitBiasArray = this.tvm - .empty([numTokens], "float32", this.device) - .copyFrom(tokenLogitBias); + const tokenLogitBiasArray = this.tvm + .empty([numTokens], "float32", this.device) + .copyFrom(tokenLogitBias); - const logitsOnGPU = this.tvm - .empty([1, this.fullVocabSize], "float32", this.device) - .copyFrom(logitsOnCPUArray); + this.fapplyLogitBias( + logitsOnGPU.view([1, this.fullVocabSize]), + pos2seqIdsArray, + tokenIdsArray, + tokenLogitBiasArray, + ); - this.fapplyLogitBias( - logitsOnGPU, - pos2seqIdsArray, - tokenIdsArray, - tokenLogitBiasArray, - ); - this.updateLogitsOnCPU(logitsOnGPU); - this.tvm.endScope(); - } - await this.device.sync(); + this.tvm.endScope(); } // 3. 
Apply penalties to logits @@ -1153,64 +1147,57 @@ export class LLMChatPipeline { presence_penalty != 0.0 || repetition_penalty != 1.0 ) { - this.tvm.beginScope(); const appearedTokens = [...this.appearedTokensFreq.keys()]; const appearedTokensFreqs = [...this.appearedTokensFreq.values()]; const numTokens = appearedTokens.length; - const seqIdsArray = this.tvm - .empty([1], "int32", this.device) - .copyFrom([0]); + if (numTokens > 0) { + const pos2seq_id = new Int32Array(numTokens).fill(0); + const tokenIds = new Int32Array(numTokens).fill(0); + const tokenCnt = new Int32Array(numTokens).fill(0); + const penalties = new Float32Array([ + presence_penalty, + frequency_penalty, + repetition_penalty, + ]); - const pos2seq_id = new Int32Array(numTokens).fill(0); - const tokenIds = new Int32Array(numTokens).fill(0); - const tokenCnt = new Int32Array(numTokens).fill(0); - const penalties = new Float32Array([ - presence_penalty, - frequency_penalty, - repetition_penalty, - ]); - const paddedPenalties = new Float32Array(3); - paddedPenalties.set(penalties); - - tokenIds.set(appearedTokens); - tokenCnt.set(appearedTokensFreqs); + tokenIds.set(appearedTokens); + tokenCnt.set(appearedTokensFreqs); - const pos2seqIdsArray = this.tvm - .empty([numTokens], "int32", this.device) - .copyFrom(pos2seq_id); + this.tvm.beginScope(); + const seqIdsArray = this.tvm + .empty([1], "int32", this.device) + .copyFrom([0]); - const tokenIdsArray = this.tvm - .empty([numTokens], "int32", this.device) - .copyFrom(tokenIds); + const pos2seqIdsArray = this.tvm + .empty([numTokens], "int32", this.device) + .copyFrom(pos2seq_id); - const tokenCntArray = this.tvm - .empty([numTokens], "int32", this.device) - .copyFrom(tokenCnt); + const tokenIdsArray = this.tvm + .empty([numTokens], "int32", this.device) + .copyFrom(tokenIds); - const penaltiesArray = this.tvm - .empty([1, 3], "float32", this.device) - .copyFrom(paddedPenalties); + const tokenCntArray = this.tvm + .empty([numTokens], "int32", this.device) + .copyFrom(tokenCnt); - const logitsOnGPU = this.tvm - .empty([1, this.fullVocabSize], "float32", this.device) - .copyFrom(this.logitsOnCPU.toArray()); + const penaltiesArray = this.tvm + .empty([1, 3], "float32", this.device) + .copyFrom(penalties); - if (numTokens > 0) { this.fapplyPenalty( - logitsOnGPU, + logitsOnGPU.view([1, this.fullVocabSize]), seqIdsArray, pos2seqIdsArray, tokenIdsArray, tokenCntArray, penaltiesArray, ); + + this.tvm.endScope(); } - this.updateLogitsOnCPU(logitsOnGPU); - this.tvm.endScope(); } - await this.device.sync(); // 4. 
Sample token from logits // If logprobs, need the actual distribution via softmax, otherwise directly sample from logits @@ -1230,33 +1217,46 @@ export class LLMChatPipeline { .empty([numSeqs], "float32", this.device) .copyFrom(temperatures); - const logitsOnGPU = this.tvm - .empty([numSeqs, 1, this.fullVocabSize], "float32", this.device) - .copyFrom(this.logitsOnCPU.toArray()); - const probs = this.fsoftmaxWithTemperature( - logitsOnGPU, + logitsOnGPU.view([numSeqs, 1, this.fullVocabSize]), temperaturesArray, ); this.updateLogitsOnCPU(probs); this.tvm.endScope(); await this.device.sync(); - sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU, top_p); + // sampledToken = this.fsampleWithTopP( + // probs.view([numSeqs, 1, this.fullVocabSize]), + // top_p, + // top_logprobs, + // this.fullVocabSize, + // this.appearedTokensFreq, + // this.tokenLogprobArray, + // ) + + sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU!, top_p); this.tokenLogprobArray.push( this.getTokenLogprob(sampledToken, top_logprobs!), ); } else { - this.tvm.applySoftmaxWithTemperature(this.logitsOnCPU, temperature); - sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU, top_p); + this.tvm.beginScope(); + this.updateLogitsOnCPU(logitsOnGPU); + this.tvm.endScope(); + await this.device.sync(); + this.tvm.applySoftmaxWithTemperature(this.logitsOnCPU!, temperature); + sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU!, top_p); this.tokenLogprobArray.push( this.getTokenLogprob(sampledToken, top_logprobs!), ); } } else { // temperature being 0 is allowed here, equivalent to argmax + this.tvm.beginScope(); + this.updateLogitsOnCPU(logitsOnGPU); + this.tvm.endScope(); + await this.device.sync(); sampledToken = this.tvm.sampleTopPFromLogits( - this.logitsOnCPU, + this.logitsOnCPU!, temperature, top_p, ); From 2f21df77e936ecc598b0aca917ec0e9f04924186 Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Fri, 5 Sep 2025 19:08:19 -0400 Subject: [PATCH 05/14] Update comments and remove unnecessary control logic for token sampling --- src/llm_chat.ts | 67 +++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 39 deletions(-) diff --git a/src/llm_chat.ts b/src/llm_chat.ts index 4272cb71..66c866f6 100644 --- a/src/llm_chat.ts +++ b/src/llm_chat.ts @@ -1085,7 +1085,7 @@ export class LLMChatPipeline { this.tvm.endScope(); } - // 1. Post process logits via logitProcessor and/or logit_bias + // 1. Apply logitProcessor on CPU if (this.logitProcessor !== undefined) { // Move logits to CPU this.tvm.beginScope(); @@ -1104,6 +1104,7 @@ export class LLMChatPipeline { this.logitsOnCPU.copyFrom(logitsOnCPUArray); } + // 2. Apply logit_bias on GPU if (_hasValue(logit_bias)) { const numTokens = Object.keys(logit_bias ?? 
{}).length; const pos2seq_id = new Int32Array(numTokens).fill(0); @@ -1209,46 +1210,34 @@ export class LLMChatPipeline { const numSeqs = 1; const numTokens = this.appearedTokensFreq.size; - if (numTokens > 0) { - const temperatures = new Float32Array([temperature]); + const temperatures = new Float32Array([temperature]); - this.tvm.beginScope(); - const temperaturesArray = this.tvm - .empty([numSeqs], "float32", this.device) - .copyFrom(temperatures); + this.tvm.beginScope(); + const temperaturesArray = this.tvm + .empty([numSeqs], "float32", this.device) + .copyFrom(temperatures); - const probs = this.fsoftmaxWithTemperature( - logitsOnGPU.view([numSeqs, 1, this.fullVocabSize]), - temperaturesArray, - ); - this.updateLogitsOnCPU(probs); - this.tvm.endScope(); - await this.device.sync(); - - // sampledToken = this.fsampleWithTopP( - // probs.view([numSeqs, 1, this.fullVocabSize]), - // top_p, - // top_logprobs, - // this.fullVocabSize, - // this.appearedTokensFreq, - // this.tokenLogprobArray, - // ) - - sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU!, top_p); - this.tokenLogprobArray.push( - this.getTokenLogprob(sampledToken, top_logprobs!), - ); - } else { - this.tvm.beginScope(); - this.updateLogitsOnCPU(logitsOnGPU); - this.tvm.endScope(); - await this.device.sync(); - this.tvm.applySoftmaxWithTemperature(this.logitsOnCPU!, temperature); - sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU!, top_p); - this.tokenLogprobArray.push( - this.getTokenLogprob(sampledToken, top_logprobs!), - ); - } + const probs = this.fsoftmaxWithTemperature( + logitsOnGPU.view([numSeqs, 1, this.fullVocabSize]), + temperaturesArray, + ); + this.updateLogitsOnCPU(probs); + this.tvm.endScope(); + await this.device.sync(); + + // sampledToken = this.fsampleWithTopP( + // probs.view([numSeqs, 1, this.fullVocabSize]), + // top_p, + // top_logprobs, + // this.fullVocabSize, + // this.appearedTokensFreq, + // this.tokenLogprobArray, + // ) + + sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU!, top_p); + this.tokenLogprobArray.push( + this.getTokenLogprob(sampledToken, top_logprobs!), + ); } else { // temperature being 0 is allowed here, equivalent to argmax this.tvm.beginScope(); From 7975304d9ce0d3cd1bbc66f0b54d20c7cbe1cc99 Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Tue, 9 Sep 2025 17:19:38 -0400 Subject: [PATCH 06/14] Fix bug with using provided repetition penalty value and add support for including sampling latency breakdown in response --- src/config.ts | 3 ++- src/engine.ts | 23 +++++++++++++++++++++ src/openai_api_protocols/chat_completion.ts | 21 ++++++++++++++++++- src/openai_api_protocols/completion.ts | 18 ++++++++++++++++ src/types.ts | 9 ++++++++ 5 files changed, 72 insertions(+), 2 deletions(-) diff --git a/src/config.ts b/src/config.ts index dfeb1913..0ca51eba 100644 --- a/src/config.ts +++ b/src/config.ts @@ -126,7 +126,7 @@ export interface MLCEngineConfig { */ export interface GenerationConfig { // Only used in MLC - repetition_penalty?: number; + repetition_penalty?: number | null; ignore_eos?: boolean; // Shared by MLC and OpenAI APIs top_p?: number | null; @@ -143,6 +143,7 @@ export interface GenerationConfig { response_format?: ResponseFormat | null; // extra_body in ChatCompletionsRequest enable_thinking?: boolean | null; + enable_latency_breakdown?: boolean | null; } export function postInitAndCheckGenerationConfigValues( diff --git a/src/engine.ts b/src/engine.ts index fe4c427e..6609f3e5 100644 --- 
a/src/engine.ts +++ b/src/engine.ts @@ -41,6 +41,7 @@ import { MLCEngineInterface, LogitProcessor, LogLevel, + LatencyBreakdown, } from "./types"; import { compareConversationObject, @@ -694,12 +695,18 @@ export class MLCEngine implements MLCEngineInterface { const decode_time = pipeline.getCurRoundDecodingTotalTime(); const grammar_per_token_s = pipeline.getCurRoundGrammarPerTokenTotalTime(); + const latencyBreakdown: LatencyBreakdown = + pipeline.getCurRoundLatencyBreakdown(); + const defaultExtra = { e2e_latency_s: (Date.now() - timeReceived) / 1000, prefill_tokens_per_s: prefill_tokens_per_s, decode_tokens_per_s: decode_tokens_per_s, time_to_first_token_s: prefill_time, time_per_output_token_s: decode_time / completion_tokens, + latencyBreakdown: request.extra_body?.enable_latency_breakdown + ? latencyBreakdown + : undefined, }; const usage: CompletionUsage = { completion_tokens: completion_tokens, @@ -783,6 +790,7 @@ export class MLCEngine implements MLCEngineInterface { const genConfig: GenerationConfig = { frequency_penalty: request.frequency_penalty, presence_penalty: request.presence_penalty, + repetition_penalty: request.repetition_penalty, max_tokens: request.max_tokens, stop: request.stop, top_p: request.top_p, @@ -793,6 +801,7 @@ export class MLCEngine implements MLCEngineInterface { response_format: request.response_format, ignore_eos: request.ignore_eos, enable_thinking: request.extra_body?.enable_thinking, + enable_latency_breakdown: request.extra_body?.enable_latency_breakdown, }; // 0.5 Block wait until this pipeline finishes all previous requests @@ -890,12 +899,19 @@ export class MLCEngine implements MLCEngineInterface { "response_format" in request && (request.response_format?.type === "grammar" || request.response_format?.type === "json_object"); + + const latencyBreakdown: LatencyBreakdown = + selectedPipeline.getCurRoundLatencyBreakdown(); + const defaultExtra = { e2e_latency_s: (Date.now() - timeReceived) / 1000, prefill_tokens_per_s: prompt_tokens / prefill_time, decode_tokens_per_s: completion_tokens / decode_time, time_to_first_token_s: prefill_time, time_per_output_token_s: decode_time / completion_tokens, + latencyBreakdown: request.extra_body?.enable_latency_breakdown + ? latencyBreakdown + : undefined, }; const response: ChatCompletion = { id: crypto.randomUUID(), @@ -958,6 +974,7 @@ export class MLCEngine implements MLCEngineInterface { const genConfig: GenerationConfig = { frequency_penalty: request.frequency_penalty, presence_penalty: request.presence_penalty, + repetition_penalty: request.repetition_penalty, max_tokens: request.max_tokens, stop: request.stop, top_p: request.top_p, @@ -1030,6 +1047,9 @@ export class MLCEngine implements MLCEngineInterface { decode_time += selectedPipeline.getCurRoundDecodingTotalTime(); } + const latencyBreakdown: LatencyBreakdown = + selectedPipeline.getCurRoundLatencyBreakdown(); + const response: Completion = { id: crypto.randomUUID(), choices: choices, @@ -1046,6 +1066,9 @@ export class MLCEngine implements MLCEngineInterface { decode_tokens_per_s: completion_tokens / decode_time, time_to_first_token_s: prefill_time, time_per_output_token_s: decode_time / completion_tokens, + latencyBreakdown: request.extra_body?.enable_latency_breakdown + ? 
latencyBreakdown + : undefined, }, } as CompletionUsage, }; diff --git a/src/openai_api_protocols/chat_completion.ts b/src/openai_api_protocols/chat_completion.ts index 5a5229ed..e1ee031a 100644 --- a/src/openai_api_protocols/chat_completion.ts +++ b/src/openai_api_protocols/chat_completion.ts @@ -15,7 +15,7 @@ * limitations under the License. */ -import { MLCEngineInterface } from "../types"; +import { MLCEngineInterface, LatencyBreakdown } from "../types"; import { functionCallingModelIds, MessagePlaceholders, @@ -125,6 +125,13 @@ export interface ChatCompletionRequestBase { */ presence_penalty?: number | null; + /** + * Number greater than or equal to 1.0. Values greater than 1.0 discourage + * the model from repeating tokens that have already been generated. Repetition + * penalty is like presence penalty but is multiplicative. + */ + repetition_penalty?: number | null; + /** * The maximum number of [tokens](/tokenizer) that can be generated in the chat * completion. @@ -268,6 +275,12 @@ export interface ChatCompletionRequestBase { * @note Currently only allowed to be used for Qwen3 models, though not explicitly checked. */ enable_thinking?: boolean | null; + + /** + * If set to true, the response will include a breakdown of the time spent in various + * stages of token sampling. + */ + enable_latency_breakdown?: boolean | null; }; } @@ -980,6 +993,12 @@ export interface CompletionUsage { * structured output. If n > 1, it is the average over all choices. */ grammar_per_token_s?: number; + + /** + * If `enable_latency_breakdown` is set to true in the request, this field will be + * present and contain a breakdown of the time spent in various stages of token sampling. + */ + latencyBreakdown?: LatencyBreakdown; }; } diff --git a/src/openai_api_protocols/completion.ts b/src/openai_api_protocols/completion.ts index fb6aa458..54fcc34f 100644 --- a/src/openai_api_protocols/completion.ts +++ b/src/openai_api_protocols/completion.ts @@ -137,6 +137,13 @@ export interface CompletionCreateParamsBase { */ presence_penalty?: number | null; + /** + * Number greater than or equal to 1.0. Values greater than 1.0 discourage + * the model from repeating tokens that have already been generated. Repetition + * penalty is like presence penalty but is multiplicative. + */ + repetition_penalty?: number | null; + /** * If specified, our system will make a best effort to sample deterministically, * such that repeated requests with the same `seed` and parameters should return @@ -225,6 +232,17 @@ export interface CompletionCreateParamsBase { * @note This field is not supported. */ best_of?: number | null; + + /** + * Fields specific to WebLLM, not present in OpenAI. + */ + extra_body?: { + /** + * If set to true, the response will include a breakdown of the time spent in various + * stages of token sampling. 
+ */ + enable_latency_breakdown?: boolean | null; + }; } export type CompletionCreateParams = diff --git a/src/types.ts b/src/types.ts index 4d4522c0..ed79af57 100644 --- a/src/types.ts +++ b/src/types.ts @@ -251,3 +251,12 @@ export const LOG_LEVELS = { SILENT: 5, }; export type LogLevel = keyof typeof LOG_LEVELS; + +export type LatencyBreakdown = { + logitProcessorTime: number[]; + logitBiasTime: number[]; + penaltyTime: number[]; + sampleTime: number[]; + totalTime: number[]; + grammarBitmaskTime: number[]; +}; From b78e4080dc6410c57ad278813324cf7b181f3c02 Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Tue, 9 Sep 2025 17:21:57 -0400 Subject: [PATCH 07/14] Update comments and add timing for sampling steps --- src/llm_chat.ts | 93 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 82 insertions(+), 11 deletions(-) diff --git a/src/llm_chat.ts b/src/llm_chat.ts index 66c866f6..7266b5e4 100644 --- a/src/llm_chat.ts +++ b/src/llm_chat.ts @@ -6,7 +6,7 @@ import log from "loglevel"; import { Tokenizer } from "@mlc-ai/web-tokenizers"; import { ChatConfig, GenerationConfig, Role } from "./config"; import { getConversation, Conversation } from "./conversation"; -import { LogitProcessor } from "./types"; +import { LogitProcessor, LatencyBreakdown } from "./types"; import { getChunkedPrefillInputData, getImageDataFromURL, @@ -102,6 +102,16 @@ export class LLMChatPipeline { private curRoundDecodingTotalTime = 0; private curRoundPrefillTotalTime = 0; + // additional stats, reset at every prefillStep() + public curRoundLatencyBreakdown: LatencyBreakdown = { + logitProcessorTime: [], + logitBiasTime: [], + penaltyTime: [], + sampleTime: [], + totalTime: [], + grammarBitmaskTime: [], + }; + // LogitProcessor private logitProcessor?: LogitProcessor = undefined; @@ -434,6 +444,13 @@ export class LLMChatPipeline { return this.curRoundGrammarPerTokenTotalTime; } + /** + * @returns the breakdown of latencies for sampling each token for a single request. + */ + getCurRoundLatencyBreakdown(): LatencyBreakdown { + return this.curRoundLatencyBreakdown; + } + /** * @returns Runtime stats information. */ @@ -527,6 +544,16 @@ export class LLMChatPipeline { this.curRoundDecodingTotalTime = 0; this.curRoundGrammarInitTotalTime = 0; this.curRoundGrammarPerTokenTotalTime = 0; + + this.curRoundLatencyBreakdown = { + logitProcessorTime: [], + logitBiasTime: [], + penaltyTime: [], + sampleTime: [], + totalTime: [], + grammarBitmaskTime: [], + }; + this.stopTriggered = false; const conversation = this.conversation; @@ -1049,11 +1076,15 @@ export class LLMChatPipeline { throw new RangeError("presence_penalty", -2.0, 2.0); } + const outputTokenBegin = performance.now(); + // 0. Update logitsOnGPU with on-GPU grammar bitmasking if ( response_format?.type === "json_object" || response_format?.type === "grammar" ) { + const grammarBitmaskBegin = performance.now(); + this.tvm.beginScope(); if (this.grammarMatcher === undefined) { throw Error("Expect grammar matcher to be initialized."); @@ -1083,6 +1114,15 @@ export class LLMChatPipeline { bitMaskOnGPU, ); this.tvm.endScope(); + + if (genConfig?.enable_latency_breakdown) { + const grammarBitmaskEnd = performance.now(); + const grammarBitmaskTimeSpent = + (grammarBitmaskEnd - grammarBitmaskBegin) / 1e3; + this.curRoundLatencyBreakdown.grammarBitmaskTime.push( + grammarBitmaskTimeSpent, + ); + } } // 1. 
Apply logitProcessor on CPU @@ -1093,6 +1133,8 @@ export class LLMChatPipeline { this.tvm.endScope(); await this.device.sync(); + const logitProcessorBegin = performance.now(); + if (this.logitsOnCPU == undefined) { throw Error("logits should be assigned"); } @@ -1102,10 +1144,21 @@ export class LLMChatPipeline { logitsOnCPUArray = this.logitProcessor.processLogits(logitsOnCPUArray); logitsOnGPU.copyFrom(logitsOnCPUArray); this.logitsOnCPU.copyFrom(logitsOnCPUArray); + + if (genConfig?.enable_latency_breakdown) { + const logitProcessorEnd = performance.now(); + const logitProcessorTimeSpent = + (logitProcessorEnd - logitProcessorBegin) / 1e3; + this.curRoundLatencyBreakdown.logitProcessorTime.push( + logitProcessorTimeSpent, + ); + } } // 2. Apply logit_bias on GPU if (_hasValue(logit_bias)) { + const logitBiasBegin = performance.now(); + const numTokens = Object.keys(logit_bias ?? {}).length; const pos2seq_id = new Int32Array(numTokens).fill(0); const tokenIds = new Int32Array(numTokens); @@ -1140,9 +1193,15 @@ export class LLMChatPipeline { ); this.tvm.endScope(); + + if (genConfig?.enable_latency_breakdown) { + const logitBiasEnd = performance.now(); + const logitBiasTimeSpent = (logitBiasEnd - logitBiasBegin) / 1e3; + this.curRoundLatencyBreakdown.logitBiasTime.push(logitBiasTimeSpent); + } } - // 3. Apply penalties to logits + // 3. Apply penalties to logits on GPU if ( frequency_penalty != 0.0 || presence_penalty != 0.0 || @@ -1154,6 +1213,8 @@ export class LLMChatPipeline { const numTokens = appearedTokens.length; if (numTokens > 0) { + const penaltyBegin = performance.now(); + const pos2seq_id = new Int32Array(numTokens).fill(0); const tokenIds = new Int32Array(numTokens).fill(0); const tokenCnt = new Int32Array(numTokens).fill(0); @@ -1197,11 +1258,18 @@ export class LLMChatPipeline { ); this.tvm.endScope(); + + if (genConfig?.enable_latency_breakdown) { + const penaltyEnd = performance.now(); + const penaltyTimeSpent = (penaltyEnd - penaltyBegin) / 1e3; + this.curRoundLatencyBreakdown.penaltyTime.push(penaltyTimeSpent); + } } } // 4. Sample token from logits // If logprobs, need the actual distribution via softmax, otherwise directly sample from logits + const sampleBegin = performance.now(); let sampledToken: number; if (logprobs) { // Inplace transform logitsOnCPU to a distribution @@ -1225,15 +1293,6 @@ export class LLMChatPipeline { this.tvm.endScope(); await this.device.sync(); - // sampledToken = this.fsampleWithTopP( - // probs.view([numSeqs, 1, this.fullVocabSize]), - // top_p, - // top_logprobs, - // this.fullVocabSize, - // this.appearedTokensFreq, - // this.tokenLogprobArray, - // ) - sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU!, top_p); this.tokenLogprobArray.push( this.getTokenLogprob(sampledToken, top_logprobs!), @@ -1251,6 +1310,12 @@ export class LLMChatPipeline { ); } + if (genConfig?.enable_latency_breakdown) { + const sampleEnd = performance.now(); + const sampleTimeSpent = (sampleEnd - sampleBegin) / 1e3; + this.curRoundLatencyBreakdown.sampleTime.push(sampleTimeSpent); + } + // 5. 
Update logit processor this.logitProcessor?.processSampledToken(sampledToken); @@ -1271,6 +1336,12 @@ export class LLMChatPipeline { } } + if (genConfig?.enable_latency_breakdown) { + const outputTokenEnd = performance.now(); + const outputTokenTimeSpent = (outputTokenEnd - outputTokenBegin) / 1e3; + this.curRoundLatencyBreakdown.totalTime.push(outputTokenTimeSpent); + } + return sampledToken; } From 8cc787e13826cea7aae0a1afb279a0222abe371a Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Tue, 9 Sep 2025 18:04:41 -0400 Subject: [PATCH 08/14] Add sanity checks and latency breakdown examples --- .../get-started-latency-breakdown/README.md | 15 ++ .../package.json | 20 ++ .../src/get_started_latency_breakdown.html | 23 +++ .../src/get_started_latency_breakdown.ts | 164 +++++++++++++++ examples/sanity-checks/README.md | 14 ++ examples/sanity-checks/package.json | 20 ++ examples/sanity-checks/src/sanity_checks.html | 35 ++++ examples/sanity-checks/src/sanity_checks.ts | 187 ++++++++++++++++++ 8 files changed, 478 insertions(+) create mode 100644 examples/get-started-latency-breakdown/README.md create mode 100644 examples/get-started-latency-breakdown/package.json create mode 100644 examples/get-started-latency-breakdown/src/get_started_latency_breakdown.html create mode 100644 examples/get-started-latency-breakdown/src/get_started_latency_breakdown.ts create mode 100644 examples/sanity-checks/README.md create mode 100644 examples/sanity-checks/package.json create mode 100644 examples/sanity-checks/src/sanity_checks.html create mode 100644 examples/sanity-checks/src/sanity_checks.ts diff --git a/examples/get-started-latency-breakdown/README.md b/examples/get-started-latency-breakdown/README.md new file mode 100644 index 00000000..0a74ea26 --- /dev/null +++ b/examples/get-started-latency-breakdown/README.md @@ -0,0 +1,15 @@ +# WebLLM Get Started App + +This folder provides a minimum demo to show WebLLM API in a webapp setting with +collection of latency statistics for individual token sampling steps. +To try it out, you can do the following steps under this folder + +```bash +npm install +npm start +``` + +Note if you would like to hack WebLLM core package. +You can change web-llm dependencies as `"file:../.."`, and follow the build from source +instruction in the project to build webllm locally. This option is only recommended +if you would like to hack WebLLM core package. diff --git a/examples/get-started-latency-breakdown/package.json b/examples/get-started-latency-breakdown/package.json new file mode 100644 index 00000000..0b321e9d --- /dev/null +++ b/examples/get-started-latency-breakdown/package.json @@ -0,0 +1,20 @@ +{ + "name": "get-started-latency-breakdown", + "version": "0.1.0", + "private": true, + "scripts": { + "start": "parcel src/get_started_latency_breakdown.html --port 8888", + "build": "parcel build src/get_started_latency_breakdown.html --dist-dir lib" + }, + "devDependencies": { + "buffer": "^5.7.1", + "parcel": "^2.8.3", + "process": "^0.11.10", + "tslib": "^2.3.1", + "typescript": "^4.9.5", + "url": "^0.11.3" + }, + "dependencies": { + "@mlc-ai/web-llm": "^0.2.79" + } +} diff --git a/examples/get-started-latency-breakdown/src/get_started_latency_breakdown.html b/examples/get-started-latency-breakdown/src/get_started_latency_breakdown.html new file mode 100644 index 00000000..18298616 --- /dev/null +++ b/examples/get-started-latency-breakdown/src/get_started_latency_breakdown.html @@ -0,0 +1,23 @@ + + + + +

+    <h2>WebLLM Test Page</h2>
+    Open console to see output
+    <br />
+    <br />
+    <label id="init-label"> </label>
+
+    <h3>Prompt</h3>
+    <label id="prompt-label"> </label>
+
+    <h3>Response</h3>
+    <label id="generate-label"> </label>
+    <br />
+ + + + + diff --git a/examples/get-started-latency-breakdown/src/get_started_latency_breakdown.ts b/examples/get-started-latency-breakdown/src/get_started_latency_breakdown.ts new file mode 100644 index 00000000..26d9565e --- /dev/null +++ b/examples/get-started-latency-breakdown/src/get_started_latency_breakdown.ts @@ -0,0 +1,164 @@ +import * as webllm from "@mlc-ai/web-llm"; + +function setLabel(id: string, text: string) { + const label = document.getElementById(id); + if (label == null) { + throw Error("Cannot find label " + id); + } + label.innerText = text; +} + +type LatencyBreakdown = { + logitProcessorTime: number[]; + logitBiasTime: number[]; + penaltyTime: number[]; + sampleTime: number[]; + totalTime: number[]; + grammarBitmaskTime: number[]; +}; +function computeStats( + latency_breakdown: LatencyBreakdown, +): Record { + function _computeStats(arr: number[]) { + if (!arr.length) return undefined; + const sorted = [...arr].sort((a, b) => a - b); + const sum = arr.reduce((a, b) => a + b, 0); + const avg = sum / arr.length; + const min = sorted[0]; + const max = sorted[sorted.length - 1]; + const p99 = sorted[Math.floor(0.99 * (sorted.length - 1))]; + return { avg, min, max, p99 }; + } + + const latencyStats: Record = {}; + for (const key of Object.keys(latency_breakdown)) { + const arr = (latency_breakdown as any)[key]; + if (Array.isArray(arr) && arr.length > 0) { + latencyStats[key] = _computeStats(arr); + } + } + return latencyStats; +} + +async function main() { + const initProgressCallback = (report: webllm.InitProgressReport) => { + setLabel("init-label", report.text); + }; + // Option 1: If we do not specify appConfig, we use `prebuiltAppConfig` defined in `config.ts` + // const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC"; + const selectedModel = "Qwen3-0.6B-q0f32-MLC"; + const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine( + selectedModel, + { + initProgressCallback: initProgressCallback, + logLevel: "INFO", // specify the log level + }, + // customize kv cache, use either context_window_size or sliding_window_size (with attention sink) + { + context_window_size: 2048, + // sliding_window_size: 1024, + // attention_sink_size: 4, + }, + ); + + // Option 2: Specify your own model other than the prebuilt ones + // const appConfig: webllm.AppConfig = { + // model_list: [ + // { + // model: "https://huggingface.co/mlc-ai/Llama-3.1-8B-Instruct-q4f32_1-MLC", + // model_id: "Llama-3.1-8B-Instruct-q4f32_1-MLC", + // model_lib: + // webllm.modelLibURLPrefix + + // webllm.modelVersion + + // "/Llama-3_1-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm", + // overrides: { + // context_window_size: 2048, + // }, + // }, + // ], + // }; + // const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine( + // selectedModel, + // { appConfig: appConfig, initProgressCallback: initProgressCallback }, + // ); + + // Option 3: Instantiate MLCEngine() and call reload() separately + // const engine: webllm.MLCEngineInterface = new webllm.MLCEngine({ + // appConfig: appConfig, // if do not specify, we use webllm.prebuiltAppConfig + // initProgressCallback: initProgressCallback, + // }); + // await engine.reload(selectedModel); + + const latencyBreakdown: LatencyBreakdown = { + logitProcessorTime: [], + logitBiasTime: [], + penaltyTime: [], + sampleTime: [], + totalTime: [], + grammarBitmaskTime: [], + }; + // want decode_tokens_per_s, e2e_latency_s, time_per_output_token_s, completion_tokens + const decodeTokensPerS: number[] = []; + const completionTokens: 
number[] = []; + const e2eLatencyS: number[] = []; + const timePerOutputTokenS: number[] = []; + + const numTrials = 20; + for (let i = 0; i < numTrials; i++) { + console.log(`Trial ${i + 1} / ${numTrials}`); + const reply0 = await engine.chat.completions.create({ + messages: [{ role: "user", content: "List twenty US states." }], + // below configurations are all optional + n: 1, + temperature: 0, + max_tokens: 2048, + // 46510 and 7188 are "California", and 8421 and 51325 are "Texas" in Llama-3.1-8B-Instruct + // So we would have a higher chance of seeing the latter two, but never the first in the answer + // logit_bias: { + // "46510": -100, + // "7188": -100, + // "8421": 5, + // "41325": 5, + // }, + top_p: 0.8, + logprobs: true, + top_logprobs: 2, + frequency_penalty: 1.2, + presence_penalty: 1.0, + repetition_penalty: 1.1, + }); + + const logitProcessorTime = + reply0.usage?.extra.latencyBreakdown?.logitProcessorTime; + const logitBiasTime = reply0.usage?.extra.latencyBreakdown?.logitBiasTime; + const penaltyTime = reply0.usage?.extra.latencyBreakdown?.penaltyTime; + const sampleTime = reply0.usage?.extra.latencyBreakdown?.sampleTime; + const totalTime = reply0.usage?.extra.latencyBreakdown?.totalTime; + const grammarBitmaskTime = + reply0.usage?.extra.latencyBreakdown?.grammarBitmaskTime; + + latencyBreakdown.logitProcessorTime.push(...(logitProcessorTime || [])); + latencyBreakdown.logitBiasTime.push(...(logitBiasTime || [])); + latencyBreakdown.penaltyTime.push(...(penaltyTime || [])); + latencyBreakdown.sampleTime.push(...(sampleTime || [])); + latencyBreakdown.totalTime.push(...(totalTime || [])); + latencyBreakdown.grammarBitmaskTime.push(...(grammarBitmaskTime || [])); + + decodeTokensPerS.push(reply0.usage?.extra.decode_tokens_per_s || 0); + e2eLatencyS.push(reply0.usage?.extra.e2e_latency_s || 0); + timePerOutputTokenS.push(reply0.usage?.extra.time_per_output_token_s || 0); + completionTokens.push(reply0.usage?.completion_tokens || 0); + } + + const latencyStats: { [key: string]: number } = + computeStats(latencyBreakdown); + console.log("Latency stats: ", latencyStats); + console.log("Decode tokens per second: ", decodeTokensPerS); + console.log("Completion tokens: ", completionTokens); + console.log("E2E latency (s): ", e2eLatencyS); + console.log("Time per output token (s): ", timePerOutputTokenS); + + // To change model, either create a new engine via `CreateMLCEngine()`, or call `engine.reload(modelId)` +} + +main(); diff --git a/examples/sanity-checks/README.md b/examples/sanity-checks/README.md new file mode 100644 index 00000000..8b65a916 --- /dev/null +++ b/examples/sanity-checks/README.md @@ -0,0 +1,14 @@ +# Sanity Checks for Generated Output + +This folder provides simple sanity checks on the output generated +using WebLLM. To try it out, you can do the following steps under this folder + +```bash +npm install +npm start +``` + +Note if you would like to hack WebLLM core package. +You can change web-llm dependencies as `"file:../.."`, and follow the build from source +instruction in the project to build webllm locally. This option is only recommended +if you would like to hack WebLLM core package. 
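For context on what these checks assert, here is a minimal, hypothetical sketch (not part of this patch; `logProbOfToken` is an illustrative name, not a WebLLM API) of the softmax/logprob reasoning the tests below rely on: when one token's logit is pushed far above the rest, e.g. by a logit processor or a large `logit_bias`, softmax assigns that token probability ≈ 1, so its reported top logprob should be ≈ 0.

```ts
// Illustrative only: numerically stable log-softmax for a single token id.
function logProbOfToken(logits: number[], tokenId: number): number {
  const maxLogit = Math.max(...logits);
  const logSumExp =
    maxLogit + Math.log(logits.reduce((s, l) => s + Math.exp(l - maxLogit), 0));
  return logits[tokenId] - logSumExp;
}

// A four-token toy vocabulary where token 0 has been biased by +100:
console.log(logProbOfToken([100, 0, -1, 2], 0)); // ≈ 0, i.e. probability ≈ 1
```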
diff --git a/examples/sanity-checks/package.json b/examples/sanity-checks/package.json new file mode 100644 index 00000000..b40e6464 --- /dev/null +++ b/examples/sanity-checks/package.json @@ -0,0 +1,20 @@ +{ + "name": "sanity-checks", + "version": "0.1.0", + "private": true, + "scripts": { + "start": "parcel src/sanity_checks.html --port 8889", + "build": "parcel build src/sanity_checks.html --dist-dir lib" + }, + "devDependencies": { + "buffer": "^5.7.1", + "parcel": "^2.8.3", + "process": "^0.11.10", + "tslib": "^2.3.1", + "typescript": "^4.9.5", + "url": "^0.11.3" + }, + "dependencies": { + "@mlc-ai/web-llm": "^0.2.79" + } +} diff --git a/examples/sanity-checks/src/sanity_checks.html b/examples/sanity-checks/src/sanity_checks.html new file mode 100644 index 00000000..bfbfc869 --- /dev/null +++ b/examples/sanity-checks/src/sanity_checks.html @@ -0,0 +1,35 @@ + + + + + + GPU sampleTokenFromLogits Tests + + + +

+      <h2>GPU sampleTokenFromLogits Tests</h2>
+
+      <p>Overall: <span id="gpu-test-label">Not started.</span></p>
+
+      <p>Logit Processor: <span id="logit-processor-label"></span></p>
+
+      <p>Logit Bias: <span id="logit-bias-label"></span></p>
+
+      <p>Penalties: <span id="penalty-label"></span></p>
+
+      <p>Logprobs: <span id="logprobs-label"></span></p>
+ + + diff --git a/examples/sanity-checks/src/sanity_checks.ts b/examples/sanity-checks/src/sanity_checks.ts new file mode 100644 index 00000000..b0f51d1b --- /dev/null +++ b/examples/sanity-checks/src/sanity_checks.ts @@ -0,0 +1,187 @@ +import * as webllm from "@mlc-ai/web-llm"; + +function setLabel(id: string, text: string) { + const label = document.getElementById(id); + if (label == null) return; + label.innerText = text; +} + +async function createEngine( + modelId: string, + appConfig: webllm.AppConfig, + logitProcessorRegistry?: Map, +) { + return await webllm.CreateMLCEngine(modelId, { + appConfig, + logLevel: "ERROR", + logitProcessorRegistry, + }); +} + +async function deleteModel(modelId: string, appConfig: webllm.AppConfig) { + await webllm.deleteModelAllInfoInCache(modelId, appConfig); +} + +async function testLogitProcessor( + modelId: string, + appConfig: webllm.AppConfig, +) { + // Set up a logit processor that sets logits[0] = 100.0, rest -100.0 + const logitProcessor = { + processLogits: (logits: Float32Array) => { + logits.fill(-100.0); + logits[0] = 100.0; + return logits; + }, + processSampledToken: () => {}, + resetState: () => {}, + }; + const logitProcessorRegistry: Map = new Map(); + logitProcessorRegistry.set(modelId, logitProcessor); + const engine: webllm.MLCEngineInterface = await createEngine( + modelId, + appConfig, + logitProcessorRegistry, + ); + + const prompt = "Test logit processor."; + const reply: webllm.ChatCompletion = await engine.chat.completions.create({ + messages: [{ role: "user", content: prompt }], + temperature: 1.0, + max_tokens: 20, + logprobs: true, + top_logprobs: 1, + }); + const logprobs = reply.choices[0]?.logprobs; + const logprobsAllZero = !!( + logprobs && + Array.isArray(logprobs.content) && + logprobs.content.every( + (lp: webllm.ChatCompletionTokenLogprob) => + lp.top_logprobs[0].logprob === 0, + ) + ); + + console.log(`[LogitProcessor] Logprobs all zero: ${logprobsAllZero}`); + setLabel("logit-processor-label", `Logprobs all zero: ${logprobsAllZero}`); + await deleteModel(modelId, appConfig); + return logprobsAllZero; +} + +async function testLogitBias(modelId: string, appConfig: webllm.AppConfig) { + // Set logit_bias to strongly favor token 0 + const prompt = "Test logit bias."; + // const t0 = performance.now(); + const engine: webllm.MLCEngineInterface = await createEngine( + modelId, + appConfig, + ); + const reply = await engine.chat.completions.create({ + messages: [{ role: "user", content: prompt }], + temperature: 1.0, + max_tokens: 20, + logprobs: true, + top_logprobs: 1, + logit_bias: { "0": 100.0 }, + }); + const logprobs = reply.choices[0]?.logprobs; + const logprobsAllZero = !!( + logprobs && + Array.isArray(logprobs.content) && + logprobs.content.every( + (lp: webllm.ChatCompletionTokenLogprob) => + lp.top_logprobs[0].logprob === 0, + ) + ); + + console.log(`[LogitBias] Logprobs all zero: ${logprobsAllZero}`); + setLabel("logit-bias-label", `Logprobs all zero: ${logprobsAllZero}`); + await deleteModel(modelId, appConfig); + return logprobsAllZero; +} + +async function testPenalties(modelId: string, appConfig: webllm.AppConfig) { + const prompt = "Test presence and frequency penalties."; + const engine: webllm.MLCEngineInterface = await createEngine( + modelId, + appConfig, + ); + const reply = await engine.chat.completions.create({ + messages: [{ role: "user", content: prompt }], + temperature: 1.0, + max_tokens: 256, + presence_penalty: 2.0, + frequency_penalty: 2.0, + logit_bias: { "0": 100.0 }, + logprobs: 
true, + }); + const logprobs = reply.choices[0]?.logprobs; + const logprobsNotAllZero = !logprobs?.content?.every( + (lp: webllm.ChatCompletionTokenLogprob) => lp.logprob === 0, + ); + console.log(`[Penalties] Logprobs not all zero: ${logprobsNotAllZero}`); + setLabel("penalty-label", `Logprobs not all zero: ${logprobsNotAllZero}`); + await deleteModel(modelId, appConfig); + return logprobsNotAllZero; +} + +async function testLogprobs(modelId: string, appConfig: webllm.AppConfig) { + // Test logprobs: check that logprobs are returned and sum to ~1 after exp + const prompt = "Test logprobs."; + const t0 = performance.now(); + const engine: webllm.MLCEngineInterface = await createEngine( + modelId, + appConfig, + ); + const reply = await engine.chat.completions.create({ + messages: [{ role: "user", content: prompt }], + temperature: 1.0, + max_tokens: 20, + logprobs: true, + top_logprobs: 5, + }); + const t1 = performance.now(); + const logprobs = reply.choices[0]?.logprobs; + + let logprobsAllCloseTo1 = true; + for (const lp of logprobs?.content || []) { + const expSum = lp.top_logprobs?.reduce( + (acc: number, val: webllm.TopLogprob) => acc + Math.exp(val.logprob), + 0, + ); + logprobsAllCloseTo1 &&= Math.abs(expSum - 1.0) < 0.1; + } + console.log(`[Logprobs] Logprobs all close to 1: ${logprobsAllCloseTo1}`); + setLabel("logprobs-label", `Logprobs all close to 1: ${logprobsAllCloseTo1}`); + await deleteModel(modelId, appConfig); + return logprobsAllCloseTo1; +} + +async function main() { + const modelId = "Qwen3-0.6B-q0f32-MLC"; + const appConfig = webllm.prebuiltAppConfig; + appConfig.useIndexedDBCache = true; + setLabel("gpu-test-label", "Running tests..."); + let passed = 0, + total = 0; + + if (await testLogitProcessor(modelId, appConfig)) passed++; + total++; + if (await testLogitBias(modelId, appConfig)) passed++; + total++; + if (await testPenalties(modelId, appConfig)) passed++; + total++; + if (await testLogprobs(modelId, appConfig)) passed++; + total++; + + setLabel( + "gpu-test-label", + `GPU sampleTokenFromLogits tests: ${passed}/${total} passed.`, + ); + setLabel( + "gpu-test-label", + `Tests complete. Model deleted. ${passed}/${total} passed.`, + ); +} + +main(); From d7147365c0c76e0d9c44d01c1ee7794f3388aec2 Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Tue, 9 Sep 2025 23:57:17 -0400 Subject: [PATCH 09/14] Update documentation for repetition_penalty --- src/openai_api_protocols/chat_completion.ts | 6 +++--- src/openai_api_protocols/completion.ts | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/openai_api_protocols/chat_completion.ts b/src/openai_api_protocols/chat_completion.ts index e1ee031a..e50352d6 100644 --- a/src/openai_api_protocols/chat_completion.ts +++ b/src/openai_api_protocols/chat_completion.ts @@ -126,9 +126,9 @@ export interface ChatCompletionRequestBase { presence_penalty?: number | null; /** - * Number greater than or equal to 1.0. Values greater than 1.0 discourage - * the model from repeating tokens that have already been generated. Repetition - * penalty is like presence penalty but is multiplicative. + * Penalizes new tokens based on whether they appear in the prompt and the + * generated text so far. Values greater than 1.0 encourage the model to use new + * tokens, while values less than 1.0 encourage the model to repeat tokens. 
*/ repetition_penalty?: number | null; diff --git a/src/openai_api_protocols/completion.ts b/src/openai_api_protocols/completion.ts index 54fcc34f..0534fe93 100644 --- a/src/openai_api_protocols/completion.ts +++ b/src/openai_api_protocols/completion.ts @@ -138,9 +138,9 @@ export interface CompletionCreateParamsBase { presence_penalty?: number | null; /** - * Number greater than or equal to 1.0. Values greater than 1.0 discourage - * the model from repeating tokens that have already been generated. Repetition - * penalty is like presence penalty but is multiplicative. + * Penalizes new tokens based on whether they appear in the prompt and the + * generated text so far. Values greater than 1.0 encourage the model to use new + * tokens, while values less than 1.0 encourage the model to repeat tokens. */ repetition_penalty?: number | null; From 391254a8907b5e19911a2afae4f2b51fe47b6669 Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Sat, 13 Sep 2025 02:08:06 -0400 Subject: [PATCH 10/14] Clean up code and rename sanity-checks --- .gitignore | 1 - .../src/get_started_latency_breakdown.ts | 31 +------------------ .../scripts/sanity_checks}/README.md | 0 .../scripts/sanity_checks}/package.json | 6 ++-- .../scripts/sanity_checks}/sanity_checks.html | 0 .../scripts/sanity_checks}/sanity_checks.ts | 3 -- 6 files changed, 4 insertions(+), 37 deletions(-) rename examples/{sanity-checks => tests/scripts/sanity_checks}/README.md (100%) rename examples/{sanity-checks => tests/scripts/sanity_checks}/package.json (66%) rename examples/{sanity-checks/src => tests/scripts/sanity_checks}/sanity_checks.html (100%) rename examples/{sanity-checks/src => tests/scripts/sanity_checks}/sanity_checks.ts (98%) diff --git a/.gitignore b/.gitignore index 8de96b40..adc767f8 100644 --- a/.gitignore +++ b/.gitignore @@ -324,5 +324,4 @@ node_modules lib .parcel-cache -examples/tests **/.next \ No newline at end of file diff --git a/examples/get-started-latency-breakdown/src/get_started_latency_breakdown.ts b/examples/get-started-latency-breakdown/src/get_started_latency_breakdown.ts index 26d9565e..104af5da 100644 --- a/examples/get-started-latency-breakdown/src/get_started_latency_breakdown.ts +++ b/examples/get-started-latency-breakdown/src/get_started_latency_breakdown.ts @@ -45,7 +45,6 @@ async function main() { setLabel("init-label", report.text); }; // Option 1: If we do not specify appConfig, we use `prebuiltAppConfig` defined in `config.ts` - // const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC"; const selectedModel = "Qwen3-0.6B-q0f32-MLC"; const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine( selectedModel, @@ -61,34 +60,6 @@ async function main() { }, ); - // Option 2: Specify your own model other than the prebuilt ones - // const appConfig: webllm.AppConfig = { - // model_list: [ - // { - // model: "https://huggingface.co/mlc-ai/Llama-3.1-8B-Instruct-q4f32_1-MLC", - // model_id: "Llama-3.1-8B-Instruct-q4f32_1-MLC", - // model_lib: - // webllm.modelLibURLPrefix + - // webllm.modelVersion + - // "/Llama-3_1-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm", - // overrides: { - // context_window_size: 2048, - // }, - // }, - // ], - // }; - // const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine( - // selectedModel, - // { appConfig: appConfig, initProgressCallback: initProgressCallback }, - // ); - - // Option 3: Instantiate MLCEngine() and call reload() separately - // const engine: webllm.MLCEngineInterface = new webllm.MLCEngine({ - 
// appConfig: appConfig, // if do not specify, we use webllm.prebuiltAppConfig - // initProgressCallback: initProgressCallback, - // }); - // await engine.reload(selectedModel); - const latencyBreakdown: LatencyBreakdown = { logitProcessorTime: [], logitBiasTime: [], @@ -97,7 +68,7 @@ async function main() { totalTime: [], grammarBitmaskTime: [], }; - // want decode_tokens_per_s, e2e_latency_s, time_per_output_token_s, completion_tokens + const decodeTokensPerS: number[] = []; const completionTokens: number[] = []; const e2eLatencyS: number[] = []; diff --git a/examples/sanity-checks/README.md b/examples/tests/scripts/sanity_checks/README.md similarity index 100% rename from examples/sanity-checks/README.md rename to examples/tests/scripts/sanity_checks/README.md diff --git a/examples/sanity-checks/package.json b/examples/tests/scripts/sanity_checks/package.json similarity index 66% rename from examples/sanity-checks/package.json rename to examples/tests/scripts/sanity_checks/package.json index b40e6464..aefb0f05 100644 --- a/examples/sanity-checks/package.json +++ b/examples/tests/scripts/sanity_checks/package.json @@ -1,10 +1,10 @@ { - "name": "sanity-checks", + "name": "tests", "version": "0.1.0", "private": true, "scripts": { - "start": "parcel src/sanity_checks.html --port 8889", - "build": "parcel build src/sanity_checks.html --dist-dir lib" + "start": "parcel sanity_checks.html --port 8889", + "build": "parcel build sanity_checks.html --dist-dir lib" }, "devDependencies": { "buffer": "^5.7.1", diff --git a/examples/sanity-checks/src/sanity_checks.html b/examples/tests/scripts/sanity_checks/sanity_checks.html similarity index 100% rename from examples/sanity-checks/src/sanity_checks.html rename to examples/tests/scripts/sanity_checks/sanity_checks.html diff --git a/examples/sanity-checks/src/sanity_checks.ts b/examples/tests/scripts/sanity_checks/sanity_checks.ts similarity index 98% rename from examples/sanity-checks/src/sanity_checks.ts rename to examples/tests/scripts/sanity_checks/sanity_checks.ts index b0f51d1b..da842353 100644 --- a/examples/sanity-checks/src/sanity_checks.ts +++ b/examples/tests/scripts/sanity_checks/sanity_checks.ts @@ -71,7 +71,6 @@ async function testLogitProcessor( async function testLogitBias(modelId: string, appConfig: webllm.AppConfig) { // Set logit_bias to strongly favor token 0 const prompt = "Test logit bias."; - // const t0 = performance.now(); const engine: webllm.MLCEngineInterface = await createEngine( modelId, appConfig, @@ -128,7 +127,6 @@ async function testPenalties(modelId: string, appConfig: webllm.AppConfig) { async function testLogprobs(modelId: string, appConfig: webllm.AppConfig) { // Test logprobs: check that logprobs are returned and sum to ~1 after exp const prompt = "Test logprobs."; - const t0 = performance.now(); const engine: webllm.MLCEngineInterface = await createEngine( modelId, appConfig, @@ -140,7 +138,6 @@ async function testLogprobs(modelId: string, appConfig: webllm.AppConfig) { logprobs: true, top_logprobs: 5, }); - const t1 = performance.now(); const logprobs = reply.choices[0]?.logprobs; let logprobsAllCloseTo1 = true; From aa2422d9dc123a2767a9823323ef59ced567e84b Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Sat, 13 Sep 2025 02:12:40 -0400 Subject: [PATCH 11/14] Update sanity_checks package.json --- examples/tests/scripts/sanity_checks/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tests/scripts/sanity_checks/package.json 
b/examples/tests/scripts/sanity_checks/package.json index aefb0f05..cf86fb3d 100644 --- a/examples/tests/scripts/sanity_checks/package.json +++ b/examples/tests/scripts/sanity_checks/package.json @@ -1,5 +1,5 @@ { - "name": "tests", + "name": "sanity_checks", "version": "0.1.0", "private": true, "scripts": { From 891dc78768186d421ba0f8f2530fe1f54c1de1f3 Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Sat, 13 Sep 2025 02:20:21 -0400 Subject: [PATCH 12/14] Use Prettier code style --- .../get-started-latency-breakdown/README.md | 2 +- .../scripts/sanity_checks/sanity_checks.html | 42 ++++++++++++------- src/llm_chat.ts | 1 - 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/examples/get-started-latency-breakdown/README.md b/examples/get-started-latency-breakdown/README.md index 0a74ea26..2a8f6967 100644 --- a/examples/get-started-latency-breakdown/README.md +++ b/examples/get-started-latency-breakdown/README.md @@ -1,6 +1,6 @@ # WebLLM Get Started App -This folder provides a minimum demo to show WebLLM API in a webapp setting with +This folder provides a minimum demo to show WebLLM API in a webapp setting with collection of latency statistics for individual token sampling steps. To try it out, you can do the following steps under this folder diff --git a/examples/tests/scripts/sanity_checks/sanity_checks.html b/examples/tests/scripts/sanity_checks/sanity_checks.html index bfbfc869..2b662f71 100644 --- a/examples/tests/scripts/sanity_checks/sanity_checks.html +++ b/examples/tests/scripts/sanity_checks/sanity_checks.html @@ -1,17 +1,31 @@ - + - + GPU sampleTokenFromLogits Tests - - + +

[sanity_checks.html re-indented by Prettier across two hunks; the "GPU sampleTokenFromLogits Tests" heading and the Overall, Logit Processor, Logit Bias, Penalties, and Logprobs status labels are unchanged in content.]
- + diff --git a/src/llm_chat.ts b/src/llm_chat.ts index 7266b5e4..8f7620a8 100644 --- a/src/llm_chat.ts +++ b/src/llm_chat.ts @@ -1276,7 +1276,6 @@ export class LLMChatPipeline { temperature = Math.max(1e-6, temperature); // to prevent division by zero const numSeqs = 1; - const numTokens = this.appearedTokensFreq.size; const temperatures = new Float32Array([temperature]); From ce95b079e761bf83886b21615c72793cc812ec5a Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Sat, 13 Sep 2025 02:30:15 -0400 Subject: [PATCH 13/14] Update .gitignore with package-lock and move sanity checks --- .gitignore | 1 + {examples/tests => tests}/scripts/sanity_checks/README.md | 0 {examples/tests => tests}/scripts/sanity_checks/package.json | 0 .../tests => tests}/scripts/sanity_checks/sanity_checks.html | 0 {examples/tests => tests}/scripts/sanity_checks/sanity_checks.ts | 0 5 files changed, 1 insertion(+) rename {examples/tests => tests}/scripts/sanity_checks/README.md (100%) rename {examples/tests => tests}/scripts/sanity_checks/package.json (100%) rename {examples/tests => tests}/scripts/sanity_checks/sanity_checks.html (100%) rename {examples/tests => tests}/scripts/sanity_checks/sanity_checks.ts (100%) diff --git a/.gitignore b/.gitignore index adc767f8..2dbbf58f 100644 --- a/.gitignore +++ b/.gitignore @@ -323,5 +323,6 @@ tvm_home node_modules lib .parcel-cache +package-lock.json **/.next \ No newline at end of file diff --git a/examples/tests/scripts/sanity_checks/README.md b/tests/scripts/sanity_checks/README.md similarity index 100% rename from examples/tests/scripts/sanity_checks/README.md rename to tests/scripts/sanity_checks/README.md diff --git a/examples/tests/scripts/sanity_checks/package.json b/tests/scripts/sanity_checks/package.json similarity index 100% rename from examples/tests/scripts/sanity_checks/package.json rename to tests/scripts/sanity_checks/package.json diff --git a/examples/tests/scripts/sanity_checks/sanity_checks.html b/tests/scripts/sanity_checks/sanity_checks.html similarity index 100% rename from examples/tests/scripts/sanity_checks/sanity_checks.html rename to tests/scripts/sanity_checks/sanity_checks.html diff --git a/examples/tests/scripts/sanity_checks/sanity_checks.ts b/tests/scripts/sanity_checks/sanity_checks.ts similarity index 100% rename from examples/tests/scripts/sanity_checks/sanity_checks.ts rename to tests/scripts/sanity_checks/sanity_checks.ts From fa4c80f35bc6b44e280f3560899279b7064f8277 Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Sat, 13 Sep 2025 02:35:24 -0400 Subject: [PATCH 14/14] Added tests-specific .gitignore --- .gitignore | 1 - tests/.gitignore | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 tests/.gitignore diff --git a/.gitignore b/.gitignore index 2dbbf58f..adc767f8 100644 --- a/.gitignore +++ b/.gitignore @@ -323,6 +323,5 @@ tvm_home node_modules lib .parcel-cache -package-lock.json **/.next \ No newline at end of file diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 00000000..d8b83df9 --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1 @@ +package-lock.json
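As a closing note on the `repetition_penalty` documentation updated in this series: the sketch below illustrates the multiplicative convention that wording describes, under the assumption of a simple per-token scheme over previously generated token ids (`applyRepetitionPenalty` is an illustrative name, not the kernel invoked by the GPU sampling path).

```ts
// Hedged sketch, assuming the common multiplicative convention: positive logits are
// divided by the penalty and negative logits are multiplied by it, so values > 1.0
// discourage tokens that already appeared and values < 1.0 encourage repeating them.
// (Presence and frequency penalties are additive adjustments by contrast.)
function applyRepetitionPenalty(
  logits: Float32Array,
  appearedTokenIds: Iterable<number>,
  repetitionPenalty: number, // e.g. 1.1
): void {
  for (const id of appearedTokenIds) {
    const v = logits[id];
    logits[id] = v > 0 ? v / repetitionPenalty : v * repetitionPenalty;
  }
}

// Example: penalize tokens 0 and 1 that already appeared in the output.
const logits = new Float32Array([2.0, -1.0, 0.5]);
applyRepetitionPenalty(logits, [0, 1], 1.1); // logits becomes ~[1.82, -1.1, 0.5]
```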