From 5fabf10a4ba37599b4e8f8b103d8f5ed6870dbdd Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Tue, 3 Jun 2025 21:16:13 -0400 Subject: [PATCH 01/14] replace calls in sampleTokenFromLogits with GPU kernels --- src/llm_chat.ts | 365 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 310 insertions(+), 55 deletions(-) diff --git a/src/llm_chat.ts b/src/llm_chat.ts index 5f8ecf00..8f052fab 100644 --- a/src/llm_chat.ts +++ b/src/llm_chat.ts @@ -50,6 +50,12 @@ export class LLMChatPipeline { private image_embed: tvmjs.PackedFunc | undefined; private embed: tvmjs.PackedFunc; private fapplyBitmask: tvmjs.PackedFunc; + private fapplyPenalty: tvmjs.PackedFunc; + private fapplyLogitBias: tvmjs.PackedFunc; + private fsoftmaxWithTemperature: tvmjs.PackedFunc; + // private frenormalizeByTopP: tvmjs.PackedFunc; //BatchRenormalizeProbsByTopP + // private //BatchSampleTokensImpl, ChunkSampleTokensImpl + // Functions related to PagedKVCache private fclearKVCaches: tvmjs.PackedFunc; private fKVCacheAddSequence: tvmjs.PackedFunc; @@ -62,6 +68,7 @@ export class LLMChatPipeline { private params: tvmjs.TVMObject; private kvCache: tvmjs.TVMObject; private logitsOnCPU?: tvmjs.NDArray = undefined; + private logitsOnCPUCopy?: tvmjs.NDArray = undefined; private filledKVCacheLength = 0; // meta data @@ -190,6 +197,15 @@ export class LLMChatPipeline { this.fapplyBitmask = this.tvm.detachFromCurrentScope( this.vm.getFunction("apply_bitmask_inplace"), ); + this.fapplyPenalty = this.tvm.detachFromCurrentScope( + this.vm.getFunction("apply_penalty_inplace"), + ); + this.fapplyLogitBias = this.tvm.detachFromCurrentScope( + this.vm.getFunction("apply_logit_bias_inplace"), + ); + this.fsoftmaxWithTemperature = this.tvm.detachFromCurrentScope( + this.vm.getFunction("softmax_with_temperature"), + ); try { this.image_embed = this.tvm.detachFromCurrentScope( this.vm.getFunction("image_embed"), @@ -302,6 +318,7 @@ export class LLMChatPipeline { this.kvCache.dispose(); this.fclearKVCaches.dispose(); this.logitsOnCPU?.dispose(); + this.logitsOnCPUCopy?.dispose(); this.tvm.dispose(); this.tokenizer.dispose(); this.xgTokenizerInfo?.dispose(); @@ -957,6 +974,20 @@ export class LLMChatPipeline { return this.logitsOnCPU; } + private updateLogitsOnCPUCopy(logits: tvmjs.NDArray): tvmjs.NDArray { + if (this.logitsOnCPUCopy == undefined) { + this.logitsOnCPUCopy = this.tvm.detachFromCurrentScope( + this.tvm.empty(logits.shape, logits.dtype, this.tvm.cpu()), + ); + } else { + if (logits.shape[0] != this.logitsOnCPUCopy.shape[0]) { + throw Error("We expect the size of logits to remain unchanged"); + } + } + this.logitsOnCPUCopy.copyFrom(logits); + return this.logitsOnCPUCopy; + } + private async sampleTokenFromLogits( logitsOnGPU: tvmjs.NDArray, genConfig?: GenerationConfig, @@ -1091,68 +1122,221 @@ export class LLMChatPipeline { if (this.logitProcessor !== undefined) { logitsOnCPUArray = this.logitProcessor.processLogits(logitsOnCPUArray); } + if (_hasValue(logit_bias)) { - for (const tokenID in logit_bias) { - const curBias = logit_bias[tokenID]; - const curTokenID = parseInt(tokenID); - if (curTokenID > vocab_size) { - throw Error( - "Token " + - curTokenID + - " in logit_bias exceeds vocab_size " + - vocab_size, - ); - } - logitsOnCPUArray[curTokenID] += curBias; + this.tvm.beginScope(); + const numTokens = Object.keys(logit_bias ?? 
{}).length; + const pos2seq_id = new Int32Array(numTokens); + const tokenIds = new Int32Array(numTokens); + const tokenLogitBias = new Float32Array(numTokens); + + for (let index = 0; index < numTokens; index++) { + pos2seq_id[index] = 0; + tokenIds[index] = parseInt(Object.keys(logit_bias ?? {})[index]); + tokenLogitBias[index] = logit_bias![tokenIds[index]]; } + + const pos2seqIdsArray = this.tvm + .empty([numTokens], "int32", this.device) + .copyFrom(pos2seq_id); + + const tokenIdsArray = this.tvm + .empty([numTokens], "int32", this.device) + .copyFrom(tokenIds); + + const tokenLogitBiasArray = this.tvm + .empty([numTokens], "float32", this.device) + .copyFrom(tokenLogitBias); + + const logitsOnGPU = this.tvm + .empty([1, this.fullVocabSize], "float32", this.device) + .copyFrom(logitsOnCPUArray); + + this.fapplyLogitBias( + logitsOnGPU, + pos2seqIdsArray, + tokenIdsArray, + tokenLogitBiasArray, + ); + this.updateLogitsOnCPU(logitsOnGPU); + this.tvm.endScope(); } - this.logitsOnCPU.copyFrom(logitsOnCPUArray); + await this.device.sync(); + // console.log("After applying logit bias (GPU):", this.logitsOnCPUCopy?.toArray().slice(0, 20)); + + // if (_hasValue(logit_bias)) { + // for (const tokenID in logit_bias) { + // const curBias = logit_bias[tokenID]; + // const curTokenID = parseInt(tokenID); + // if (curTokenID > vocab_size) { + // throw Error( + // "Token " + + // curTokenID + + // " in logit_bias exceeds vocab_size " + + // vocab_size, + // ); + // } + // logitsOnCPUArray[curTokenID] += curBias; + // } + // } + // this.logitsOnCPU.copyFrom(logitsOnCPUArray); + // console.log("After applying logit bias (CPU):", this.logitsOnCPU.toArray().slice(0, 20)); } + // if (JSON.stringify(this.logitsOnCPUCopy?.toArray()) !== JSON.stringify(this.logitsOnCPU.toArray())) { + // throw new Error("Logits on CPU and GPU do not match"); + // } + + // console.log("Penalties:", { + // frequency_penalty, + // presence_penalty, + // repetition_penalty, + // }); + // 3. Apply penalties to logits - if (_hasValue(frequency_penalty) && _hasValue(presence_penalty)) { - // 3.1. Use frequency and presence penalty + if ( + frequency_penalty != 0.0 || + presence_penalty != 0.0 || + repetition_penalty != 1.0 + ) { this.tvm.beginScope(); - // Both `keys()` and `values()` are in insertion order. const appearedTokens = [...this.appearedTokensFreq.keys()]; const appearedTokensFreqs = [...this.appearedTokensFreq.values()]; - const appeared_tokens_ndarray = this.tvm.empty( - [1, appearedTokens.length], - "int32", - this.tvm.cpu(), - ); - const appeared_tokens_freqs_ndarray = this.tvm.empty( - [1, appearedTokensFreqs.length], - "int32", - this.tvm.cpu(), - ); - appeared_tokens_ndarray.copyFrom(appearedTokens); - appeared_tokens_freqs_ndarray.copyFrom(appearedTokensFreqs); - this.tvm.applyPresenceAndFrequencyPenalty( - this.logitsOnCPU, - appeared_tokens_ndarray, - appeared_tokens_freqs_ndarray, - presence_penalty!, - frequency_penalty!, - ); - this.tvm.endScope(); - } else if (repetition_penalty != 1.0) { - // 3.2. 
Use repetition penalty - this.tvm.beginScope(); - const appearedTokens = [...this.appearedTokensFreq.keys()]; - const appeared_tokens_ndarray = this.tvm.empty( - [1, appearedTokens.length], - "int32", - this.tvm.cpu(), - ); - appeared_tokens_ndarray.copyFrom(appearedTokens); - this.tvm.applyRepetitionPenalty( - this.logitsOnCPU, - appeared_tokens_ndarray, + + const numTokens = appearedTokens.length; + // const paddedNumTokens = Math.ceil(numTokens / 4) * 4; + + const seqIdsArray = this.tvm + .empty([1], "int32", this.device) + .copyFrom([0]); + + const pos2seq_id = new Int32Array(numTokens).fill(0); + const tokenIds = new Int32Array(numTokens).fill(0); + const tokenCnt = new Int32Array(numTokens).fill(0); + const penalties = new Float32Array([ + presence_penalty, + frequency_penalty, repetition_penalty, - ); + ]); + const paddedPenalties = new Float32Array(3); + paddedPenalties.set(penalties); + + for (let index = 0; index < numTokens; index++) { + pos2seq_id[index] = 0; + tokenIds[index] = appearedTokens[index]; + tokenCnt[index] = appearedTokensFreqs[index]; + } + + const pos2seqIdsArray = this.tvm + .empty([numTokens], "int32", this.device) + .copyFrom(pos2seq_id); + + const tokenIdsArray = this.tvm + .empty([numTokens], "int32", this.device) + .copyFrom(tokenIds); + + const tokenCntArray = this.tvm + .empty([numTokens], "int32", this.device) + .copyFrom(tokenCnt); + + const penaltiesArray = this.tvm + .empty([1, 3], "float32", this.device) + .copyFrom(paddedPenalties); + + const logitsOnGPU = this.tvm + .empty([1, this.fullVocabSize], "float32", this.device) + .copyFrom(this.logitsOnCPU.toArray()); + + // console.log("logitsOnGPU shape:", logitsOnGPU.shape); + // console.log("seqIdsArray shape:", seqIdsArray.shape); + // console.log("pos2seqIdsArray shape:", pos2seqIdsArray.shape); + // console.log("tokenIdsArray shape:", tokenIdsArray.shape); + // console.log("tokenCntArray shape:", tokenCntArray.shape); + // console.log("penaltiesArray shape:", penaltiesArray.shape); + + if (numTokens > 0) { + this.fapplyPenalty( + logitsOnGPU, + seqIdsArray, + pos2seqIdsArray, + tokenIdsArray, + tokenCntArray, + penaltiesArray, + ); + } + this.updateLogitsOnCPU(logitsOnGPU); this.tvm.endScope(); } + await this.device.sync(); + // console.log("After applying penalties (GPU):", this.logitsOnCPUCopy?.toArray().slice(0, 20)); + + // if (_hasValue(frequency_penalty) && _hasValue(presence_penalty)) { + // // 3.1. Use frequency and presence penalty + // this.tvm.beginScope(); + // // Both `keys()` and `values()` are in insertion order. 
+ // const appearedTokens = [...this.appearedTokensFreq.keys()]; + // const appearedTokensFreqs = [...this.appearedTokensFreq.values()]; + // const appeared_tokens_ndarray = this.tvm.empty( + // [1, appearedTokens.length], + // "int32", + // this.tvm.cpu(), + // ); + // const appeared_tokens_freqs_ndarray = this.tvm.empty( + // [1, appearedTokensFreqs.length], + // "int32", + // this.tvm.cpu(), + // ); + // appeared_tokens_ndarray.copyFrom(appearedTokens); + // appeared_tokens_freqs_ndarray.copyFrom(appearedTokensFreqs); + // // let logitsOnCPUBefore = this.logitsOnCPU.toArray(); + // this.tvm.applyPresenceAndFrequencyPenalty( + // this.logitsOnCPU, + // appeared_tokens_ndarray, + // appeared_tokens_freqs_ndarray, + // presence_penalty!, + // frequency_penalty!, + // ); + // // if ( + // // JSON.stringify(logitsOnCPUBefore) === + // // JSON.stringify(this.logitsOnCPU.toArray()) + // // ) { + // // console.log("No penalty applied"); + // // } + // this.tvm.endScope(); + // } else if (repetition_penalty != 1.0) { + // // 3.2. Use repetition penalty + // this.tvm.beginScope(); + // const appearedTokens = [...this.appearedTokensFreq.keys()]; + // const appeared_tokens_ndarray = this.tvm.empty( + // [1, appearedTokens.length], + // "int32", + // this.tvm.cpu(), + // ); + // appeared_tokens_ndarray.copyFrom(appearedTokens); + // this.tvm.applyRepetitionPenalty( + // this.logitsOnCPU, + // appeared_tokens_ndarray, + // repetition_penalty, + // ); + // this.tvm.endScope(); + // } + // // console.log("After applying penalties (CPU):", this.logitsOnCPU.toArray().slice(0, 20)); + + // if (this.logitsOnCPUCopy) { + // console.log("logitsOnCPUCopy shape:", this.logitsOnCPUCopy.shape); + // const logitsOnCPUArray = this.logitsOnCPU.toArray(); + // const logitsOnCPUCopyArray = this.logitsOnCPUCopy.toArray(); + // let flag = true; + // for (let i = 0; i < logitsOnCPUArray.length; i++) { + // if (Math.abs(logitsOnCPUArray[i] - logitsOnCPUCopyArray[i]) > 1e-6) { + // console.error(`Mismatch at index ${i}: CPU=${logitsOnCPUArray[i]}, GPU=${logitsOnCPUCopyArray[i]}`); + // flag = false; + // } + // } + // if (!flag) { + // throw new Error("Logits on CPU and GPU do not match within tolerance"); + // } + // } // 4. 
Sample token from logits // If logprobs, need the actual distribution via softmax, otherwise directly sample from logits @@ -1160,11 +1344,82 @@ export class LLMChatPipeline { if (logprobs) { // Inplace transform logitsOnCPU to a distribution temperature = Math.max(1e-6, temperature); // to prevent division by zero - this.tvm.applySoftmaxWithTemperature(this.logitsOnCPU, temperature); - sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU, top_p); - this.tokenLogprobArray.push( - this.getTokenLogprob(sampledToken, top_logprobs!), - ); + + if (this.logitsOnCPU.shape[2] !== this.fullVocabSize) { + throw new Error("Logits vocab size does not match full vocab size"); + } + + // this.tvm.beginScope(); + // const testArray = new Float32Array(this.fullVocabSize); + // for (let i = 0; i < this.fullVocabSize; i++) { + // testArray[i] = Math.random() * 10; + // } + // console.log("Test array:", testArray.slice(0, 20)); + // const detachedTestArrayOnGPU = this.tvm.detachFromCurrentScope( + // this.tvm.empty([1, 1, this.fullVocabSize], "float32", this.device).copyFrom(testArray) + // ); + // await this.device.sync(); + // const detachedTestArrayOnCPU = this.tvm.detachFromCurrentScope( + // this.tvm.empty([testArray.length], "float32", this.tvm.cpu()).copyFrom(testArray) + // ); + // this.tvm.endScope(); + + const numSeqs = 1; + const numTokens = this.appearedTokensFreq.size; + + if (numTokens > 0) { + const temperatures = new Float32Array([temperature]); + + this.tvm.beginScope(); + const temperaturesArray = this.tvm + .empty([numSeqs], "float32", this.device) + .copyFrom(temperatures); + + const logitsOnGPU = this.tvm + .empty([numSeqs, 1, this.fullVocabSize], "float32", this.device) + .copyFrom(this.logitsOnCPU.toArray()); + + // const detachedTestArrayOnGPUCopy = this.tvm + // .empty(detachedTestArrayOnGPU.shape, detachedTestArrayOnGPU.dtype, this.tvm.cpu()) + // .copyFrom(detachedTestArrayOnGPU); + + // await this.device.sync(); + + const probs = this.fsoftmaxWithTemperature( + logitsOnGPU, + temperaturesArray, + ); + this.updateLogitsOnCPU(probs); + this.tvm.endScope(); + await this.device.sync(); + + sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU, top_p); + this.tokenLogprobArray.push( + this.getTokenLogprob(sampledToken, top_logprobs!), + ); + } else { + this.tvm.applySoftmaxWithTemperature(this.logitsOnCPU, temperature); + sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU, top_p); + this.tokenLogprobArray.push( + this.getTokenLogprob(sampledToken, top_logprobs!), + ); + } + + // if (numTokens > 0 && this.logitsOnCPUCopy) { + // console.log("logitsOnCPUCopy shape:", this.logitsOnCPUCopy.shape); + // const logitsOnCPUArray = this.logitsOnCPU.toArray(); + // const logitsOnCPUCopyArray = this.logitsOnCPUCopy.toArray(); + // let flag = true; + // for (let i = 0; i < logitsOnCPUArray.length; i++) { + // if (Math.abs(logitsOnCPUArray[i] - logitsOnCPUCopyArray[i]) > 1e-6) { + // console.error(`Mismatch at index ${i}: CPU=${logitsOnCPUArray[i]}, GPU=${logitsOnCPUCopyArray[i]}`); + // flag = false; + // } + // } + // if (!flag) { + // throw new Error("Logits on CPU and GPU do not match within tolerance"); + // } + // } } else { // temperature being 0 is allowed here, equivalent to argmax sampledToken = this.tvm.sampleTopPFromLogits( From 2777057780b16610eb11cf24a6f9cb1bc0958d64 Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Tue, 3 Jun 2025 21:28:52 -0400 Subject: [PATCH 02/14] remove debugging code --- src/llm_chat.ts | 147 
------------------------------------------------ 1 file changed, 147 deletions(-) diff --git a/src/llm_chat.ts b/src/llm_chat.ts index 8f052fab..3b3be448 100644 --- a/src/llm_chat.ts +++ b/src/llm_chat.ts @@ -1162,37 +1162,8 @@ export class LLMChatPipeline { this.tvm.endScope(); } await this.device.sync(); - // console.log("After applying logit bias (GPU):", this.logitsOnCPUCopy?.toArray().slice(0, 20)); - - // if (_hasValue(logit_bias)) { - // for (const tokenID in logit_bias) { - // const curBias = logit_bias[tokenID]; - // const curTokenID = parseInt(tokenID); - // if (curTokenID > vocab_size) { - // throw Error( - // "Token " + - // curTokenID + - // " in logit_bias exceeds vocab_size " + - // vocab_size, - // ); - // } - // logitsOnCPUArray[curTokenID] += curBias; - // } - // } - // this.logitsOnCPU.copyFrom(logitsOnCPUArray); - // console.log("After applying logit bias (CPU):", this.logitsOnCPU.toArray().slice(0, 20)); } - // if (JSON.stringify(this.logitsOnCPUCopy?.toArray()) !== JSON.stringify(this.logitsOnCPU.toArray())) { - // throw new Error("Logits on CPU and GPU do not match"); - // } - - // console.log("Penalties:", { - // frequency_penalty, - // presence_penalty, - // repetition_penalty, - // }); - // 3. Apply penalties to logits if ( frequency_penalty != 0.0 || @@ -1204,7 +1175,6 @@ export class LLMChatPipeline { const appearedTokensFreqs = [...this.appearedTokensFreq.values()]; const numTokens = appearedTokens.length; - // const paddedNumTokens = Math.ceil(numTokens / 4) * 4; const seqIdsArray = this.tvm .empty([1], "int32", this.device) @@ -1247,13 +1217,6 @@ export class LLMChatPipeline { .empty([1, this.fullVocabSize], "float32", this.device) .copyFrom(this.logitsOnCPU.toArray()); - // console.log("logitsOnGPU shape:", logitsOnGPU.shape); - // console.log("seqIdsArray shape:", seqIdsArray.shape); - // console.log("pos2seqIdsArray shape:", pos2seqIdsArray.shape); - // console.log("tokenIdsArray shape:", tokenIdsArray.shape); - // console.log("tokenCntArray shape:", tokenCntArray.shape); - // console.log("penaltiesArray shape:", penaltiesArray.shape); - if (numTokens > 0) { this.fapplyPenalty( logitsOnGPU, @@ -1268,75 +1231,6 @@ export class LLMChatPipeline { this.tvm.endScope(); } await this.device.sync(); - // console.log("After applying penalties (GPU):", this.logitsOnCPUCopy?.toArray().slice(0, 20)); - - // if (_hasValue(frequency_penalty) && _hasValue(presence_penalty)) { - // // 3.1. Use frequency and presence penalty - // this.tvm.beginScope(); - // // Both `keys()` and `values()` are in insertion order. 
- // const appearedTokens = [...this.appearedTokensFreq.keys()]; - // const appearedTokensFreqs = [...this.appearedTokensFreq.values()]; - // const appeared_tokens_ndarray = this.tvm.empty( - // [1, appearedTokens.length], - // "int32", - // this.tvm.cpu(), - // ); - // const appeared_tokens_freqs_ndarray = this.tvm.empty( - // [1, appearedTokensFreqs.length], - // "int32", - // this.tvm.cpu(), - // ); - // appeared_tokens_ndarray.copyFrom(appearedTokens); - // appeared_tokens_freqs_ndarray.copyFrom(appearedTokensFreqs); - // // let logitsOnCPUBefore = this.logitsOnCPU.toArray(); - // this.tvm.applyPresenceAndFrequencyPenalty( - // this.logitsOnCPU, - // appeared_tokens_ndarray, - // appeared_tokens_freqs_ndarray, - // presence_penalty!, - // frequency_penalty!, - // ); - // // if ( - // // JSON.stringify(logitsOnCPUBefore) === - // // JSON.stringify(this.logitsOnCPU.toArray()) - // // ) { - // // console.log("No penalty applied"); - // // } - // this.tvm.endScope(); - // } else if (repetition_penalty != 1.0) { - // // 3.2. Use repetition penalty - // this.tvm.beginScope(); - // const appearedTokens = [...this.appearedTokensFreq.keys()]; - // const appeared_tokens_ndarray = this.tvm.empty( - // [1, appearedTokens.length], - // "int32", - // this.tvm.cpu(), - // ); - // appeared_tokens_ndarray.copyFrom(appearedTokens); - // this.tvm.applyRepetitionPenalty( - // this.logitsOnCPU, - // appeared_tokens_ndarray, - // repetition_penalty, - // ); - // this.tvm.endScope(); - // } - // // console.log("After applying penalties (CPU):", this.logitsOnCPU.toArray().slice(0, 20)); - - // if (this.logitsOnCPUCopy) { - // console.log("logitsOnCPUCopy shape:", this.logitsOnCPUCopy.shape); - // const logitsOnCPUArray = this.logitsOnCPU.toArray(); - // const logitsOnCPUCopyArray = this.logitsOnCPUCopy.toArray(); - // let flag = true; - // for (let i = 0; i < logitsOnCPUArray.length; i++) { - // if (Math.abs(logitsOnCPUArray[i] - logitsOnCPUCopyArray[i]) > 1e-6) { - // console.error(`Mismatch at index ${i}: CPU=${logitsOnCPUArray[i]}, GPU=${logitsOnCPUCopyArray[i]}`); - // flag = false; - // } - // } - // if (!flag) { - // throw new Error("Logits on CPU and GPU do not match within tolerance"); - // } - // } // 4. 
Sample token from logits // If logprobs, need the actual distribution via softmax, otherwise directly sample from logits @@ -1345,25 +1239,6 @@ export class LLMChatPipeline { // Inplace transform logitsOnCPU to a distribution temperature = Math.max(1e-6, temperature); // to prevent division by zero - if (this.logitsOnCPU.shape[2] !== this.fullVocabSize) { - throw new Error("Logits vocab size does not match full vocab size"); - } - - // this.tvm.beginScope(); - // const testArray = new Float32Array(this.fullVocabSize); - // for (let i = 0; i < this.fullVocabSize; i++) { - // testArray[i] = Math.random() * 10; - // } - // console.log("Test array:", testArray.slice(0, 20)); - // const detachedTestArrayOnGPU = this.tvm.detachFromCurrentScope( - // this.tvm.empty([1, 1, this.fullVocabSize], "float32", this.device).copyFrom(testArray) - // ); - // await this.device.sync(); - // const detachedTestArrayOnCPU = this.tvm.detachFromCurrentScope( - // this.tvm.empty([testArray.length], "float32", this.tvm.cpu()).copyFrom(testArray) - // ); - // this.tvm.endScope(); - const numSeqs = 1; const numTokens = this.appearedTokensFreq.size; @@ -1379,12 +1254,6 @@ export class LLMChatPipeline { .empty([numSeqs, 1, this.fullVocabSize], "float32", this.device) .copyFrom(this.logitsOnCPU.toArray()); - // const detachedTestArrayOnGPUCopy = this.tvm - // .empty(detachedTestArrayOnGPU.shape, detachedTestArrayOnGPU.dtype, this.tvm.cpu()) - // .copyFrom(detachedTestArrayOnGPU); - - // await this.device.sync(); - const probs = this.fsoftmaxWithTemperature( logitsOnGPU, temperaturesArray, @@ -1404,22 +1273,6 @@ export class LLMChatPipeline { this.getTokenLogprob(sampledToken, top_logprobs!), ); } - - // if (numTokens > 0 && this.logitsOnCPUCopy) { - // console.log("logitsOnCPUCopy shape:", this.logitsOnCPUCopy.shape); - // const logitsOnCPUArray = this.logitsOnCPU.toArray(); - // const logitsOnCPUCopyArray = this.logitsOnCPUCopy.toArray(); - // let flag = true; - // for (let i = 0; i < logitsOnCPUArray.length; i++) { - // if (Math.abs(logitsOnCPUArray[i] - logitsOnCPUCopyArray[i]) > 1e-6) { - // console.error(`Mismatch at index ${i}: CPU=${logitsOnCPUArray[i]}, GPU=${logitsOnCPUCopyArray[i]}`); - // flag = false; - // } - // } - // if (!flag) { - // throw new Error("Logits on CPU and GPU do not match within tolerance"); - // } - // } } else { // temperature being 0 is allowed here, equivalent to argmax sampledToken = this.tvm.sampleTopPFromLogits( From dee9d19e438afe290ca6d0a3a81584e42cdc84a7 Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Fri, 6 Jun 2025 22:59:04 -0400 Subject: [PATCH 03/14] remove unnecessary loops --- src/llm_chat.ts | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) diff --git a/src/llm_chat.ts b/src/llm_chat.ts index 3b3be448..2c7d62a6 100644 --- a/src/llm_chat.ts +++ b/src/llm_chat.ts @@ -53,8 +53,6 @@ export class LLMChatPipeline { private fapplyPenalty: tvmjs.PackedFunc; private fapplyLogitBias: tvmjs.PackedFunc; private fsoftmaxWithTemperature: tvmjs.PackedFunc; - // private frenormalizeByTopP: tvmjs.PackedFunc; //BatchRenormalizeProbsByTopP - // private //BatchSampleTokensImpl, ChunkSampleTokensImpl // Functions related to PagedKVCache private fclearKVCaches: tvmjs.PackedFunc; @@ -68,7 +66,6 @@ export class LLMChatPipeline { private params: tvmjs.TVMObject; private kvCache: tvmjs.TVMObject; private logitsOnCPU?: tvmjs.NDArray = undefined; - private logitsOnCPUCopy?: tvmjs.NDArray = undefined; private 
filledKVCacheLength = 0; // meta data @@ -318,7 +315,6 @@ export class LLMChatPipeline { this.kvCache.dispose(); this.fclearKVCaches.dispose(); this.logitsOnCPU?.dispose(); - this.logitsOnCPUCopy?.dispose(); this.tvm.dispose(); this.tokenizer.dispose(); this.xgTokenizerInfo?.dispose(); @@ -974,20 +970,6 @@ export class LLMChatPipeline { return this.logitsOnCPU; } - private updateLogitsOnCPUCopy(logits: tvmjs.NDArray): tvmjs.NDArray { - if (this.logitsOnCPUCopy == undefined) { - this.logitsOnCPUCopy = this.tvm.detachFromCurrentScope( - this.tvm.empty(logits.shape, logits.dtype, this.tvm.cpu()), - ); - } else { - if (logits.shape[0] != this.logitsOnCPUCopy.shape[0]) { - throw Error("We expect the size of logits to remain unchanged"); - } - } - this.logitsOnCPUCopy.copyFrom(logits); - return this.logitsOnCPUCopy; - } - private async sampleTokenFromLogits( logitsOnGPU: tvmjs.NDArray, genConfig?: GenerationConfig, @@ -1126,14 +1108,15 @@ export class LLMChatPipeline { if (_hasValue(logit_bias)) { this.tvm.beginScope(); const numTokens = Object.keys(logit_bias ?? {}).length; - const pos2seq_id = new Int32Array(numTokens); + const pos2seq_id = new Int32Array(numTokens).fill(0); const tokenIds = new Int32Array(numTokens); const tokenLogitBias = new Float32Array(numTokens); + const logitBiasKeys = Object.keys(logit_bias ?? {}); for (let index = 0; index < numTokens; index++) { - pos2seq_id[index] = 0; - tokenIds[index] = parseInt(Object.keys(logit_bias ?? {})[index]); - tokenLogitBias[index] = logit_bias![tokenIds[index]]; + const tokenId = parseInt(logitBiasKeys[index]); + tokenIds[index] = tokenId; + tokenLogitBias[index] = logit_bias![tokenId]; } const pos2seqIdsArray = this.tvm @@ -1191,11 +1174,8 @@ export class LLMChatPipeline { const paddedPenalties = new Float32Array(3); paddedPenalties.set(penalties); - for (let index = 0; index < numTokens; index++) { - pos2seq_id[index] = 0; - tokenIds[index] = appearedTokens[index]; - tokenCnt[index] = appearedTokensFreqs[index]; - } + tokenIds.set(appearedTokens); + tokenCnt.set(appearedTokensFreqs); const pos2seqIdsArray = this.tvm .empty([numTokens], "int32", this.device) From 10ed01079bc6fd7c91d9b4d69ce7af46fc2065cf Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Thu, 28 Aug 2025 17:26:13 -0400 Subject: [PATCH 04/14] Remove unnecessary GPU-CPU copies and fix scope issues --- src/llm_chat.ts | 190 ++++++++++++++++++++++++------------------------ 1 file changed, 95 insertions(+), 95 deletions(-) diff --git a/src/llm_chat.ts b/src/llm_chat.ts index 2c7d62a6..4272cb71 100644 --- a/src/llm_chat.ts +++ b/src/llm_chat.ts @@ -1085,66 +1085,60 @@ export class LLMChatPipeline { this.tvm.endScope(); } - // 1. Move logits to CPU - this.tvm.beginScope(); - this.updateLogitsOnCPU(logitsOnGPU); - this.tvm.endScope(); - await this.device.sync(); - - if (this.logitsOnCPU == undefined) { - throw Error("logits should be assigned"); - } + // 1. Post process logits via logitProcessor and/or logit_bias + if (this.logitProcessor !== undefined) { + // Move logits to CPU + this.tvm.beginScope(); + this.updateLogitsOnCPU(logitsOnGPU); + this.tvm.endScope(); + await this.device.sync(); - // 2. 
Post process logits via logitProcessor and/or logit_bias - if (this.logitProcessor !== undefined || _hasValue(logit_bias)) { + if (this.logitsOnCPU == undefined) { + throw Error("logits should be assigned"); + } let logitsOnCPUArray: Float32Array = ( this.logitsOnCPU.toArray() ); - const vocab_size = logitsOnCPUArray.length; - if (this.logitProcessor !== undefined) { - logitsOnCPUArray = this.logitProcessor.processLogits(logitsOnCPUArray); + logitsOnCPUArray = this.logitProcessor.processLogits(logitsOnCPUArray); + logitsOnGPU.copyFrom(logitsOnCPUArray); + this.logitsOnCPU.copyFrom(logitsOnCPUArray); + } + + if (_hasValue(logit_bias)) { + const numTokens = Object.keys(logit_bias ?? {}).length; + const pos2seq_id = new Int32Array(numTokens).fill(0); + const tokenIds = new Int32Array(numTokens); + const tokenLogitBias = new Float32Array(numTokens); + + const logitBiasKeys = Object.keys(logit_bias ?? {}); + for (let index = 0; index < numTokens; index++) { + const tokenId = parseInt(logitBiasKeys[index]); + tokenIds[index] = tokenId; + tokenLogitBias[index] = logit_bias![tokenId]; } - if (_hasValue(logit_bias)) { - this.tvm.beginScope(); - const numTokens = Object.keys(logit_bias ?? {}).length; - const pos2seq_id = new Int32Array(numTokens).fill(0); - const tokenIds = new Int32Array(numTokens); - const tokenLogitBias = new Float32Array(numTokens); - - const logitBiasKeys = Object.keys(logit_bias ?? {}); - for (let index = 0; index < numTokens; index++) { - const tokenId = parseInt(logitBiasKeys[index]); - tokenIds[index] = tokenId; - tokenLogitBias[index] = logit_bias![tokenId]; - } + this.tvm.beginScope(); - const pos2seqIdsArray = this.tvm - .empty([numTokens], "int32", this.device) - .copyFrom(pos2seq_id); + const pos2seqIdsArray = this.tvm + .empty([numTokens], "int32", this.device) + .copyFrom(pos2seq_id); - const tokenIdsArray = this.tvm - .empty([numTokens], "int32", this.device) - .copyFrom(tokenIds); + const tokenIdsArray = this.tvm + .empty([numTokens], "int32", this.device) + .copyFrom(tokenIds); - const tokenLogitBiasArray = this.tvm - .empty([numTokens], "float32", this.device) - .copyFrom(tokenLogitBias); + const tokenLogitBiasArray = this.tvm + .empty([numTokens], "float32", this.device) + .copyFrom(tokenLogitBias); - const logitsOnGPU = this.tvm - .empty([1, this.fullVocabSize], "float32", this.device) - .copyFrom(logitsOnCPUArray); + this.fapplyLogitBias( + logitsOnGPU.view([1, this.fullVocabSize]), + pos2seqIdsArray, + tokenIdsArray, + tokenLogitBiasArray, + ); - this.fapplyLogitBias( - logitsOnGPU, - pos2seqIdsArray, - tokenIdsArray, - tokenLogitBiasArray, - ); - this.updateLogitsOnCPU(logitsOnGPU); - this.tvm.endScope(); - } - await this.device.sync(); + this.tvm.endScope(); } // 3. 
Apply penalties to logits @@ -1153,64 +1147,57 @@ export class LLMChatPipeline { presence_penalty != 0.0 || repetition_penalty != 1.0 ) { - this.tvm.beginScope(); const appearedTokens = [...this.appearedTokensFreq.keys()]; const appearedTokensFreqs = [...this.appearedTokensFreq.values()]; const numTokens = appearedTokens.length; - const seqIdsArray = this.tvm - .empty([1], "int32", this.device) - .copyFrom([0]); + if (numTokens > 0) { + const pos2seq_id = new Int32Array(numTokens).fill(0); + const tokenIds = new Int32Array(numTokens).fill(0); + const tokenCnt = new Int32Array(numTokens).fill(0); + const penalties = new Float32Array([ + presence_penalty, + frequency_penalty, + repetition_penalty, + ]); - const pos2seq_id = new Int32Array(numTokens).fill(0); - const tokenIds = new Int32Array(numTokens).fill(0); - const tokenCnt = new Int32Array(numTokens).fill(0); - const penalties = new Float32Array([ - presence_penalty, - frequency_penalty, - repetition_penalty, - ]); - const paddedPenalties = new Float32Array(3); - paddedPenalties.set(penalties); - - tokenIds.set(appearedTokens); - tokenCnt.set(appearedTokensFreqs); + tokenIds.set(appearedTokens); + tokenCnt.set(appearedTokensFreqs); - const pos2seqIdsArray = this.tvm - .empty([numTokens], "int32", this.device) - .copyFrom(pos2seq_id); + this.tvm.beginScope(); + const seqIdsArray = this.tvm + .empty([1], "int32", this.device) + .copyFrom([0]); - const tokenIdsArray = this.tvm - .empty([numTokens], "int32", this.device) - .copyFrom(tokenIds); + const pos2seqIdsArray = this.tvm + .empty([numTokens], "int32", this.device) + .copyFrom(pos2seq_id); - const tokenCntArray = this.tvm - .empty([numTokens], "int32", this.device) - .copyFrom(tokenCnt); + const tokenIdsArray = this.tvm + .empty([numTokens], "int32", this.device) + .copyFrom(tokenIds); - const penaltiesArray = this.tvm - .empty([1, 3], "float32", this.device) - .copyFrom(paddedPenalties); + const tokenCntArray = this.tvm + .empty([numTokens], "int32", this.device) + .copyFrom(tokenCnt); - const logitsOnGPU = this.tvm - .empty([1, this.fullVocabSize], "float32", this.device) - .copyFrom(this.logitsOnCPU.toArray()); + const penaltiesArray = this.tvm + .empty([1, 3], "float32", this.device) + .copyFrom(penalties); - if (numTokens > 0) { this.fapplyPenalty( - logitsOnGPU, + logitsOnGPU.view([1, this.fullVocabSize]), seqIdsArray, pos2seqIdsArray, tokenIdsArray, tokenCntArray, penaltiesArray, ); + + this.tvm.endScope(); } - this.updateLogitsOnCPU(logitsOnGPU); - this.tvm.endScope(); } - await this.device.sync(); // 4. 
Sample token from logits // If logprobs, need the actual distribution via softmax, otherwise directly sample from logits @@ -1230,33 +1217,46 @@ export class LLMChatPipeline { .empty([numSeqs], "float32", this.device) .copyFrom(temperatures); - const logitsOnGPU = this.tvm - .empty([numSeqs, 1, this.fullVocabSize], "float32", this.device) - .copyFrom(this.logitsOnCPU.toArray()); - const probs = this.fsoftmaxWithTemperature( - logitsOnGPU, + logitsOnGPU.view([numSeqs, 1, this.fullVocabSize]), temperaturesArray, ); this.updateLogitsOnCPU(probs); this.tvm.endScope(); await this.device.sync(); - sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU, top_p); + // sampledToken = this.fsampleWithTopP( + // probs.view([numSeqs, 1, this.fullVocabSize]), + // top_p, + // top_logprobs, + // this.fullVocabSize, + // this.appearedTokensFreq, + // this.tokenLogprobArray, + // ) + + sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU!, top_p); this.tokenLogprobArray.push( this.getTokenLogprob(sampledToken, top_logprobs!), ); } else { - this.tvm.applySoftmaxWithTemperature(this.logitsOnCPU, temperature); - sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU, top_p); + this.tvm.beginScope(); + this.updateLogitsOnCPU(logitsOnGPU); + this.tvm.endScope(); + await this.device.sync(); + this.tvm.applySoftmaxWithTemperature(this.logitsOnCPU!, temperature); + sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU!, top_p); this.tokenLogprobArray.push( this.getTokenLogprob(sampledToken, top_logprobs!), ); } } else { // temperature being 0 is allowed here, equivalent to argmax + this.tvm.beginScope(); + this.updateLogitsOnCPU(logitsOnGPU); + this.tvm.endScope(); + await this.device.sync(); sampledToken = this.tvm.sampleTopPFromLogits( - this.logitsOnCPU, + this.logitsOnCPU!, temperature, top_p, ); From 2f21df77e936ecc598b0aca917ec0e9f04924186 Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Fri, 5 Sep 2025 19:08:19 -0400 Subject: [PATCH 05/14] Update comments and remove unnecessary control logic for token sampling --- src/llm_chat.ts | 67 +++++++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 39 deletions(-) diff --git a/src/llm_chat.ts b/src/llm_chat.ts index 4272cb71..66c866f6 100644 --- a/src/llm_chat.ts +++ b/src/llm_chat.ts @@ -1085,7 +1085,7 @@ export class LLMChatPipeline { this.tvm.endScope(); } - // 1. Post process logits via logitProcessor and/or logit_bias + // 1. Apply logitProcessor on CPU if (this.logitProcessor !== undefined) { // Move logits to CPU this.tvm.beginScope(); @@ -1104,6 +1104,7 @@ export class LLMChatPipeline { this.logitsOnCPU.copyFrom(logitsOnCPUArray); } + // 2. Apply logit_bias on GPU if (_hasValue(logit_bias)) { const numTokens = Object.keys(logit_bias ?? 
{}).length; const pos2seq_id = new Int32Array(numTokens).fill(0); @@ -1209,46 +1210,34 @@ export class LLMChatPipeline { const numSeqs = 1; const numTokens = this.appearedTokensFreq.size; - if (numTokens > 0) { - const temperatures = new Float32Array([temperature]); + const temperatures = new Float32Array([temperature]); - this.tvm.beginScope(); - const temperaturesArray = this.tvm - .empty([numSeqs], "float32", this.device) - .copyFrom(temperatures); + this.tvm.beginScope(); + const temperaturesArray = this.tvm + .empty([numSeqs], "float32", this.device) + .copyFrom(temperatures); - const probs = this.fsoftmaxWithTemperature( - logitsOnGPU.view([numSeqs, 1, this.fullVocabSize]), - temperaturesArray, - ); - this.updateLogitsOnCPU(probs); - this.tvm.endScope(); - await this.device.sync(); - - // sampledToken = this.fsampleWithTopP( - // probs.view([numSeqs, 1, this.fullVocabSize]), - // top_p, - // top_logprobs, - // this.fullVocabSize, - // this.appearedTokensFreq, - // this.tokenLogprobArray, - // ) - - sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU!, top_p); - this.tokenLogprobArray.push( - this.getTokenLogprob(sampledToken, top_logprobs!), - ); - } else { - this.tvm.beginScope(); - this.updateLogitsOnCPU(logitsOnGPU); - this.tvm.endScope(); - await this.device.sync(); - this.tvm.applySoftmaxWithTemperature(this.logitsOnCPU!, temperature); - sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU!, top_p); - this.tokenLogprobArray.push( - this.getTokenLogprob(sampledToken, top_logprobs!), - ); - } + const probs = this.fsoftmaxWithTemperature( + logitsOnGPU.view([numSeqs, 1, this.fullVocabSize]), + temperaturesArray, + ); + this.updateLogitsOnCPU(probs); + this.tvm.endScope(); + await this.device.sync(); + + // sampledToken = this.fsampleWithTopP( + // probs.view([numSeqs, 1, this.fullVocabSize]), + // top_p, + // top_logprobs, + // this.fullVocabSize, + // this.appearedTokensFreq, + // this.tokenLogprobArray, + // ) + + sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU!, top_p); + this.tokenLogprobArray.push( + this.getTokenLogprob(sampledToken, top_logprobs!), + ); } else { // temperature being 0 is allowed here, equivalent to argmax this.tvm.beginScope(); From 7975304d9ce0d3cd1bbc66f0b54d20c7cbe1cc99 Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Tue, 9 Sep 2025 17:19:38 -0400 Subject: [PATCH 06/14] Fix bug with using provided repetition penalty value and add support for including sampling latency breakdown in response --- src/config.ts | 3 ++- src/engine.ts | 23 +++++++++++++++++++++ src/openai_api_protocols/chat_completion.ts | 21 ++++++++++++++++++- src/openai_api_protocols/completion.ts | 18 ++++++++++++++++ src/types.ts | 9 ++++++++ 5 files changed, 72 insertions(+), 2 deletions(-) diff --git a/src/config.ts b/src/config.ts index dfeb1913..0ca51eba 100644 --- a/src/config.ts +++ b/src/config.ts @@ -126,7 +126,7 @@ export interface MLCEngineConfig { */ export interface GenerationConfig { // Only used in MLC - repetition_penalty?: number; + repetition_penalty?: number | null; ignore_eos?: boolean; // Shared by MLC and OpenAI APIs top_p?: number | null; @@ -143,6 +143,7 @@ export interface GenerationConfig { response_format?: ResponseFormat | null; // extra_body in ChatCompletionsRequest enable_thinking?: boolean | null; + enable_latency_breakdown?: boolean | null; } export function postInitAndCheckGenerationConfigValues( diff --git a/src/engine.ts b/src/engine.ts index fe4c427e..6609f3e5 100644 --- 
a/src/engine.ts +++ b/src/engine.ts @@ -41,6 +41,7 @@ import { MLCEngineInterface, LogitProcessor, LogLevel, + LatencyBreakdown, } from "./types"; import { compareConversationObject, @@ -694,12 +695,18 @@ export class MLCEngine implements MLCEngineInterface { const decode_time = pipeline.getCurRoundDecodingTotalTime(); const grammar_per_token_s = pipeline.getCurRoundGrammarPerTokenTotalTime(); + const latencyBreakdown: LatencyBreakdown = + pipeline.getCurRoundLatencyBreakdown(); + const defaultExtra = { e2e_latency_s: (Date.now() - timeReceived) / 1000, prefill_tokens_per_s: prefill_tokens_per_s, decode_tokens_per_s: decode_tokens_per_s, time_to_first_token_s: prefill_time, time_per_output_token_s: decode_time / completion_tokens, + latencyBreakdown: request.extra_body?.enable_latency_breakdown + ? latencyBreakdown + : undefined, }; const usage: CompletionUsage = { completion_tokens: completion_tokens, @@ -783,6 +790,7 @@ export class MLCEngine implements MLCEngineInterface { const genConfig: GenerationConfig = { frequency_penalty: request.frequency_penalty, presence_penalty: request.presence_penalty, + repetition_penalty: request.repetition_penalty, max_tokens: request.max_tokens, stop: request.stop, top_p: request.top_p, @@ -793,6 +801,7 @@ export class MLCEngine implements MLCEngineInterface { response_format: request.response_format, ignore_eos: request.ignore_eos, enable_thinking: request.extra_body?.enable_thinking, + enable_latency_breakdown: request.extra_body?.enable_latency_breakdown, }; // 0.5 Block wait until this pipeline finishes all previous requests @@ -890,12 +899,19 @@ export class MLCEngine implements MLCEngineInterface { "response_format" in request && (request.response_format?.type === "grammar" || request.response_format?.type === "json_object"); + + const latencyBreakdown: LatencyBreakdown = + selectedPipeline.getCurRoundLatencyBreakdown(); + const defaultExtra = { e2e_latency_s: (Date.now() - timeReceived) / 1000, prefill_tokens_per_s: prompt_tokens / prefill_time, decode_tokens_per_s: completion_tokens / decode_time, time_to_first_token_s: prefill_time, time_per_output_token_s: decode_time / completion_tokens, + latencyBreakdown: request.extra_body?.enable_latency_breakdown + ? latencyBreakdown + : undefined, }; const response: ChatCompletion = { id: crypto.randomUUID(), @@ -958,6 +974,7 @@ export class MLCEngine implements MLCEngineInterface { const genConfig: GenerationConfig = { frequency_penalty: request.frequency_penalty, presence_penalty: request.presence_penalty, + repetition_penalty: request.repetition_penalty, max_tokens: request.max_tokens, stop: request.stop, top_p: request.top_p, @@ -1030,6 +1047,9 @@ export class MLCEngine implements MLCEngineInterface { decode_time += selectedPipeline.getCurRoundDecodingTotalTime(); } + const latencyBreakdown: LatencyBreakdown = + selectedPipeline.getCurRoundLatencyBreakdown(); + const response: Completion = { id: crypto.randomUUID(), choices: choices, @@ -1046,6 +1066,9 @@ export class MLCEngine implements MLCEngineInterface { decode_tokens_per_s: completion_tokens / decode_time, time_to_first_token_s: prefill_time, time_per_output_token_s: decode_time / completion_tokens, + latencyBreakdown: request.extra_body?.enable_latency_breakdown + ? 
latencyBreakdown + : undefined, }, } as CompletionUsage, }; diff --git a/src/openai_api_protocols/chat_completion.ts b/src/openai_api_protocols/chat_completion.ts index 5a5229ed..e1ee031a 100644 --- a/src/openai_api_protocols/chat_completion.ts +++ b/src/openai_api_protocols/chat_completion.ts @@ -15,7 +15,7 @@ * limitations under the License. */ -import { MLCEngineInterface } from "../types"; +import { MLCEngineInterface, LatencyBreakdown } from "../types"; import { functionCallingModelIds, MessagePlaceholders, @@ -125,6 +125,13 @@ export interface ChatCompletionRequestBase { */ presence_penalty?: number | null; + /** + * Number greater than or equal to 1.0. Values greater than 1.0 discourage + * the model from repeating tokens that have already been generated. Repetition + * penalty is like presence penalty but is multiplicative. + */ + repetition_penalty?: number | null; + /** * The maximum number of [tokens](/tokenizer) that can be generated in the chat * completion. @@ -268,6 +275,12 @@ export interface ChatCompletionRequestBase { * @note Currently only allowed to be used for Qwen3 models, though not explicitly checked. */ enable_thinking?: boolean | null; + + /** + * If set to true, the response will include a breakdown of the time spent in various + * stages of token sampling. + */ + enable_latency_breakdown?: boolean | null; }; } @@ -980,6 +993,12 @@ export interface CompletionUsage { * structured output. If n > 1, it is the average over all choices. */ grammar_per_token_s?: number; + + /** + * If `enable_latency_breakdown` is set to true in the request, this field will be + * present and contain a breakdown of the time spent in various stages of token sampling. + */ + latencyBreakdown?: LatencyBreakdown; }; } diff --git a/src/openai_api_protocols/completion.ts b/src/openai_api_protocols/completion.ts index fb6aa458..54fcc34f 100644 --- a/src/openai_api_protocols/completion.ts +++ b/src/openai_api_protocols/completion.ts @@ -137,6 +137,13 @@ export interface CompletionCreateParamsBase { */ presence_penalty?: number | null; + /** + * Number greater than or equal to 1.0. Values greater than 1.0 discourage + * the model from repeating tokens that have already been generated. Repetition + * penalty is like presence penalty but is multiplicative. + */ + repetition_penalty?: number | null; + /** * If specified, our system will make a best effort to sample deterministically, * such that repeated requests with the same `seed` and parameters should return @@ -225,6 +232,17 @@ export interface CompletionCreateParamsBase { * @note This field is not supported. */ best_of?: number | null; + + /** + * Fields specific to WebLLM, not present in OpenAI. + */ + extra_body?: { + /** + * If set to true, the response will include a breakdown of the time spent in various + * stages of token sampling. 
+ */ + enable_latency_breakdown?: boolean | null; + }; } export type CompletionCreateParams = diff --git a/src/types.ts b/src/types.ts index 4d4522c0..ed79af57 100644 --- a/src/types.ts +++ b/src/types.ts @@ -251,3 +251,12 @@ export const LOG_LEVELS = { SILENT: 5, }; export type LogLevel = keyof typeof LOG_LEVELS; + +export type LatencyBreakdown = { + logitProcessorTime: number[]; + logitBiasTime: number[]; + penaltyTime: number[]; + sampleTime: number[]; + totalTime: number[]; + grammarBitmaskTime: number[]; +}; From b78e4080dc6410c57ad278813324cf7b181f3c02 Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Tue, 9 Sep 2025 17:21:57 -0400 Subject: [PATCH 07/14] Update comments and add timing for sampling steps --- src/llm_chat.ts | 93 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 82 insertions(+), 11 deletions(-) diff --git a/src/llm_chat.ts b/src/llm_chat.ts index 66c866f6..7266b5e4 100644 --- a/src/llm_chat.ts +++ b/src/llm_chat.ts @@ -6,7 +6,7 @@ import log from "loglevel"; import { Tokenizer } from "@mlc-ai/web-tokenizers"; import { ChatConfig, GenerationConfig, Role } from "./config"; import { getConversation, Conversation } from "./conversation"; -import { LogitProcessor } from "./types"; +import { LogitProcessor, LatencyBreakdown } from "./types"; import { getChunkedPrefillInputData, getImageDataFromURL, @@ -102,6 +102,16 @@ export class LLMChatPipeline { private curRoundDecodingTotalTime = 0; private curRoundPrefillTotalTime = 0; + // additional stats, reset at every prefillStep() + public curRoundLatencyBreakdown: LatencyBreakdown = { + logitProcessorTime: [], + logitBiasTime: [], + penaltyTime: [], + sampleTime: [], + totalTime: [], + grammarBitmaskTime: [], + }; + // LogitProcessor private logitProcessor?: LogitProcessor = undefined; @@ -434,6 +444,13 @@ export class LLMChatPipeline { return this.curRoundGrammarPerTokenTotalTime; } + /** + * @returns the breakdown of latencies for sampling each token for a single request. + */ + getCurRoundLatencyBreakdown(): LatencyBreakdown { + return this.curRoundLatencyBreakdown; + } + /** * @returns Runtime stats information. */ @@ -527,6 +544,16 @@ export class LLMChatPipeline { this.curRoundDecodingTotalTime = 0; this.curRoundGrammarInitTotalTime = 0; this.curRoundGrammarPerTokenTotalTime = 0; + + this.curRoundLatencyBreakdown = { + logitProcessorTime: [], + logitBiasTime: [], + penaltyTime: [], + sampleTime: [], + totalTime: [], + grammarBitmaskTime: [], + }; + this.stopTriggered = false; const conversation = this.conversation; @@ -1049,11 +1076,15 @@ export class LLMChatPipeline { throw new RangeError("presence_penalty", -2.0, 2.0); } + const outputTokenBegin = performance.now(); + // 0. Update logitsOnGPU with on-GPU grammar bitmasking if ( response_format?.type === "json_object" || response_format?.type === "grammar" ) { + const grammarBitmaskBegin = performance.now(); + this.tvm.beginScope(); if (this.grammarMatcher === undefined) { throw Error("Expect grammar matcher to be initialized."); @@ -1083,6 +1114,15 @@ export class LLMChatPipeline { bitMaskOnGPU, ); this.tvm.endScope(); + + if (genConfig?.enable_latency_breakdown) { + const grammarBitmaskEnd = performance.now(); + const grammarBitmaskTimeSpent = + (grammarBitmaskEnd - grammarBitmaskBegin) / 1e3; + this.curRoundLatencyBreakdown.grammarBitmaskTime.push( + grammarBitmaskTimeSpent, + ); + } } // 1. 
Apply logitProcessor on CPU @@ -1093,6 +1133,8 @@ export class LLMChatPipeline { this.tvm.endScope(); await this.device.sync(); + const logitProcessorBegin = performance.now(); + if (this.logitsOnCPU == undefined) { throw Error("logits should be assigned"); } @@ -1102,10 +1144,21 @@ export class LLMChatPipeline { logitsOnCPUArray = this.logitProcessor.processLogits(logitsOnCPUArray); logitsOnGPU.copyFrom(logitsOnCPUArray); this.logitsOnCPU.copyFrom(logitsOnCPUArray); + + if (genConfig?.enable_latency_breakdown) { + const logitProcessorEnd = performance.now(); + const logitProcessorTimeSpent = + (logitProcessorEnd - logitProcessorBegin) / 1e3; + this.curRoundLatencyBreakdown.logitProcessorTime.push( + logitProcessorTimeSpent, + ); + } } // 2. Apply logit_bias on GPU if (_hasValue(logit_bias)) { + const logitBiasBegin = performance.now(); + const numTokens = Object.keys(logit_bias ?? {}).length; const pos2seq_id = new Int32Array(numTokens).fill(0); const tokenIds = new Int32Array(numTokens); @@ -1140,9 +1193,15 @@ export class LLMChatPipeline { ); this.tvm.endScope(); + + if (genConfig?.enable_latency_breakdown) { + const logitBiasEnd = performance.now(); + const logitBiasTimeSpent = (logitBiasEnd - logitBiasBegin) / 1e3; + this.curRoundLatencyBreakdown.logitBiasTime.push(logitBiasTimeSpent); + } } - // 3. Apply penalties to logits + // 3. Apply penalties to logits on GPU if ( frequency_penalty != 0.0 || presence_penalty != 0.0 || @@ -1154,6 +1213,8 @@ export class LLMChatPipeline { const numTokens = appearedTokens.length; if (numTokens > 0) { + const penaltyBegin = performance.now(); + const pos2seq_id = new Int32Array(numTokens).fill(0); const tokenIds = new Int32Array(numTokens).fill(0); const tokenCnt = new Int32Array(numTokens).fill(0); @@ -1197,11 +1258,18 @@ export class LLMChatPipeline { ); this.tvm.endScope(); + + if (genConfig?.enable_latency_breakdown) { + const penaltyEnd = performance.now(); + const penaltyTimeSpent = (penaltyEnd - penaltyBegin) / 1e3; + this.curRoundLatencyBreakdown.penaltyTime.push(penaltyTimeSpent); + } } } // 4. Sample token from logits // If logprobs, need the actual distribution via softmax, otherwise directly sample from logits + const sampleBegin = performance.now(); let sampledToken: number; if (logprobs) { // Inplace transform logitsOnCPU to a distribution @@ -1225,15 +1293,6 @@ export class LLMChatPipeline { this.tvm.endScope(); await this.device.sync(); - // sampledToken = this.fsampleWithTopP( - // probs.view([numSeqs, 1, this.fullVocabSize]), - // top_p, - // top_logprobs, - // this.fullVocabSize, - // this.appearedTokensFreq, - // this.tokenLogprobArray, - // ) - sampledToken = this.tvm.sampleTopPFromProb(this.logitsOnCPU!, top_p); this.tokenLogprobArray.push( this.getTokenLogprob(sampledToken, top_logprobs!), @@ -1251,6 +1310,12 @@ export class LLMChatPipeline { ); } + if (genConfig?.enable_latency_breakdown) { + const sampleEnd = performance.now(); + const sampleTimeSpent = (sampleEnd - sampleBegin) / 1e3; + this.curRoundLatencyBreakdown.sampleTime.push(sampleTimeSpent); + } + // 5. 
Update logit processor this.logitProcessor?.processSampledToken(sampledToken); @@ -1271,6 +1336,12 @@ export class LLMChatPipeline { } } + if (genConfig?.enable_latency_breakdown) { + const outputTokenEnd = performance.now(); + const outputTokenTimeSpent = (outputTokenEnd - outputTokenBegin) / 1e3; + this.curRoundLatencyBreakdown.totalTime.push(outputTokenTimeSpent); + } + return sampledToken; } From 8cc787e13826cea7aae0a1afb279a0222abe371a Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Tue, 9 Sep 2025 18:04:41 -0400 Subject: [PATCH 08/14] Add sanity checks and latency breakdown examples --- .../get-started-latency-breakdown/README.md | 15 ++ .../package.json | 20 ++ .../src/get_started_latency_breakdown.html | 23 +++ .../src/get_started_latency_breakdown.ts | 164 +++++++++++++++ examples/sanity-checks/README.md | 14 ++ examples/sanity-checks/package.json | 20 ++ examples/sanity-checks/src/sanity_checks.html | 35 ++++ examples/sanity-checks/src/sanity_checks.ts | 187 ++++++++++++++++++ 8 files changed, 478 insertions(+) create mode 100644 examples/get-started-latency-breakdown/README.md create mode 100644 examples/get-started-latency-breakdown/package.json create mode 100644 examples/get-started-latency-breakdown/src/get_started_latency_breakdown.html create mode 100644 examples/get-started-latency-breakdown/src/get_started_latency_breakdown.ts create mode 100644 examples/sanity-checks/README.md create mode 100644 examples/sanity-checks/package.json create mode 100644 examples/sanity-checks/src/sanity_checks.html create mode 100644 examples/sanity-checks/src/sanity_checks.ts diff --git a/examples/get-started-latency-breakdown/README.md b/examples/get-started-latency-breakdown/README.md new file mode 100644 index 00000000..0a74ea26 --- /dev/null +++ b/examples/get-started-latency-breakdown/README.md @@ -0,0 +1,15 @@ +# WebLLM Get Started App + +This folder provides a minimum demo to show WebLLM API in a webapp setting with +collection of latency statistics for individual token sampling steps. +To try it out, you can do the following steps under this folder + +```bash +npm install +npm start +``` + +Note if you would like to hack WebLLM core package. +You can change web-llm dependencies as `"file:../.."`, and follow the build from source +instruction in the project to build webllm locally. This option is only recommended +if you would like to hack WebLLM core package. diff --git a/examples/get-started-latency-breakdown/package.json b/examples/get-started-latency-breakdown/package.json new file mode 100644 index 00000000..0b321e9d --- /dev/null +++ b/examples/get-started-latency-breakdown/package.json @@ -0,0 +1,20 @@ +{ + "name": "get-started-latency-breakdown", + "version": "0.1.0", + "private": true, + "scripts": { + "start": "parcel src/get_started_latency_breakdown.html --port 8888", + "build": "parcel build src/get_started_latency_breakdown.html --dist-dir lib" + }, + "devDependencies": { + "buffer": "^5.7.1", + "parcel": "^2.8.3", + "process": "^0.11.10", + "tslib": "^2.3.1", + "typescript": "^4.9.5", + "url": "^0.11.3" + }, + "dependencies": { + "@mlc-ai/web-llm": "^0.2.79" + } +} diff --git a/examples/get-started-latency-breakdown/src/get_started_latency_breakdown.html b/examples/get-started-latency-breakdown/src/get_started_latency_breakdown.html new file mode 100644 index 00000000..18298616 --- /dev/null +++ b/examples/get-started-latency-breakdown/src/get_started_latency_breakdown.html @@ -0,0 +1,23 @@ + + + + +

+    <h2>WebLLM Test Page</h2>
+    Open console to see output
+    <br />
+    <br />
+    <label id="init-label"> </label>
+
+    <h3>Prompt</h3>
+    <label id="prompt-label"> </label>
+
+    <h3>Response</h3>
+    <label id="generate-label"> </label>
+    <br />
+ + + + + diff --git a/examples/get-started-latency-breakdown/src/get_started_latency_breakdown.ts b/examples/get-started-latency-breakdown/src/get_started_latency_breakdown.ts new file mode 100644 index 00000000..26d9565e --- /dev/null +++ b/examples/get-started-latency-breakdown/src/get_started_latency_breakdown.ts @@ -0,0 +1,164 @@ +import * as webllm from "@mlc-ai/web-llm"; + +function setLabel(id: string, text: string) { + const label = document.getElementById(id); + if (label == null) { + throw Error("Cannot find label " + id); + } + label.innerText = text; +} + +type LatencyBreakdown = { + logitProcessorTime: number[]; + logitBiasTime: number[]; + penaltyTime: number[]; + sampleTime: number[]; + totalTime: number[]; + grammarBitmaskTime: number[]; +}; +function computeStats( + latency_breakdown: LatencyBreakdown, +): Record { + function _computeStats(arr: number[]) { + if (!arr.length) return undefined; + const sorted = [...arr].sort((a, b) => a - b); + const sum = arr.reduce((a, b) => a + b, 0); + const avg = sum / arr.length; + const min = sorted[0]; + const max = sorted[sorted.length - 1]; + const p99 = sorted[Math.floor(0.99 * (sorted.length - 1))]; + return { avg, min, max, p99 }; + } + + const latencyStats: Record = {}; + for (const key of Object.keys(latency_breakdown)) { + const arr = (latency_breakdown as any)[key]; + if (Array.isArray(arr) && arr.length > 0) { + latencyStats[key] = _computeStats(arr); + } + } + return latencyStats; +} + +async function main() { + const initProgressCallback = (report: webllm.InitProgressReport) => { + setLabel("init-label", report.text); + }; + // Option 1: If we do not specify appConfig, we use `prebuiltAppConfig` defined in `config.ts` + // const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC"; + const selectedModel = "Qwen3-0.6B-q0f32-MLC"; + const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine( + selectedModel, + { + initProgressCallback: initProgressCallback, + logLevel: "INFO", // specify the log level + }, + // customize kv cache, use either context_window_size or sliding_window_size (with attention sink) + { + context_window_size: 2048, + // sliding_window_size: 1024, + // attention_sink_size: 4, + }, + ); + + // Option 2: Specify your own model other than the prebuilt ones + // const appConfig: webllm.AppConfig = { + // model_list: [ + // { + // model: "https://huggingface.co/mlc-ai/Llama-3.1-8B-Instruct-q4f32_1-MLC", + // model_id: "Llama-3.1-8B-Instruct-q4f32_1-MLC", + // model_lib: + // webllm.modelLibURLPrefix + + // webllm.modelVersion + + // "/Llama-3_1-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm", + // overrides: { + // context_window_size: 2048, + // }, + // }, + // ], + // }; + // const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine( + // selectedModel, + // { appConfig: appConfig, initProgressCallback: initProgressCallback }, + // ); + + // Option 3: Instantiate MLCEngine() and call reload() separately + // const engine: webllm.MLCEngineInterface = new webllm.MLCEngine({ + // appConfig: appConfig, // if do not specify, we use webllm.prebuiltAppConfig + // initProgressCallback: initProgressCallback, + // }); + // await engine.reload(selectedModel); + + const latencyBreakdown: LatencyBreakdown = { + logitProcessorTime: [], + logitBiasTime: [], + penaltyTime: [], + sampleTime: [], + totalTime: [], + grammarBitmaskTime: [], + }; + // want decode_tokens_per_s, e2e_latency_s, time_per_output_token_s, completion_tokens + const decodeTokensPerS: number[] = []; + const completionTokens: 
number[] = []; + const e2eLatencyS: number[] = []; + const timePerOutputTokenS: number[] = []; + + const numTrials = 20; + for (let i = 0; i < numTrials; i++) { + console.log(`Trial ${i + 1} / ${numTrials}`); + const reply0 = await engine.chat.completions.create({ + messages: [{ role: "user", content: "List twenty US states." }], + // below configurations are all optional + n: 1, + temperature: 0, + max_tokens: 2048, + // 46510 and 7188 are "California", and 8421 and 51325 are "Texas" in Llama-3.1-8B-Instruct + // So we would have a higher chance of seeing the latter two, but never the first in the answer + // logit_bias: { + // "46510": -100, + // "7188": -100, + // "8421": 5, + // "41325": 5, + // }, + top_p: 0.8, + logprobs: true, + top_logprobs: 2, + frequency_penalty: 1.2, + presence_penalty: 1.0, + repetition_penalty: 1.1, + }); + + const logitProcessorTime = + reply0.usage?.extra.latencyBreakdown?.logitProcessorTime; + const logitBiasTime = reply0.usage?.extra.latencyBreakdown?.logitBiasTime; + const penaltyTime = reply0.usage?.extra.latencyBreakdown?.penaltyTime; + const sampleTime = reply0.usage?.extra.latencyBreakdown?.sampleTime; + const totalTime = reply0.usage?.extra.latencyBreakdown?.totalTime; + const grammarBitmaskTime = + reply0.usage?.extra.latencyBreakdown?.grammarBitmaskTime; + + latencyBreakdown.logitProcessorTime.push(...(logitProcessorTime || [])); + latencyBreakdown.logitBiasTime.push(...(logitBiasTime || [])); + latencyBreakdown.penaltyTime.push(...(penaltyTime || [])); + latencyBreakdown.sampleTime.push(...(sampleTime || [])); + latencyBreakdown.totalTime.push(...(totalTime || [])); + latencyBreakdown.grammarBitmaskTime.push(...(grammarBitmaskTime || [])); + + decodeTokensPerS.push(reply0.usage?.extra.decode_tokens_per_s || 0); + e2eLatencyS.push(reply0.usage?.extra.e2e_latency_s || 0); + timePerOutputTokenS.push(reply0.usage?.extra.time_per_output_token_s || 0); + completionTokens.push(reply0.usage?.completion_tokens || 0); + } + + const latencyStats: { [key: string]: number } = + computeStats(latencyBreakdown); + console.log("Latency stats: ", latencyStats); + console.log("Decode tokens per second: ", decodeTokensPerS); + console.log("Completion tokens: ", completionTokens); + console.log("E2E latency (s): ", e2eLatencyS); + console.log("Time per output token (s): ", timePerOutputTokenS); + + // To change model, either create a new engine via `CreateMLCEngine()`, or call `engine.reload(modelId)` +} + +main(); diff --git a/examples/sanity-checks/README.md b/examples/sanity-checks/README.md new file mode 100644 index 00000000..8b65a916 --- /dev/null +++ b/examples/sanity-checks/README.md @@ -0,0 +1,14 @@ +# Sanity Checks for Generated Output + +This folder provides simple sanity checks on the output generated +using WebLLM. To try it out, you can do the following steps under this folder + +```bash +npm install +npm start +``` + +Note if you would like to hack WebLLM core package. +You can change web-llm dependencies as `"file:../.."`, and follow the build from source +instruction in the project to build webllm locally. This option is only recommended +if you would like to hack WebLLM core package. 
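For context on what these checks assert, here is a minimal, hypothetical sketch (not part of this patch; `logProbOfToken` is an illustrative name, not a WebLLM API) of the softmax/logprob reasoning the tests below rely on: when one token's logit is pushed far above the rest, e.g. by a logit processor or a large `logit_bias`, softmax assigns that token probability ≈ 1, so its reported top logprob should be ≈ 0.

```ts
// Illustrative only: numerically stable log-softmax for a single token id.
function logProbOfToken(logits: number[], tokenId: number): number {
  const maxLogit = Math.max(...logits);
  const logSumExp =
    maxLogit + Math.log(logits.reduce((s, l) => s + Math.exp(l - maxLogit), 0));
  return logits[tokenId] - logSumExp;
}

// A four-token toy vocabulary where token 0 has been biased by +100:
console.log(logProbOfToken([100, 0, -1, 2], 0)); // ≈ 0, i.e. probability ≈ 1
```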
diff --git a/examples/sanity-checks/package.json b/examples/sanity-checks/package.json new file mode 100644 index 00000000..b40e6464 --- /dev/null +++ b/examples/sanity-checks/package.json @@ -0,0 +1,20 @@ +{ + "name": "sanity-checks", + "version": "0.1.0", + "private": true, + "scripts": { + "start": "parcel src/sanity_checks.html --port 8889", + "build": "parcel build src/sanity_checks.html --dist-dir lib" + }, + "devDependencies": { + "buffer": "^5.7.1", + "parcel": "^2.8.3", + "process": "^0.11.10", + "tslib": "^2.3.1", + "typescript": "^4.9.5", + "url": "^0.11.3" + }, + "dependencies": { + "@mlc-ai/web-llm": "^0.2.79" + } +} diff --git a/examples/sanity-checks/src/sanity_checks.html b/examples/sanity-checks/src/sanity_checks.html new file mode 100644 index 00000000..bfbfc869 --- /dev/null +++ b/examples/sanity-checks/src/sanity_checks.html @@ -0,0 +1,35 @@ + + + + + + GPU sampleTokenFromLogits Tests + + + +

+      <h2>GPU sampleTokenFromLogits Tests</h2>
+
+      <p>Overall: <span id="gpu-test-label">Not started.</span></p>
+
+      <p>Logit Processor: <span id="logit-processor-label"></span></p>
+
+      <p>Logit Bias: <span id="logit-bias-label"></span></p>
+
+      <p>Penalties: <span id="penalty-label"></span></p>
+
+      <p>Logprobs: <span id="logprobs-label"></span></p>
+ + + diff --git a/examples/sanity-checks/src/sanity_checks.ts b/examples/sanity-checks/src/sanity_checks.ts new file mode 100644 index 00000000..b0f51d1b --- /dev/null +++ b/examples/sanity-checks/src/sanity_checks.ts @@ -0,0 +1,187 @@ +import * as webllm from "@mlc-ai/web-llm"; + +function setLabel(id: string, text: string) { + const label = document.getElementById(id); + if (label == null) return; + label.innerText = text; +} + +async function createEngine( + modelId: string, + appConfig: webllm.AppConfig, + logitProcessorRegistry?: Map, +) { + return await webllm.CreateMLCEngine(modelId, { + appConfig, + logLevel: "ERROR", + logitProcessorRegistry, + }); +} + +async function deleteModel(modelId: string, appConfig: webllm.AppConfig) { + await webllm.deleteModelAllInfoInCache(modelId, appConfig); +} + +async function testLogitProcessor( + modelId: string, + appConfig: webllm.AppConfig, +) { + // Set up a logit processor that sets logits[0] = 100.0, rest -100.0 + const logitProcessor = { + processLogits: (logits: Float32Array) => { + logits.fill(-100.0); + logits[0] = 100.0; + return logits; + }, + processSampledToken: () => {}, + resetState: () => {}, + }; + const logitProcessorRegistry: Map = new Map(); + logitProcessorRegistry.set(modelId, logitProcessor); + const engine: webllm.MLCEngineInterface = await createEngine( + modelId, + appConfig, + logitProcessorRegistry, + ); + + const prompt = "Test logit processor."; + const reply: webllm.ChatCompletion = await engine.chat.completions.create({ + messages: [{ role: "user", content: prompt }], + temperature: 1.0, + max_tokens: 20, + logprobs: true, + top_logprobs: 1, + }); + const logprobs = reply.choices[0]?.logprobs; + const logprobsAllZero = !!( + logprobs && + Array.isArray(logprobs.content) && + logprobs.content.every( + (lp: webllm.ChatCompletionTokenLogprob) => + lp.top_logprobs[0].logprob === 0, + ) + ); + + console.log(`[LogitProcessor] Logprobs all zero: ${logprobsAllZero}`); + setLabel("logit-processor-label", `Logprobs all zero: ${logprobsAllZero}`); + await deleteModel(modelId, appConfig); + return logprobsAllZero; +} + +async function testLogitBias(modelId: string, appConfig: webllm.AppConfig) { + // Set logit_bias to strongly favor token 0 + const prompt = "Test logit bias."; + // const t0 = performance.now(); + const engine: webllm.MLCEngineInterface = await createEngine( + modelId, + appConfig, + ); + const reply = await engine.chat.completions.create({ + messages: [{ role: "user", content: prompt }], + temperature: 1.0, + max_tokens: 20, + logprobs: true, + top_logprobs: 1, + logit_bias: { "0": 100.0 }, + }); + const logprobs = reply.choices[0]?.logprobs; + const logprobsAllZero = !!( + logprobs && + Array.isArray(logprobs.content) && + logprobs.content.every( + (lp: webllm.ChatCompletionTokenLogprob) => + lp.top_logprobs[0].logprob === 0, + ) + ); + + console.log(`[LogitBias] Logprobs all zero: ${logprobsAllZero}`); + setLabel("logit-bias-label", `Logprobs all zero: ${logprobsAllZero}`); + await deleteModel(modelId, appConfig); + return logprobsAllZero; +} + +async function testPenalties(modelId: string, appConfig: webllm.AppConfig) { + const prompt = "Test presence and frequency penalties."; + const engine: webllm.MLCEngineInterface = await createEngine( + modelId, + appConfig, + ); + const reply = await engine.chat.completions.create({ + messages: [{ role: "user", content: prompt }], + temperature: 1.0, + max_tokens: 256, + presence_penalty: 2.0, + frequency_penalty: 2.0, + logit_bias: { "0": 100.0 }, + logprobs: 
true, + }); + const logprobs = reply.choices[0]?.logprobs; + const logprobsNotAllZero = !logprobs?.content?.every( + (lp: webllm.ChatCompletionTokenLogprob) => lp.logprob === 0, + ); + console.log(`[Penalties] Logprobs not all zero: ${logprobsNotAllZero}`); + setLabel("penalty-label", `Logprobs not all zero: ${logprobsNotAllZero}`); + await deleteModel(modelId, appConfig); + return logprobsNotAllZero; +} + +async function testLogprobs(modelId: string, appConfig: webllm.AppConfig) { + // Test logprobs: check that logprobs are returned and sum to ~1 after exp + const prompt = "Test logprobs."; + const t0 = performance.now(); + const engine: webllm.MLCEngineInterface = await createEngine( + modelId, + appConfig, + ); + const reply = await engine.chat.completions.create({ + messages: [{ role: "user", content: prompt }], + temperature: 1.0, + max_tokens: 20, + logprobs: true, + top_logprobs: 5, + }); + const t1 = performance.now(); + const logprobs = reply.choices[0]?.logprobs; + + let logprobsAllCloseTo1 = true; + for (const lp of logprobs?.content || []) { + const expSum = lp.top_logprobs?.reduce( + (acc: number, val: webllm.TopLogprob) => acc + Math.exp(val.logprob), + 0, + ); + logprobsAllCloseTo1 &&= Math.abs(expSum - 1.0) < 0.1; + } + console.log(`[Logprobs] Logprobs all close to 1: ${logprobsAllCloseTo1}`); + setLabel("logprobs-label", `Logprobs all close to 1: ${logprobsAllCloseTo1}`); + await deleteModel(modelId, appConfig); + return logprobsAllCloseTo1; +} + +async function main() { + const modelId = "Qwen3-0.6B-q0f32-MLC"; + const appConfig = webllm.prebuiltAppConfig; + appConfig.useIndexedDBCache = true; + setLabel("gpu-test-label", "Running tests..."); + let passed = 0, + total = 0; + + if (await testLogitProcessor(modelId, appConfig)) passed++; + total++; + if (await testLogitBias(modelId, appConfig)) passed++; + total++; + if (await testPenalties(modelId, appConfig)) passed++; + total++; + if (await testLogprobs(modelId, appConfig)) passed++; + total++; + + setLabel( + "gpu-test-label", + `GPU sampleTokenFromLogits tests: ${passed}/${total} passed.`, + ); + setLabel( + "gpu-test-label", + `Tests complete. Model deleted. ${passed}/${total} passed.`, + ); +} + +main(); From d7147365c0c76e0d9c44d01c1ee7794f3388aec2 Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Tue, 9 Sep 2025 23:57:17 -0400 Subject: [PATCH 09/14] Update documentation for repetition_penalty --- src/openai_api_protocols/chat_completion.ts | 6 +++--- src/openai_api_protocols/completion.ts | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/openai_api_protocols/chat_completion.ts b/src/openai_api_protocols/chat_completion.ts index e1ee031a..e50352d6 100644 --- a/src/openai_api_protocols/chat_completion.ts +++ b/src/openai_api_protocols/chat_completion.ts @@ -126,9 +126,9 @@ export interface ChatCompletionRequestBase { presence_penalty?: number | null; /** - * Number greater than or equal to 1.0. Values greater than 1.0 discourage - * the model from repeating tokens that have already been generated. Repetition - * penalty is like presence penalty but is multiplicative. + * Penalizes new tokens based on whether they appear in the prompt and the + * generated text so far. Values greater than 1.0 encourage the model to use new + * tokens, while values less than 1.0 encourage the model to repeat tokens. 
*/ repetition_penalty?: number | null; diff --git a/src/openai_api_protocols/completion.ts b/src/openai_api_protocols/completion.ts index 54fcc34f..0534fe93 100644 --- a/src/openai_api_protocols/completion.ts +++ b/src/openai_api_protocols/completion.ts @@ -138,9 +138,9 @@ export interface CompletionCreateParamsBase { presence_penalty?: number | null; /** - * Number greater than or equal to 1.0. Values greater than 1.0 discourage - * the model from repeating tokens that have already been generated. Repetition - * penalty is like presence penalty but is multiplicative. + * Penalizes new tokens based on whether they appear in the prompt and the + * generated text so far. Values greater than 1.0 encourage the model to use new + * tokens, while values less than 1.0 encourage the model to repeat tokens. */ repetition_penalty?: number | null; From 391254a8907b5e19911a2afae4f2b51fe47b6669 Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Sat, 13 Sep 2025 02:08:06 -0400 Subject: [PATCH 10/14] Clean up code and rename sanity-checks --- .gitignore | 1 - .../src/get_started_latency_breakdown.ts | 31 +------------------ .../scripts/sanity_checks}/README.md | 0 .../scripts/sanity_checks}/package.json | 6 ++-- .../scripts/sanity_checks}/sanity_checks.html | 0 .../scripts/sanity_checks}/sanity_checks.ts | 3 -- 6 files changed, 4 insertions(+), 37 deletions(-) rename examples/{sanity-checks => tests/scripts/sanity_checks}/README.md (100%) rename examples/{sanity-checks => tests/scripts/sanity_checks}/package.json (66%) rename examples/{sanity-checks/src => tests/scripts/sanity_checks}/sanity_checks.html (100%) rename examples/{sanity-checks/src => tests/scripts/sanity_checks}/sanity_checks.ts (98%) diff --git a/.gitignore b/.gitignore index 8de96b40..adc767f8 100644 --- a/.gitignore +++ b/.gitignore @@ -324,5 +324,4 @@ node_modules lib .parcel-cache -examples/tests **/.next \ No newline at end of file diff --git a/examples/get-started-latency-breakdown/src/get_started_latency_breakdown.ts b/examples/get-started-latency-breakdown/src/get_started_latency_breakdown.ts index 26d9565e..104af5da 100644 --- a/examples/get-started-latency-breakdown/src/get_started_latency_breakdown.ts +++ b/examples/get-started-latency-breakdown/src/get_started_latency_breakdown.ts @@ -45,7 +45,6 @@ async function main() { setLabel("init-label", report.text); }; // Option 1: If we do not specify appConfig, we use `prebuiltAppConfig` defined in `config.ts` - // const selectedModel = "Llama-3.1-8B-Instruct-q4f32_1-MLC"; const selectedModel = "Qwen3-0.6B-q0f32-MLC"; const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine( selectedModel, @@ -61,34 +60,6 @@ async function main() { }, ); - // Option 2: Specify your own model other than the prebuilt ones - // const appConfig: webllm.AppConfig = { - // model_list: [ - // { - // model: "https://huggingface.co/mlc-ai/Llama-3.1-8B-Instruct-q4f32_1-MLC", - // model_id: "Llama-3.1-8B-Instruct-q4f32_1-MLC", - // model_lib: - // webllm.modelLibURLPrefix + - // webllm.modelVersion + - // "/Llama-3_1-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm", - // overrides: { - // context_window_size: 2048, - // }, - // }, - // ], - // }; - // const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine( - // selectedModel, - // { appConfig: appConfig, initProgressCallback: initProgressCallback }, - // ); - - // Option 3: Instantiate MLCEngine() and call reload() separately - // const engine: webllm.MLCEngineInterface = new webllm.MLCEngine({ - 
// appConfig: appConfig, // if do not specify, we use webllm.prebuiltAppConfig - // initProgressCallback: initProgressCallback, - // }); - // await engine.reload(selectedModel); - const latencyBreakdown: LatencyBreakdown = { logitProcessorTime: [], logitBiasTime: [], @@ -97,7 +68,7 @@ async function main() { totalTime: [], grammarBitmaskTime: [], }; - // want decode_tokens_per_s, e2e_latency_s, time_per_output_token_s, completion_tokens + const decodeTokensPerS: number[] = []; const completionTokens: number[] = []; const e2eLatencyS: number[] = []; diff --git a/examples/sanity-checks/README.md b/examples/tests/scripts/sanity_checks/README.md similarity index 100% rename from examples/sanity-checks/README.md rename to examples/tests/scripts/sanity_checks/README.md diff --git a/examples/sanity-checks/package.json b/examples/tests/scripts/sanity_checks/package.json similarity index 66% rename from examples/sanity-checks/package.json rename to examples/tests/scripts/sanity_checks/package.json index b40e6464..aefb0f05 100644 --- a/examples/sanity-checks/package.json +++ b/examples/tests/scripts/sanity_checks/package.json @@ -1,10 +1,10 @@ { - "name": "sanity-checks", + "name": "tests", "version": "0.1.0", "private": true, "scripts": { - "start": "parcel src/sanity_checks.html --port 8889", - "build": "parcel build src/sanity_checks.html --dist-dir lib" + "start": "parcel sanity_checks.html --port 8889", + "build": "parcel build sanity_checks.html --dist-dir lib" }, "devDependencies": { "buffer": "^5.7.1", diff --git a/examples/sanity-checks/src/sanity_checks.html b/examples/tests/scripts/sanity_checks/sanity_checks.html similarity index 100% rename from examples/sanity-checks/src/sanity_checks.html rename to examples/tests/scripts/sanity_checks/sanity_checks.html diff --git a/examples/sanity-checks/src/sanity_checks.ts b/examples/tests/scripts/sanity_checks/sanity_checks.ts similarity index 98% rename from examples/sanity-checks/src/sanity_checks.ts rename to examples/tests/scripts/sanity_checks/sanity_checks.ts index b0f51d1b..da842353 100644 --- a/examples/sanity-checks/src/sanity_checks.ts +++ b/examples/tests/scripts/sanity_checks/sanity_checks.ts @@ -71,7 +71,6 @@ async function testLogitProcessor( async function testLogitBias(modelId: string, appConfig: webllm.AppConfig) { // Set logit_bias to strongly favor token 0 const prompt = "Test logit bias."; - // const t0 = performance.now(); const engine: webllm.MLCEngineInterface = await createEngine( modelId, appConfig, @@ -128,7 +127,6 @@ async function testPenalties(modelId: string, appConfig: webllm.AppConfig) { async function testLogprobs(modelId: string, appConfig: webllm.AppConfig) { // Test logprobs: check that logprobs are returned and sum to ~1 after exp const prompt = "Test logprobs."; - const t0 = performance.now(); const engine: webllm.MLCEngineInterface = await createEngine( modelId, appConfig, @@ -140,7 +138,6 @@ async function testLogprobs(modelId: string, appConfig: webllm.AppConfig) { logprobs: true, top_logprobs: 5, }); - const t1 = performance.now(); const logprobs = reply.choices[0]?.logprobs; let logprobsAllCloseTo1 = true; From aa2422d9dc123a2767a9823323ef59ced567e84b Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Sat, 13 Sep 2025 02:12:40 -0400 Subject: [PATCH 11/14] Update sanity_checks package.json --- examples/tests/scripts/sanity_checks/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tests/scripts/sanity_checks/package.json 
b/examples/tests/scripts/sanity_checks/package.json index aefb0f05..cf86fb3d 100644 --- a/examples/tests/scripts/sanity_checks/package.json +++ b/examples/tests/scripts/sanity_checks/package.json @@ -1,5 +1,5 @@ { - "name": "tests", + "name": "sanity_checks", "version": "0.1.0", "private": true, "scripts": { From 891dc78768186d421ba0f8f2530fe1f54c1de1f3 Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Sat, 13 Sep 2025 02:20:21 -0400 Subject: [PATCH 12/14] Use Prettier code style --- .../get-started-latency-breakdown/README.md | 2 +- .../scripts/sanity_checks/sanity_checks.html | 42 ++++++++++++------- src/llm_chat.ts | 1 - 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/examples/get-started-latency-breakdown/README.md b/examples/get-started-latency-breakdown/README.md index 0a74ea26..2a8f6967 100644 --- a/examples/get-started-latency-breakdown/README.md +++ b/examples/get-started-latency-breakdown/README.md @@ -1,6 +1,6 @@ # WebLLM Get Started App -This folder provides a minimum demo to show WebLLM API in a webapp setting with +This folder provides a minimum demo to show WebLLM API in a webapp setting with collection of latency statistics for individual token sampling steps. To try it out, you can do the following steps under this folder diff --git a/examples/tests/scripts/sanity_checks/sanity_checks.html b/examples/tests/scripts/sanity_checks/sanity_checks.html index bfbfc869..2b662f71 100644 --- a/examples/tests/scripts/sanity_checks/sanity_checks.html +++ b/examples/tests/scripts/sanity_checks/sanity_checks.html @@ -1,17 +1,31 @@ - + - + GPU sampleTokenFromLogits Tests - - + +

[sanity_checks.html re-indented by Prettier across two hunks; the "GPU sampleTokenFromLogits Tests" heading and the Overall, Logit Processor, Logit Bias, Penalties, and Logprobs status labels are unchanged in content.]
- + diff --git a/src/llm_chat.ts b/src/llm_chat.ts index 7266b5e4..8f7620a8 100644 --- a/src/llm_chat.ts +++ b/src/llm_chat.ts @@ -1276,7 +1276,6 @@ export class LLMChatPipeline { temperature = Math.max(1e-6, temperature); // to prevent division by zero const numSeqs = 1; - const numTokens = this.appearedTokensFreq.size; const temperatures = new Float32Array([temperature]); From ce95b079e761bf83886b21615c72793cc812ec5a Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Sat, 13 Sep 2025 02:30:15 -0400 Subject: [PATCH 13/14] Update .gitignore with package-lock and move sanity checks --- .gitignore | 1 + {examples/tests => tests}/scripts/sanity_checks/README.md | 0 {examples/tests => tests}/scripts/sanity_checks/package.json | 0 .../tests => tests}/scripts/sanity_checks/sanity_checks.html | 0 {examples/tests => tests}/scripts/sanity_checks/sanity_checks.ts | 0 5 files changed, 1 insertion(+) rename {examples/tests => tests}/scripts/sanity_checks/README.md (100%) rename {examples/tests => tests}/scripts/sanity_checks/package.json (100%) rename {examples/tests => tests}/scripts/sanity_checks/sanity_checks.html (100%) rename {examples/tests => tests}/scripts/sanity_checks/sanity_checks.ts (100%) diff --git a/.gitignore b/.gitignore index adc767f8..2dbbf58f 100644 --- a/.gitignore +++ b/.gitignore @@ -323,5 +323,6 @@ tvm_home node_modules lib .parcel-cache +package-lock.json **/.next \ No newline at end of file diff --git a/examples/tests/scripts/sanity_checks/README.md b/tests/scripts/sanity_checks/README.md similarity index 100% rename from examples/tests/scripts/sanity_checks/README.md rename to tests/scripts/sanity_checks/README.md diff --git a/examples/tests/scripts/sanity_checks/package.json b/tests/scripts/sanity_checks/package.json similarity index 100% rename from examples/tests/scripts/sanity_checks/package.json rename to tests/scripts/sanity_checks/package.json diff --git a/examples/tests/scripts/sanity_checks/sanity_checks.html b/tests/scripts/sanity_checks/sanity_checks.html similarity index 100% rename from examples/tests/scripts/sanity_checks/sanity_checks.html rename to tests/scripts/sanity_checks/sanity_checks.html diff --git a/examples/tests/scripts/sanity_checks/sanity_checks.ts b/tests/scripts/sanity_checks/sanity_checks.ts similarity index 100% rename from examples/tests/scripts/sanity_checks/sanity_checks.ts rename to tests/scripts/sanity_checks/sanity_checks.ts From fa4c80f35bc6b44e280f3560899279b7064f8277 Mon Sep 17 00:00:00 2001 From: akaashrp <43900735+akaashrp@users.noreply.github.com> Date: Sat, 13 Sep 2025 02:35:24 -0400 Subject: [PATCH 14/14] Added tests-specific .gitignore --- .gitignore | 1 - tests/.gitignore | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 tests/.gitignore diff --git a/.gitignore b/.gitignore index 2dbbf58f..adc767f8 100644 --- a/.gitignore +++ b/.gitignore @@ -323,6 +323,5 @@ tvm_home node_modules lib .parcel-cache -package-lock.json **/.next \ No newline at end of file diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 00000000..d8b83df9 --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1 @@ +package-lock.json
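As a closing note on the `repetition_penalty` documentation updated in this series: the sketch below illustrates the multiplicative convention that wording describes, under the assumption of a simple per-token scheme over previously generated token ids (`applyRepetitionPenalty` is an illustrative name, not the kernel invoked by the GPU sampling path).

```ts
// Hedged sketch, assuming the common multiplicative convention: positive logits are
// divided by the penalty and negative logits are multiplied by it, so values > 1.0
// discourage tokens that already appeared and values < 1.0 encourage repeating them.
// (Presence and frequency penalties are additive adjustments by contrast.)
function applyRepetitionPenalty(
  logits: Float32Array,
  appearedTokenIds: Iterable<number>,
  repetitionPenalty: number, // e.g. 1.1
): void {
  for (const id of appearedTokenIds) {
    const v = logits[id];
    logits[id] = v > 0 ? v / repetitionPenalty : v * repetitionPenalty;
  }
}

// Example: penalize tokens 0 and 1 that already appeared in the output.
const logits = new Float32Array([2.0, -1.0, 0.5]);
applyRepetitionPenalty(logits, [0, 1], 1.1); // logits becomes ~[1.82, -1.1, 0.5]
```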