Commit e4b4dc2

[Kernels] Replace CPU Function Calls with GPU Kernel Invocations (#697)
1. Replace CPU function calls for the following tasks with GPU kernel invocations:
   - Apply logit bias
   - Apply penalties to logits
   - Compute softmax with temperature (sampling will be replaced in a future PR)
2. Fix a bug where the repetition penalty was not used from the generation config:
   - Added repetition_penalty to the CompletionCreateParamsBase and ChatCompletionRequestBase interfaces
   - Updated its definition in GenerationConfig and added a reference in engine.ts
3. Add a field to the CompletionCreateParamsBase and ChatCompletionRequestBase interfaces to enable logging of the time taken for individual steps
4. Add sanity checks for individual steps in sampleTokenFromLogits

Performance comparison: "canonical" flows averaged across 20 runs (no logit_bias, no logitProcessor, penalties applied, with and without logprobs):

1. Before PR, without logprobs: ~0.064 s per output token (~15.63 decode tokens/s)
2. After PR, without logprobs: ~0.066 s per output token (~15.15 decode tokens/s)
3. Before PR, with logprobs: ~0.052 s per output token (~19.23 decode tokens/s)
4. After PR, with logprobs: ~0.048 s per output token (~20.83 decode tokens/s)

Additional notes:
- Need to profile sampleTopPFromLogits vs. sampleTopPFromProb on the CPU to determine why performance with logprobs is better
- Applying logit_bias is much faster on the GPU than on the CPU
- There are additional overheads outside of sampleTokenFromLogits that make the improvement less pronounced (the total time spent in sampleTokenFromLogits is ~0.0117 s before the PR and ~0.0076 s after the PR)
1 parent d8b25fe commit e4b4dc2
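To make the change concrete, below is a minimal CPU-side sketch of the three per-token steps that this commit moves into GPU kernels. The function name, parameters, and exact penalty formulas are illustrative assumptions (OpenAI-style frequency/presence penalties and a CTRL-style repetition penalty), not WebLLM's actual kernel code:

```typescript
// Illustrative CPU reference for the per-token steps moved onto the GPU in this
// commit. All names here are hypothetical; in WebLLM the equivalent work now runs
// as GPU kernels inside sampleTokenFromLogits.
function applyLogitStepsCPU(
  logits: Float32Array,
  logitBias: Record<number, number>, // token id -> bias, as in OpenAI's logit_bias
  tokenCounts: Map<number, number>, // how often each token was already generated
  frequencyPenalty: number,
  presencePenalty: number,
  repetitionPenalty: number,
  temperature: number, // assumed > 0; temperature == 0 is handled as greedy sampling
): Float32Array {
  // 1. Apply logit bias: add the requested offset to each biased token id.
  for (const [id, bias] of Object.entries(logitBias)) {
    logits[Number(id)] += bias;
  }
  // 2. Apply penalties to tokens that already appeared (frequency/presence as in
  //    the OpenAI API; repetition penalty in the CTRL style).
  for (const [id, count] of tokenCounts) {
    logits[id] -= frequencyPenalty * count + presencePenalty;
    logits[id] =
      logits[id] > 0
        ? logits[id] / repetitionPenalty
        : logits[id] * repetitionPenalty;
  }
  // 3. Softmax with temperature, computed in a numerically stable way.
  const probs = new Float32Array(logits.length);
  let maxScaled = -Infinity;
  for (const x of logits) {
    maxScaled = Math.max(maxScaled, x / temperature);
  }
  let sum = 0;
  for (let i = 0; i < logits.length; i++) {
    probs[i] = Math.exp(logits[i] / temperature - maxScaled);
    sum += probs[i];
  }
  for (let i = 0; i < probs.length; i++) {
    probs[i] /= sum;
  }
  return probs;
}
```

On the GPU these steps run as kernel launches over the vocabulary-sized logits tensor instead of loops over data copied back to the CPU, which is why applying logit_bias in particular benefits, per the notes above.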

File tree

16 files changed (+752, -75 lines)


.gitignore

Lines changed: 0 additions & 1 deletion
@@ -324,5 +324,4 @@ node_modules
 lib
 .parcel-cache

-examples/tests
 **/.next
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
# WebLLM Get Started App

This folder provides a minimum demo to show WebLLM API in a webapp setting with
collection of latency statistics for individual token sampling steps.
To try it out, you can do the following steps under this folder

```bash
npm install
npm start
```

Note if you would like to hack WebLLM core package.
You can change web-llm dependencies as `"file:../.."`, and follow the build from source
instruction in the project to build webllm locally. This option is only recommended
if you would like to hack WebLLM core package.
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
{
  "name": "get-started-latency-breakdown",
  "version": "0.1.0",
  "private": true,
  "scripts": {
    "start": "parcel src/get_started_latency_breakdown.html --port 8888",
    "build": "parcel build src/get_started_latency_breakdown.html --dist-dir lib"
  },
  "devDependencies": {
    "buffer": "^5.7.1",
    "parcel": "^2.8.3",
    "process": "^0.11.10",
    "tslib": "^2.3.1",
    "typescript": "^4.9.5",
    "url": "^0.11.3"
  },
  "dependencies": {
    "@mlc-ai/web-llm": "^0.2.79"
  }
}
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
<!doctype html>
<html>
  <script>
    webLLMGlobal = {};
  </script>
  <body>
    <h2>WebLLM Test Page</h2>
    Open console to see output
    <br />
    <br />
    <label id="init-label"> </label>

    <h3>Prompt</h3>
    <label id="prompt-label"> </label>

    <h3>Response</h3>
    <label id="generate-label"> </label>
    <br />
    <label id="stats-label"> </label>

    <script type="module" src="./get_started_latency_breakdown.ts"></script>
  </body>
</html>
Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
import * as webllm from "@mlc-ai/web-llm";

function setLabel(id: string, text: string) {
  const label = document.getElementById(id);
  if (label == null) {
    throw Error("Cannot find label " + id);
  }
  label.innerText = text;
}

type LatencyBreakdown = {
  logitProcessorTime: number[];
  logitBiasTime: number[];
  penaltyTime: number[];
  sampleTime: number[];
  totalTime: number[];
  grammarBitmaskTime: number[];
};
function computeStats(
  latency_breakdown: LatencyBreakdown,
): Record<string, any> {
  function _computeStats(arr: number[]) {
    if (!arr.length) return undefined;
    const sorted = [...arr].sort((a, b) => a - b);
    const sum = arr.reduce((a, b) => a + b, 0);
    const avg = sum / arr.length;
    const min = sorted[0];
    const max = sorted[sorted.length - 1];
    const p99 = sorted[Math.floor(0.99 * (sorted.length - 1))];
    return { avg, min, max, p99 };
  }

  const latencyStats: Record<string, any> = {};
  for (const key of Object.keys(latency_breakdown)) {
    const arr = (latency_breakdown as any)[key];
    if (Array.isArray(arr) && arr.length > 0) {
      latencyStats[key] = _computeStats(arr);
    }
  }
  return latencyStats;
}

async function main() {
  const initProgressCallback = (report: webllm.InitProgressReport) => {
    setLabel("init-label", report.text);
  };
  // Option 1: If we do not specify appConfig, we use `prebuiltAppConfig` defined in `config.ts`
  const selectedModel = "Qwen3-0.6B-q0f32-MLC";
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    selectedModel,
    {
      initProgressCallback: initProgressCallback,
      logLevel: "INFO", // specify the log level
    },
    // customize kv cache, use either context_window_size or sliding_window_size (with attention sink)
    {
      context_window_size: 2048,
      // sliding_window_size: 1024,
      // attention_sink_size: 4,
    },
  );

  const latencyBreakdown: LatencyBreakdown = {
    logitProcessorTime: [],
    logitBiasTime: [],
    penaltyTime: [],
    sampleTime: [],
    totalTime: [],
    grammarBitmaskTime: [],
  };

  const decodeTokensPerS: number[] = [];
  const completionTokens: number[] = [];
  const e2eLatencyS: number[] = [];
  const timePerOutputTokenS: number[] = [];

  const numTrials = 20;
  for (let i = 0; i < numTrials; i++) {
    console.log(`Trial ${i + 1} / ${numTrials}`);
    const reply0 = await engine.chat.completions.create({
      messages: [{ role: "user", content: "List twenty US states." }],
      // below configurations are all optional
      n: 1,
      temperature: 0,
      max_tokens: 2048,
      // 46510 and 7188 are "California", and 8421 and 51325 are "Texas" in Llama-3.1-8B-Instruct
      // So we would have a higher chance of seeing the latter two, but never the first in the answer
      // logit_bias: {
      //   "46510": -100,
      //   "7188": -100,
      //   "8421": 5,
      //   "41325": 5,
      // },
      top_p: 0.8,
      logprobs: true,
      top_logprobs: 2,
      frequency_penalty: 1.2,
      presence_penalty: 1.0,
      repetition_penalty: 1.1,
    });

    const logitProcessorTime =
      reply0.usage?.extra.latencyBreakdown?.logitProcessorTime;
    const logitBiasTime = reply0.usage?.extra.latencyBreakdown?.logitBiasTime;
    const penaltyTime = reply0.usage?.extra.latencyBreakdown?.penaltyTime;
    const sampleTime = reply0.usage?.extra.latencyBreakdown?.sampleTime;
    const totalTime = reply0.usage?.extra.latencyBreakdown?.totalTime;
    const grammarBitmaskTime =
      reply0.usage?.extra.latencyBreakdown?.grammarBitmaskTime;

    latencyBreakdown.logitProcessorTime.push(...(logitProcessorTime || []));
    latencyBreakdown.logitBiasTime.push(...(logitBiasTime || []));
    latencyBreakdown.penaltyTime.push(...(penaltyTime || []));
    latencyBreakdown.sampleTime.push(...(sampleTime || []));
    latencyBreakdown.totalTime.push(...(totalTime || []));
    latencyBreakdown.grammarBitmaskTime.push(...(grammarBitmaskTime || []));

    decodeTokensPerS.push(reply0.usage?.extra.decode_tokens_per_s || 0);
    e2eLatencyS.push(reply0.usage?.extra.e2e_latency_s || 0);
    timePerOutputTokenS.push(reply0.usage?.extra.time_per_output_token_s || 0);
    completionTokens.push(reply0.usage?.completion_tokens || 0);
  }

  const latencyStats: { [key: string]: number } =
    computeStats(latencyBreakdown);
  console.log("Latency stats: ", latencyStats);
  console.log("Decode tokens per second: ", decodeTokensPerS);
  console.log("Completion tokens: ", completionTokens);
  console.log("E2E latency (s): ", e2eLatencyS);
  console.log("Time per output token (s): ", timePerOutputTokenS);

  // To change model, either create a new engine via `CreateMLCEngine()`, or call `engine.reload(modelId)`
}

main();

src/config.ts

Lines changed: 2 additions & 1 deletion
@@ -126,7 +126,7 @@ export interface MLCEngineConfig {
  */
 export interface GenerationConfig {
   // Only used in MLC
-  repetition_penalty?: number;
+  repetition_penalty?: number | null;
   ignore_eos?: boolean;
   // Shared by MLC and OpenAI APIs
   top_p?: number | null;
@@ -143,6 +143,7 @@ export interface GenerationConfig {
   response_format?: ResponseFormat | null;
   // extra_body in ChatCompletionsRequest
   enable_thinking?: boolean | null;
+  enable_latency_breakdown?: boolean | null;
 }

 export function postInitAndCheckGenerationConfigValues(

src/engine.ts

Lines changed: 23 additions & 0 deletions
@@ -41,6 +41,7 @@ import {
   MLCEngineInterface,
   LogitProcessor,
   LogLevel,
+  LatencyBreakdown,
 } from "./types";
 import {
   compareConversationObject,
@@ -694,12 +695,18 @@ export class MLCEngine implements MLCEngineInterface {
     const decode_time = pipeline.getCurRoundDecodingTotalTime();
     const grammar_per_token_s =
       pipeline.getCurRoundGrammarPerTokenTotalTime();
+    const latencyBreakdown: LatencyBreakdown =
+      pipeline.getCurRoundLatencyBreakdown();
+
     const defaultExtra = {
       e2e_latency_s: (Date.now() - timeReceived) / 1000,
       prefill_tokens_per_s: prefill_tokens_per_s,
       decode_tokens_per_s: decode_tokens_per_s,
       time_to_first_token_s: prefill_time,
       time_per_output_token_s: decode_time / completion_tokens,
+      latencyBreakdown: request.extra_body?.enable_latency_breakdown
+        ? latencyBreakdown
+        : undefined,
     };
     const usage: CompletionUsage = {
       completion_tokens: completion_tokens,
@@ -783,6 +790,7 @@ export class MLCEngine implements MLCEngineInterface {
     const genConfig: GenerationConfig = {
       frequency_penalty: request.frequency_penalty,
       presence_penalty: request.presence_penalty,
+      repetition_penalty: request.repetition_penalty,
       max_tokens: request.max_tokens,
       stop: request.stop,
       top_p: request.top_p,
@@ -793,6 +801,7 @@ export class MLCEngine implements MLCEngineInterface {
       response_format: request.response_format,
       ignore_eos: request.ignore_eos,
       enable_thinking: request.extra_body?.enable_thinking,
+      enable_latency_breakdown: request.extra_body?.enable_latency_breakdown,
     };

     // 0.5 Block wait until this pipeline finishes all previous requests
@@ -890,12 +899,19 @@ export class MLCEngine implements MLCEngineInterface {
       "response_format" in request &&
       (request.response_format?.type === "grammar" ||
         request.response_format?.type === "json_object");
+
+    const latencyBreakdown: LatencyBreakdown =
+      selectedPipeline.getCurRoundLatencyBreakdown();
+
     const defaultExtra = {
       e2e_latency_s: (Date.now() - timeReceived) / 1000,
       prefill_tokens_per_s: prompt_tokens / prefill_time,
       decode_tokens_per_s: completion_tokens / decode_time,
       time_to_first_token_s: prefill_time,
       time_per_output_token_s: decode_time / completion_tokens,
+      latencyBreakdown: request.extra_body?.enable_latency_breakdown
+        ? latencyBreakdown
+        : undefined,
     };
     const response: ChatCompletion = {
       id: crypto.randomUUID(),
@@ -958,6 +974,7 @@ export class MLCEngine implements MLCEngineInterface {
     const genConfig: GenerationConfig = {
       frequency_penalty: request.frequency_penalty,
       presence_penalty: request.presence_penalty,
+      repetition_penalty: request.repetition_penalty,
       max_tokens: request.max_tokens,
       stop: request.stop,
       top_p: request.top_p,
@@ -1030,6 +1047,9 @@ export class MLCEngine implements MLCEngineInterface {
       decode_time += selectedPipeline.getCurRoundDecodingTotalTime();
     }

+    const latencyBreakdown: LatencyBreakdown =
+      selectedPipeline.getCurRoundLatencyBreakdown();
+
     const response: Completion = {
       id: crypto.randomUUID(),
       choices: choices,
@@ -1046,6 +1066,9 @@ export class MLCEngine implements MLCEngineInterface {
         decode_tokens_per_s: completion_tokens / decode_time,
         time_to_first_token_s: prefill_time,
         time_per_output_token_s: decode_time / completion_tokens,
+        latencyBreakdown: request.extra_body?.enable_latency_breakdown
+          ? latencyBreakdown
+          : undefined,
       },
     } as CompletionUsage,
   };
0 commit comments
