1 change: 0 additions & 1 deletion .gitignore
@@ -324,5 +324,4 @@ node_modules
lib
.parcel-cache

examples/tests
**/.next
15 changes: 15 additions & 0 deletions examples/get-started-latency-breakdown/README.md
@@ -0,0 +1,15 @@
# WebLLM Get Started App

This folder provides a minimal demo of the WebLLM API in a web app setting that collects
latency statistics for individual token sampling steps; a short sketch of the underlying
API flow follows the commands below.
To try it out, run the following commands in this folder:

```bash
npm install
npm start
```
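
For reference, the latency breakdown is requested through the `enable_latency_breakdown` flag in `extra_body` and returned under `usage.extra.latencyBreakdown` (both added in this PR). A minimal sketch of that flow, independent of the full example source:

```ts
import * as webllm from "@mlc-ai/web-llm";

// Minimal sketch: request per-step sampling latencies alongside a chat completion.
const engine = await webllm.CreateMLCEngine("Qwen3-0.6B-q0f32-MLC");
const reply = await engine.chat.completions.create({
  messages: [{ role: "user", content: "Hello!" }],
  extra_body: { enable_latency_breakdown: true },
});
// Each field (sampleTime, penaltyTime, totalTime, ...) is an array of
// per-token timings for the corresponding sampling step.
console.log(reply.usage?.extra.latencyBreakdown);
```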

Note: if you would like to hack on the WebLLM core package, you can change the
`@mlc-ai/web-llm` dependency to `"file:../.."` and follow the build-from-source
instructions in the project to build WebLLM locally. This option is only recommended
for development on WebLLM itself.
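
For example, the dependency section of this example's `package.json` would then look roughly like this (a sketch):

```json
{
  "dependencies": {
    "@mlc-ai/web-llm": "file:../.."
  }
}
```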
20 changes: 20 additions & 0 deletions examples/get-started-latency-breakdown/package.json
@@ -0,0 +1,20 @@
{
  "name": "get-started-latency-breakdown",
  "version": "0.1.0",
  "private": true,
  "scripts": {
    "start": "parcel src/get_started_latency_breakdown.html --port 8888",
    "build": "parcel build src/get_started_latency_breakdown.html --dist-dir lib"
  },
  "devDependencies": {
    "buffer": "^5.7.1",
    "parcel": "^2.8.3",
    "process": "^0.11.10",
    "tslib": "^2.3.1",
    "typescript": "^4.9.5",
    "url": "^0.11.3"
  },
  "dependencies": {
    "@mlc-ai/web-llm": "^0.2.79"
  }
}
23 changes: 23 additions & 0 deletions examples/get-started-latency-breakdown/src/get_started_latency_breakdown.html
@@ -0,0 +1,23 @@
<!doctype html>
<html>
  <script>
    webLLMGlobal = {};
  </script>
  <body>
    <h2>WebLLM Test Page</h2>
    Open console to see output
    <br />
    <br />
    <label id="init-label"> </label>

    <h3>Prompt</h3>
    <label id="prompt-label"> </label>

    <h3>Response</h3>
    <label id="generate-label"> </label>
    <br />
    <label id="stats-label"> </label>

    <script type="module" src="./get_started_latency_breakdown.ts"></script>
  </body>
</html>
135 changes: 135 additions & 0 deletions examples/get-started-latency-breakdown/src/get_started_latency_breakdown.ts
@@ -0,0 +1,135 @@
import * as webllm from "@mlc-ai/web-llm";

function setLabel(id: string, text: string) {
  const label = document.getElementById(id);
  if (label == null) {
    throw Error("Cannot find label " + id);
  }
  label.innerText = text;
}

type LatencyBreakdown = {
  logitProcessorTime: number[];
  logitBiasTime: number[];
  penaltyTime: number[];
  sampleTime: number[];
  totalTime: number[];
  grammarBitmaskTime: number[];
};
function computeStats(
  latency_breakdown: LatencyBreakdown,
): Record<string, any> {
  function _computeStats(arr: number[]) {
    if (!arr.length) return undefined;
    const sorted = [...arr].sort((a, b) => a - b);
    const sum = arr.reduce((a, b) => a + b, 0);
    const avg = sum / arr.length;
    const min = sorted[0];
    const max = sorted[sorted.length - 1];
    const p99 = sorted[Math.floor(0.99 * (sorted.length - 1))];
    return { avg, min, max, p99 };
  }

  const latencyStats: Record<string, any> = {};
  for (const key of Object.keys(latency_breakdown)) {
    const arr = (latency_breakdown as any)[key];
    if (Array.isArray(arr) && arr.length > 0) {
      latencyStats[key] = _computeStats(arr);
    }
  }
  return latencyStats;
}

async function main() {
  const initProgressCallback = (report: webllm.InitProgressReport) => {
    setLabel("init-label", report.text);
  };
  // Option 1: If we do not specify appConfig, we use `prebuiltAppConfig` defined in `config.ts`
  const selectedModel = "Qwen3-0.6B-q0f32-MLC";
  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(
    selectedModel,
    {
      initProgressCallback: initProgressCallback,
      logLevel: "INFO", // specify the log level
    },
    // customize kv cache, use either context_window_size or sliding_window_size (with attention sink)
    {
      context_window_size: 2048,
      // sliding_window_size: 1024,
      // attention_sink_size: 4,
    },
  );

  const latencyBreakdown: LatencyBreakdown = {
    logitProcessorTime: [],
    logitBiasTime: [],
    penaltyTime: [],
    sampleTime: [],
    totalTime: [],
    grammarBitmaskTime: [],
  };

  const decodeTokensPerS: number[] = [];
  const completionTokens: number[] = [];
  const e2eLatencyS: number[] = [];
  const timePerOutputTokenS: number[] = [];
  const numTrials = 20;
  for (let i = 0; i < numTrials; i++) {
    console.log(`Trial ${i + 1} / ${numTrials}`);
    const reply0 = await engine.chat.completions.create({
      messages: [{ role: "user", content: "List twenty US states." }],
      // below configurations are all optional
      n: 1,
      temperature: 0,
      max_tokens: 2048,
      // 46510 and 7188 are "California", and 8421 and 51325 are "Texas" in Llama-3.1-8B-Instruct
      // So we would have a higher chance of seeing the latter two, but never the first in the answer
      // logit_bias: {
      //   "46510": -100,
      //   "7188": -100,
      //   "8421": 5,
      //   "41325": 5,
      // },
      top_p: 0.8,
      logprobs: true,
      top_logprobs: 2,
      frequency_penalty: 1.2,
      presence_penalty: 1.0,
      repetition_penalty: 1.1,
      // Ask the engine to attach per-step sampling latencies to usage.extra;
      // per the engine changes in this PR, the breakdown is only returned
      // when this flag is set.
      extra_body: { enable_latency_breakdown: true },
    });

    const logitProcessorTime =
      reply0.usage?.extra.latencyBreakdown?.logitProcessorTime;
    const logitBiasTime = reply0.usage?.extra.latencyBreakdown?.logitBiasTime;
    const penaltyTime = reply0.usage?.extra.latencyBreakdown?.penaltyTime;
    const sampleTime = reply0.usage?.extra.latencyBreakdown?.sampleTime;
    const totalTime = reply0.usage?.extra.latencyBreakdown?.totalTime;
    const grammarBitmaskTime =
      reply0.usage?.extra.latencyBreakdown?.grammarBitmaskTime;

    latencyBreakdown.logitProcessorTime.push(...(logitProcessorTime || []));
    latencyBreakdown.logitBiasTime.push(...(logitBiasTime || []));
    latencyBreakdown.penaltyTime.push(...(penaltyTime || []));
    latencyBreakdown.sampleTime.push(...(sampleTime || []));
    latencyBreakdown.totalTime.push(...(totalTime || []));
    latencyBreakdown.grammarBitmaskTime.push(...(grammarBitmaskTime || []));

    decodeTokensPerS.push(reply0.usage?.extra.decode_tokens_per_s || 0);
    e2eLatencyS.push(reply0.usage?.extra.e2e_latency_s || 0);
    timePerOutputTokenS.push(reply0.usage?.extra.time_per_output_token_s || 0);
    completionTokens.push(reply0.usage?.completion_tokens || 0);
  }

  const latencyStats: Record<string, any> = computeStats(latencyBreakdown);
  console.log("Latency stats: ", latencyStats);
  console.log("Decode tokens per second: ", decodeTokensPerS);
  console.log("Completion tokens: ", completionTokens);
  console.log("E2E latency (s): ", e2eLatencyS);
  console.log("Time per output token (s): ", timePerOutputTokenS);

  // To change model, either create a new engine via `CreateMLCEngine()`, or call `engine.reload(modelId)`
}

main();
3 changes: 2 additions & 1 deletion src/config.ts
@@ -126,7 +126,7 @@ export interface MLCEngineConfig {
*/
export interface GenerationConfig {
// Only used in MLC
repetition_penalty?: number;
repetition_penalty?: number | null;
ignore_eos?: boolean;
// Shared by MLC and OpenAI APIs
top_p?: number | null;
@@ -143,6 +143,7 @@ export interface GenerationConfig {
response_format?: ResponseFormat | null;
// extra_body in ChatCompletionsRequest
enable_thinking?: boolean | null;
enable_latency_breakdown?: boolean | null;
}

export function postInitAndCheckGenerationConfigValues(
23 changes: 23 additions & 0 deletions src/engine.ts
@@ -41,6 +41,7 @@ import {
MLCEngineInterface,
LogitProcessor,
LogLevel,
LatencyBreakdown,
} from "./types";
import {
compareConversationObject,
@@ -694,12 +695,18 @@ export class MLCEngine implements MLCEngineInterface {
const decode_time = pipeline.getCurRoundDecodingTotalTime();
const grammar_per_token_s =
pipeline.getCurRoundGrammarPerTokenTotalTime();
const latencyBreakdown: LatencyBreakdown =
pipeline.getCurRoundLatencyBreakdown();

const defaultExtra = {
e2e_latency_s: (Date.now() - timeReceived) / 1000,
prefill_tokens_per_s: prefill_tokens_per_s,
decode_tokens_per_s: decode_tokens_per_s,
time_to_first_token_s: prefill_time,
time_per_output_token_s: decode_time / completion_tokens,
latencyBreakdown: request.extra_body?.enable_latency_breakdown
? latencyBreakdown
: undefined,
};
const usage: CompletionUsage = {
completion_tokens: completion_tokens,
@@ -783,6 +790,7 @@ const genConfig: GenerationConfig = {
const genConfig: GenerationConfig = {
frequency_penalty: request.frequency_penalty,
presence_penalty: request.presence_penalty,
repetition_penalty: request.repetition_penalty,
max_tokens: request.max_tokens,
stop: request.stop,
top_p: request.top_p,
@@ -793,6 +801,7 @@
response_format: request.response_format,
ignore_eos: request.ignore_eos,
enable_thinking: request.extra_body?.enable_thinking,
enable_latency_breakdown: request.extra_body?.enable_latency_breakdown,
};

// 0.5 Block wait until this pipeline finishes all previous requests
@@ -890,12 +899,19 @@
"response_format" in request &&
(request.response_format?.type === "grammar" ||
request.response_format?.type === "json_object");

const latencyBreakdown: LatencyBreakdown =
selectedPipeline.getCurRoundLatencyBreakdown();

const defaultExtra = {
e2e_latency_s: (Date.now() - timeReceived) / 1000,
prefill_tokens_per_s: prompt_tokens / prefill_time,
decode_tokens_per_s: completion_tokens / decode_time,
time_to_first_token_s: prefill_time,
time_per_output_token_s: decode_time / completion_tokens,
latencyBreakdown: request.extra_body?.enable_latency_breakdown
? latencyBreakdown
: undefined,
};
const response: ChatCompletion = {
id: crypto.randomUUID(),
@@ -958,6 +974,7 @@ export class MLCEngine implements MLCEngineInterface {
const genConfig: GenerationConfig = {
frequency_penalty: request.frequency_penalty,
presence_penalty: request.presence_penalty,
repetition_penalty: request.repetition_penalty,
max_tokens: request.max_tokens,
stop: request.stop,
top_p: request.top_p,
@@ -1030,6 +1047,9 @@ export class MLCEngine implements MLCEngineInterface {
decode_time += selectedPipeline.getCurRoundDecodingTotalTime();
}

const latencyBreakdown: LatencyBreakdown =
selectedPipeline.getCurRoundLatencyBreakdown();

const response: Completion = {
id: crypto.randomUUID(),
choices: choices,
@@ -1046,6 +1066,9 @@ export class MLCEngine implements MLCEngineInterface {
decode_tokens_per_s: completion_tokens / decode_time,
time_to_first_token_s: prefill_time,
time_per_output_token_s: decode_time / completion_tokens,
latencyBreakdown: request.extra_body?.enable_latency_breakdown
? latencyBreakdown
: undefined,
},
} as CompletionUsage,
};