Merged
152 changes: 120 additions & 32 deletions examples/benchmark/index.html
@@ -592,20 +592,22 @@ <h2>Comparison Charts</h2>
let flareLib = null;

async function loadFlareEngine(config) {
const T = (label) => console.log(`[flare-trace ${Math.round(performance.now())}ms] ${label}`);
T('loadFlareEngine: enter');
if (!flareLib) {
log('Loading @sauravpanda/flare WASM from CDN...', 'info');
T('fetching flare_web.js from CDN');
const CDN = 'https://cdn.jsdelivr.net/npm/@sauravpanda/flare@0.2.13/pkg';
const wasmUrl = `${CDN}/flare_web_bg.wasm`;

// Fetch the JS module source, patch the WASM URL, and load via blob
// to avoid cross-origin ES module import restrictions.
const jsResp = await fetch(`${CDN}/flare_web.js`, { cache: 'no-cache' });
if (!jsResp.ok) throw new Error(`HTTP ${jsResp.status} fetching flare_web.js`);
let jsSrc = await jsResp.text();
log(`Fetched flare_web.js (${(jsSrc.length / 1024).toFixed(0)} KB)`, 'info');
T(`flare_web.js fetched (${(jsSrc.length / 1024).toFixed(0)} KB)`);

// Patch import.meta.url so the WASM binary resolves to the CDN.
jsSrc = jsSrc.replaceAll('import.meta.url', `'${CDN}/flare_web.js'`);
- // Fix wasm-pack codegen bug: a JSDoc block contains "/* done */"
// Workaround for wasm-pack codegen: a JSDoc block contains "/* done */"
// which prematurely closes the outer /** */ comment.
jsSrc = jsSrc.replaceAll('/* done */', '/* done -/');

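(The blob-URL creation itself sits in the collapsed region between these hunks. A minimal sketch of the standard pattern the comment above describes; the variable names are assumptions, not the PR's actual lines:)

```js
// Hypothetical reconstruction, not the PR's exact code: serve the patched
// module source from a same-origin blob URL so the browser will accept it
// as an ES module import.
const blob = new Blob([jsSrc], { type: 'text/javascript' });
const blobUrl = URL.createObjectURL(blob);
```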
@@ -615,54 +617,128 @@ <h2>Comparison Charts</h2>
flareLib = await import(/* webpackIgnore: true */ blobUrl);
} catch (importErr) {
URL.revokeObjectURL(blobUrl);
log(`Blob import failed: ${importErr.message}`, 'error');
log('Trying direct CDN import...', 'info');
T(`blob import failed, trying direct CDN import: ${importErr.message}`);
flareLib = await import(`${CDN}/flare_web.js`);
}
URL.revokeObjectURL(blobUrl);

T('flare_web.js imported, calling default init');
await flareLib.default(wasmUrl);
T('WASM init complete');
log('Flare WASM initialized.', 'success');
} else {
T('flareLib already loaded (cached)');
}

- log(`Downloading GGUF model: ${config.url.split('/').pop()}`, 'info');
- const t0 = performance.now();
-
- const resp = await fetch(config.url);
- if (!resp.ok) throw new Error(`HTTP ${resp.status} fetching model`);
- const contentLength = parseInt(resp.headers.get('content-length') || '0', 10);
- const reader = resp.body.getReader();
- const chunks = [];
- let received = 0;
-
- while (true) {
- const { done, value } = await reader.read();
- if (done) break;
- chunks.push(value);
- received += value.length;
- if (contentLength > 0) {
- updateProgress(Math.round((received / contentLength) * 100));
- }
// OPFS caching: on first run the GGUF streams directly to disk;
// subsequent runs skip the 138 MB download. Big UX win for
// iterative benchmarking on the same model. Falls back to
// in-memory fetch on browsers without OPFS (Firefox private mode).
const cacheName = 'flare-' + config.url.split('/').pop();
let bytes = null;
let opfsDir = null;
T('probing OPFS');
try {
opfsDir = await navigator.storage.getDirectory();
T('OPFS available');
} catch (e) {
T(`OPFS unavailable: ${e}`);
}

- const bytes = new Uint8Array(received);
- let offset = 0;
- for (const chunk of chunks) {
- bytes.set(chunk, offset);
- offset += chunk.length;
- }
if (opfsDir) {
let fileHandle = null;
let cached = false;
try {
fileHandle = await opfsDir.getFileHandle(cacheName, { create: false });
const info = await fileHandle.getFile();
T(`OPFS cache file exists, size=${info.size}`);
if (info.size > 100 * 1024 * 1024) {
cached = true;
log(`Using cached GGUF from OPFS (${(info.size / 1024 / 1024).toFixed(1)} MB)`, 'success');
updateProgress(100);
bytes = new Uint8Array(await info.arrayBuffer());
}
} catch (e) {
T(`no OPFS cache hit: ${e.message || e}`);
}

- log(`Downloaded ${(received / 1024 / 1024).toFixed(1)} MB, parsing GGUF...`, 'info');
if (!cached) {
log(`Downloading GGUF model: ${config.url.split('/').pop()}`, 'info');
fileHandle = await opfsDir.getFileHandle(cacheName, { create: true });
const writable = await fileHandle.createWritable();
try {
const resp = await fetch(config.url);
if (!resp.ok) throw new Error(`HTTP ${resp.status} fetching model`);
const contentLength = parseInt(resp.headers.get('content-length') || '0', 10);
const reader = resp.body.getReader();
const chunks = [];
let received = 0;
while (true) {
const { done, value } = await reader.read();
if (done) break;
await writable.write(value);
chunks.push(value);
received += value.length;
if (contentLength > 0) {
updateProgress(Math.round((received / contentLength) * 100));
}
}
await writable.close();
T(`downloaded ${(received / 1024 / 1024).toFixed(1)} MB to OPFS`);
bytes = new Uint8Array(received);
let off = 0;
for (const c of chunks) { bytes.set(c, off); off += c.length; }
chunks.length = 0;
Comment on lines +671 to +692

⚠️ Potential issue | 🟠 Major

Streaming to OPFS while also buffering every chunk defeats the memory benefit.

The download loop both writes each chunk via `await writable.write(value)` and buffers it via `chunks.push(value)`, then allocates a fresh `new Uint8Array(received)` and copies the chunks into it. Peak memory is roughly 3× the model size during the copy (chunks array + new contiguous buffer + OPFS internal buffering), instead of the ~1× the OPFS streaming was meant to enable.

For SmolLM2-135M Q8_0 (~138 MB) this is mostly a wart, but the same code path is used for llama-3.2-1b Q8_0 (~1.3 GB), where this is likely to OOM tabs that would otherwise succeed.

Mirror the cache-hit path: after `writable.close()`, just read the bytes back from OPFS.

♻️ Proposed fix
             const reader = resp.body.getReader();
-            const chunks = [];
             let received = 0;
             while (true) {
               const { done, value } = await reader.read();
               if (done) break;
               await writable.write(value);
-              chunks.push(value);
               received += value.length;
               if (contentLength > 0) {
                 updateProgress(Math.round((received / contentLength) * 100));
               }
             }
             await writable.close();
             T(`downloaded ${(received / 1024 / 1024).toFixed(1)} MB to OPFS`);
-            bytes = new Uint8Array(received);
-            let off = 0;
-            for (const c of chunks) { bytes.set(c, off); off += c.length; }
-            chunks.length = 0;
+            const cachedFile = await fileHandle.getFile();
+            bytes = new Uint8Array(await cachedFile.arrayBuffer());
             log(`Downloaded ${(received / 1024 / 1024).toFixed(1)} MB`, 'info');
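For reference, here is the reviewer's suggestion folded into a self-contained helper. This is an illustrative sketch, not code from the PR: the function name and signature are invented, and it assumes only the fetch and OPFS APIs already used elsewhere in this diff.

```js
// Illustrative helper (not in the PR): stream a URL into an OPFS file,
// then read the file back to get one contiguous buffer. Peak memory stays
// near 1x the model size instead of ~3x.
async function downloadToOpfs(url, opfsDir, cacheName, updateProgress) {
  const fileHandle = await opfsDir.getFileHandle(cacheName, { create: true });
  const writable = await fileHandle.createWritable();
  try {
    const resp = await fetch(url);
    if (!resp.ok) throw new Error(`HTTP ${resp.status} fetching model`);
    const contentLength = parseInt(resp.headers.get('content-length') || '0', 10);
    const reader = resp.body.getReader();
    let received = 0;
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      await writable.write(value); // stream straight to disk, no chunk array
      received += value.length;
      if (contentLength > 0) {
        updateProgress(Math.round((received / contentLength) * 100));
      }
    }
    await writable.close();
  } catch (err) {
    try { await writable.abort(); } catch {}
    throw err;
  }
  // Read back from OPFS, exactly like the cache-hit path.
  const file = await fileHandle.getFile();
  return new Uint8Array(await file.arrayBuffer());
}
```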
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@examples/benchmark/index.html` around lines 671 - 692, The download loop
currently both writes each streamed chunk to OPFS (using writable.write) and
also accumulates them in the chunks array, then builds a new Uint8Array(bytes)
and copies chunks into it, causing ~3× memory peak; remove the in-memory
buffering (do not push into chunks or build the contiguous bytes from them),
keep streaming to writable and progress updates (resp, reader, writable,
contentLength, updateProgress), then after await writable.close() read the file
back from OPFS the same way the cache-hit path does to obtain a single
contiguous Uint8Array (replacing the bytes/chunks copy logic) and preserve the
T(...) size log by using the read result.

log(`Downloaded ${(received / 1024 / 1024).toFixed(1)} MB`, 'info');
} catch (err) {
try { await writable.abort(); } catch {}
throw err;
}
}
} else {
// Fallback when OPFS is unavailable: regular in-memory download.
log(`Downloading GGUF model: ${config.url.split('/').pop()}`, 'info');
const resp = await fetch(config.url);
if (!resp.ok) throw new Error(`HTTP ${resp.status} fetching model`);
const contentLength = parseInt(resp.headers.get('content-length') || '0', 10);
const reader = resp.body.getReader();
const chunks = [];
let received = 0;
while (true) {
const { done, value } = await reader.read();
if (done) break;
chunks.push(value);
received += value.length;
if (contentLength > 0) updateProgress(Math.round((received / contentLength) * 100));
}
bytes = new Uint8Array(received);
let offset = 0;
for (const c of chunks) { bytes.set(c, offset); offset += c.length; }
chunks.length = 0;
}

log(`Parsing ${(bytes.length / 1024 / 1024).toFixed(1)} MB GGUF...`, 'info');
T(`calling FlareEngine.load (bytes=${bytes.length})`);
flareEngine = flareLib.FlareEngine.load(bytes);

T('FlareEngine.load returned');
bytes = null; // drop source buffer before warmup / GPU upload

// GPU prefill is currently deadlocked on Chrome main-thread wasm due
// to wgpu's sync readback pattern (map_async callback can't fire
// while we're mid-sync-WASM-call). Force CPU path until the async
// readback refactor lands upstream.
const USE_GPU = new URL(location.href).searchParams.get('gpu') === '1';
T(`init_gpu: ${USE_GPU ? 'ENABLED via ?gpu=1' : 'SKIPPED (CPU only)'}`);
try {
- const gpuOk = await flareEngine.init_gpu();
const gpuOk = USE_GPU ? await flareEngine.init_gpu() : false;
T(`init_gpu complete, gpuOk=${gpuOk}`);
const info = JSON.parse(flareEngine.backend_info());
console.log('[Flare] backend_info:', info);
log(`Flare backend: ${JSON.stringify(info)}`, gpuOk ? 'success' : 'info');
// Enable profiling; runFlareInference reads it after the first prefill
flareEngine.enable_prefill_profiling();
T('prefill profiling enabled');
} catch (e) {
console.warn('[Flare] GPU init failed:', e);
log('Flare GPU init failed — using CPU SIMD', 'info');
@@ -674,20 +750,27 @@ <h2>Comparison Charts</h2>
}

async function runFlareInference(prompt, opts) {
const tInf = (label) => console.log(`[flare-trace ${Math.round(performance.now())}ms] inf: ${label}`);
tInf('enter runFlareInference');
const t0 = performance.now();

tInf('encode_text begin');
const promptIds = flareEngine.encode_text(prompt);
tInf(`encode_text done, ${promptIds?.length || 0} tokens`);
if (!promptIds || promptIds.length === 0) {
throw new Error('Flare tokenizer failed to encode prompt');
}

tInf('reset()');
flareEngine.reset();
tInf(`begin_stream_with_params (maxTokens=${opts.maxTokens}) — this runs prefill`);
flareEngine.begin_stream_with_params(
promptIds,
opts.maxTokens,
opts.temperature || 0.001,
1.0, 40, 1.0, 0.0,
);
tInf('begin_stream_with_params returned (prefill done)');

const firstTokenTime = performance.now() - t0;

@@ -708,12 +791,17 @@ <h2>Comparison Charts</h2>
let tokenCount = 0;
let output = '';

tInf('entering decode loop');
while (!flareEngine.stream_done) {
const id = flareEngine.next_token();
if (id === undefined) break;
tokenCount++;
output += flareEngine.decode_token_chunk(id);
if (tokenCount === 1 || tokenCount === 5 || tokenCount % 32 === 0) {
tInf(`decoded ${tokenCount} tokens`);
}
}
tInf(`decode loop exit: ${tokenCount} tokens, stream_done=${flareEngine.stream_done}`);

const totalTime = performance.now() - t0;
const decodeTime = totalTime - firstTokenTime;
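A closing note on the OPFS cache: entries persist across reloads, so a truncated or stale download can keep being served by the cache-hit path. One way to invalidate an entry by hand from the DevTools console, using only standard OPFS calls (the filename below is illustrative; the real name follows the `'flare-' + filename` scheme above):

```js
// Delete a cached GGUF so the next benchmark run re-downloads it.
// The entry name is hypothetical; enumerate real ones via dir.keys().
const dir = await navigator.storage.getDirectory();
await dir.removeEntry('flare-SmolLM2-135M-Instruct-Q8_0.gguf');
```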