diff --git a/examples/benchmark/index.html b/examples/benchmark/index.html index f088607..afe0143 100644 --- a/examples/benchmark/index.html +++ b/examples/benchmark/index.html @@ -592,20 +592,22 @@

Comparison Charts

let flareLib = null; async function loadFlareEngine(config) { + const T = (label) => console.log(`[flare-trace ${Math.round(performance.now())}ms] ${label}`); + T('loadFlareEngine: enter'); if (!flareLib) { log('Loading @sauravpanda/flare WASM from CDN...', 'info'); + T('fetching flare_web.js from CDN'); const CDN = 'https://cdn.jsdelivr.net/npm/@sauravpanda/flare@0.2.13/pkg'; const wasmUrl = `${CDN}/flare_web_bg.wasm`; - // Fetch the JS module source, patch the WASM URL, and load via blob - // to avoid cross-origin ES module import restrictions. const jsResp = await fetch(`${CDN}/flare_web.js`, { cache: 'no-cache' }); if (!jsResp.ok) throw new Error(`HTTP ${jsResp.status} fetching flare_web.js`); let jsSrc = await jsResp.text(); - log(`Fetched flare_web.js (${(jsSrc.length / 1024).toFixed(0)} KB)`, 'info'); + T(`flare_web.js fetched (${(jsSrc.length / 1024).toFixed(0)} KB)`); + // Patch import.meta.url so the WASM binary resolves to the CDN. jsSrc = jsSrc.replaceAll('import.meta.url', `'${CDN}/flare_web.js'`); - // Fix wasm-pack codegen bug: a JSDoc block contains "/* done */" + // Workaround for wasm-pack codegen: a JSDoc block contains "/* done */" // which prematurely closes the outer /** */ comment. jsSrc = jsSrc.replaceAll('/* done */', '/* done -/'); @@ -615,54 +617,128 @@

Comparison Charts

flareLib = await import(/* webpackIgnore: true */ blobUrl); } catch (importErr) { URL.revokeObjectURL(blobUrl); - log(`Blob import failed: ${importErr.message}`, 'error'); - log('Trying direct CDN import...', 'info'); + T(`blob import failed, trying direct CDN import: ${importErr.message}`); flareLib = await import(`${CDN}/flare_web.js`); } URL.revokeObjectURL(blobUrl); - + T('flare_web.js imported, calling default init'); await flareLib.default(wasmUrl); + T('WASM init complete'); log('Flare WASM initialized.', 'success'); + } else { + T('flareLib already loaded (cached)'); } - log(`Downloading GGUF model: ${config.url.split('/').pop()}`, 'info'); const t0 = performance.now(); - const resp = await fetch(config.url); - if (!resp.ok) throw new Error(`HTTP ${resp.status} fetching model`); - const contentLength = parseInt(resp.headers.get('content-length') || '0', 10); - const reader = resp.body.getReader(); - const chunks = []; - let received = 0; - - while (true) { - const { done, value } = await reader.read(); - if (done) break; - chunks.push(value); - received += value.length; - if (contentLength > 0) { - updateProgress(Math.round((received / contentLength) * 100)); - } + // OPFS caching: on first run the GGUF streams directly to disk; + // subsequent runs skip the 138 MB download. Big UX win for + // iterative benchmarking on the same model. Falls back to + // in-memory fetch on browsers without OPFS (Firefox private mode). + const cacheName = 'flare-' + config.url.split('/').pop(); + let bytes = null; + let opfsDir = null; + T('probing OPFS'); + try { + opfsDir = await navigator.storage.getDirectory(); + T('OPFS available'); + } catch (e) { + T(`OPFS unavailable: ${e}`); } - const bytes = new Uint8Array(received); - let offset = 0; - for (const chunk of chunks) { - bytes.set(chunk, offset); - offset += chunk.length; - } + if (opfsDir) { + let fileHandle = null; + let cached = false; + try { + fileHandle = await opfsDir.getFileHandle(cacheName, { create: false }); + const info = await fileHandle.getFile(); + T(`OPFS cache file exists, size=${info.size}`); + if (info.size > 100 * 1024 * 1024) { + cached = true; + log(`Using cached GGUF from OPFS (${(info.size / 1024 / 1024).toFixed(1)} MB)`, 'success'); + updateProgress(100); + bytes = new Uint8Array(await info.arrayBuffer()); + } + } catch (e) { + T(`no OPFS cache hit: ${e.message || e}`); + } - log(`Downloaded ${(received / 1024 / 1024).toFixed(1)} MB, parsing GGUF...`, 'info'); + if (!cached) { + log(`Downloading GGUF model: ${config.url.split('/').pop()}`, 'info'); + fileHandle = await opfsDir.getFileHandle(cacheName, { create: true }); + const writable = await fileHandle.createWritable(); + try { + const resp = await fetch(config.url); + if (!resp.ok) throw new Error(`HTTP ${resp.status} fetching model`); + const contentLength = parseInt(resp.headers.get('content-length') || '0', 10); + const reader = resp.body.getReader(); + const chunks = []; + let received = 0; + while (true) { + const { done, value } = await reader.read(); + if (done) break; + await writable.write(value); + chunks.push(value); + received += value.length; + if (contentLength > 0) { + updateProgress(Math.round((received / contentLength) * 100)); + } + } + await writable.close(); + T(`downloaded ${(received / 1024 / 1024).toFixed(1)} MB to OPFS`); + bytes = new Uint8Array(received); + let off = 0; + for (const c of chunks) { bytes.set(c, off); off += c.length; } + chunks.length = 0; + log(`Downloaded ${(received / 1024 / 1024).toFixed(1)} MB`, 'info'); + } catch (err) { + try { await writable.abort(); } catch {} + throw err; + } + } + } else { + // OPFS fallback: regular in-memory download. + log(`Downloading GGUF model: ${config.url.split('/').pop()}`, 'info'); + const resp = await fetch(config.url); + if (!resp.ok) throw new Error(`HTTP ${resp.status} fetching model`); + const contentLength = parseInt(resp.headers.get('content-length') || '0', 10); + const reader = resp.body.getReader(); + const chunks = []; + let received = 0; + while (true) { + const { done, value } = await reader.read(); + if (done) break; + chunks.push(value); + received += value.length; + if (contentLength > 0) updateProgress(Math.round((received / contentLength) * 100)); + } + bytes = new Uint8Array(received); + let offset = 0; + for (const c of chunks) { bytes.set(c, offset); offset += c.length; } + chunks.length = 0; + } + log(`Parsing ${(bytes.length / 1024 / 1024).toFixed(1)} MB GGUF...`, 'info'); + T(`calling FlareEngine.load (bytes=${bytes.length})`); flareEngine = flareLib.FlareEngine.load(bytes); - + T('FlareEngine.load returned'); + bytes = null; // drop source buffer before warmup / GPU upload + + // GPU prefill is currently deadlocked on Chrome main-thread wasm due + // to wgpu's sync readback pattern (map_async callback can't fire + // while we're mid-sync-WASM-call). Force CPU path until the async + // readback refactor lands upstream. + const USE_GPU = new URL(location.href).searchParams.get('gpu') === '1'; + T(`init_gpu: ${USE_GPU ? 'ENABLED via ?gpu=1' : 'SKIPPED (CPU only)'}`); try { - const gpuOk = await flareEngine.init_gpu(); + const gpuOk = USE_GPU ? await flareEngine.init_gpu() : false; + T(`init_gpu complete, gpuOk=${gpuOk}`); const info = JSON.parse(flareEngine.backend_info()); console.log('[Flare] backend_info:', info); log(`Flare backend: ${JSON.stringify(info)}`, gpuOk ? 'success' : 'info'); // Enable profiling; runFlareInference reads it after the first prefill flareEngine.enable_prefill_profiling(); + T('prefill profiling enabled'); } catch (e) { console.warn('[Flare] GPU init failed:', e); log('Flare GPU init failed — using CPU SIMD', 'info'); @@ -674,20 +750,27 @@

Comparison Charts

} async function runFlareInference(prompt, opts) { + const tInf = (label) => console.log(`[flare-trace ${Math.round(performance.now())}ms] inf: ${label}`); + tInf('enter runFlareInference'); const t0 = performance.now(); + tInf('encode_text begin'); const promptIds = flareEngine.encode_text(prompt); + tInf(`encode_text done, ${promptIds?.length || 0} tokens`); if (!promptIds || promptIds.length === 0) { throw new Error('Flare tokenizer failed to encode prompt'); } + tInf('reset()'); flareEngine.reset(); + tInf(`begin_stream_with_params (maxTokens=${opts.maxTokens}) — this runs prefill`); flareEngine.begin_stream_with_params( promptIds, opts.maxTokens, opts.temperature || 0.001, 1.0, 40, 1.0, 0.0, ); + tInf('begin_stream_with_params returned (prefill done)'); const firstTokenTime = performance.now() - t0; @@ -708,12 +791,17 @@

Comparison Charts

let tokenCount = 0; let output = ''; + tInf('entering decode loop'); while (!flareEngine.stream_done) { const id = flareEngine.next_token(); if (id === undefined) break; tokenCount++; output += flareEngine.decode_token_chunk(id); + if (tokenCount === 1 || tokenCount === 5 || tokenCount % 32 === 0) { + tInf(`decoded ${tokenCount} tokens`); + } } + tInf(`decode loop exit: ${tokenCount} tokens, stream_done=${flareEngine.stream_done}`); const totalTime = performance.now() - t0; const decodeTime = totalTime - firstTokenTime;