From d3c6643a5c2b3a8aff747d8fe9a9f96e85320fc0 Mon Sep 17 00:00:00 2001 From: Saurav Panda Date: Thu, 23 Apr 2026 16:19:07 -0700 Subject: [PATCH] Bump benchmark to flare 0.2.15 + use next_token_async for GPU decode --- examples/benchmark/index.html | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/examples/benchmark/index.html b/examples/benchmark/index.html index afe0143..5d49913 100644 --- a/examples/benchmark/index.html +++ b/examples/benchmark/index.html @@ -597,7 +597,7 @@

Comparison Charts

if (!flareLib) { log('Loading @sauravpanda/flare WASM from CDN...', 'info'); T('fetching flare_web.js from CDN'); - const CDN = 'https://cdn.jsdelivr.net/npm/@sauravpanda/flare@0.2.13/pkg'; + const CDN = 'https://cdn.jsdelivr.net/npm/@sauravpanda/flare@0.2.15/pkg'; const wasmUrl = `${CDN}/flare_web_bg.wasm`; const jsResp = await fetch(`${CDN}/flare_web.js`, { cache: 'no-cache' }); @@ -724,12 +724,12 @@

Comparison Charts

T('FlareEngine.load returned'); bytes = null; // drop source buffer before warmup / GPU upload - // GPU prefill is currently deadlocked on Chrome main-thread wasm due - // to wgpu's sync readback pattern (map_async callback can't fire - // while we're mid-sync-WASM-call). Force CPU path until the async - // readback refactor lands upstream. - const USE_GPU = new URL(location.href).searchParams.get('gpu') === '1'; - T(`init_gpu: ${USE_GPU ? 'ENABLED via ?gpu=1' : 'SKIPPED (CPU only)'}`); + // GPU decode now works via `next_token_async` (flare-web 0.2.14). + // Prefill still runs sync → uses the CPU backend's Q8_0 SIMD path; + // decode reads back through the async readback path so the WebGPU + // `map_async` callback can fire. `?gpu=0` opts out for debugging. + const USE_GPU = new URL(location.href).searchParams.get('gpu') !== '0'; + T(`init_gpu: ${USE_GPU ? 'ENABLED (async decode)' : 'SKIPPED via ?gpu=0'}`); try { const gpuOk = USE_GPU ? await flareEngine.init_gpu() : false; T(`init_gpu complete, gpuOk=${gpuOk}`); @@ -792,8 +792,12 @@

Comparison Charts

let output = ''; tInf('entering decode loop'); + // Prefer the async variant when available (flare-web >= 0.2.14) so + // WebGPU's map_async callback can fire between tokens. Sync + // next_token is only safe on the CPU backend. + const hasAsync = typeof flareEngine.next_token_async === 'function'; while (!flareEngine.stream_done) { - const id = flareEngine.next_token(); + const id = hasAsync ? await flareEngine.next_token_async() : flareEngine.next_token(); if (id === undefined) break; tokenCount++; output += flareEngine.decode_token_chunk(id);