From 38add9d0b28874aa49cd03ea3b76ee1d40efecb2 Mon Sep 17 00:00:00 2001
From: Caio Pizzol <caiopizzol@icloud.com>
Date: Mon, 8 Jun 2026 17:14:21 -0300
Subject: [PATCH 1/4] refactor(corpus): improve compare scoring models

---
 README.md                             |   5 +-
 packages/fallbacks/README.md          |   3 +-
 packages/fallbacks/compare.test.ts    | 141 ++++++++++++++++
 packages/fallbacks/scripts/compare.ts | 225 ++++++++++++++++++++++----
 4 files changed, 343 insertions(+), 31 deletions(-)

diff --git a/README.md b/README.md
index 078db4b..0bfe00e 100644
--- a/README.md
+++ b/README.md
@@ -22,8 +22,9 @@ Built by the team behind [SuperDoc](https://github.com/superdoc-dev/superdoc). S
 - Runtime: install `@docfonts/fallbacks` and call the lookup helpers.
 - Acquire: run `bun run --cwd packages/fallbacks acquire` to download reviewed open-font source
   archives into an ignored local cache and write local hash snapshots.
-- Compare: planned local tooling. Results should stay local unless deliberately published through a
-  curated product surface.
+- Compare: run `bun run --cwd packages/fallbacks compare` to rank acquired open fonts against a
+  licensed local reference. Results stay local unless deliberately published through a curated
+  product surface.
 
 ## API
 
diff --git a/packages/fallbacks/README.md b/packages/fallbacks/README.md
index 28f88d4..0553400 100644
--- a/packages/fallbacks/README.md
+++ b/packages/fallbacks/README.md
@@ -125,8 +125,9 @@ bun run --cwd packages/fallbacks compare -- \
 - `--reference` (required) - path to the font to measure against.
 - `--family` - a label shown in the report header.
 - `--source` - restrict to one or more acquired source ids (repeat the flag or comma-separate). Defaults to every acquired source.
+- `--model` - `latin` by default. Use `monospace` for mono references so matching cells report `cell_width_only`, not `metric_safe`.
 
-The comparison is a lead finder, not an automatic verdict. It measures Latin advance widths over a fixed sample and reports the tier, coverage, outlier counts, and worst glyphs for each candidate.
+The comparison is a lead finder, not an automatic verdict. For proportional Latin fonts, tier, mean, and max use a text-carrying subset while outlier counts and worst glyphs still use the full Latin sample.
 
 ## Provenance
 
diff --git a/packages/fallbacks/compare.test.ts b/packages/fallbacks/compare.test.ts
index a08a832..e67c4c8 100644
--- a/packages/fallbacks/compare.test.ts
+++ b/packages/fallbacks/compare.test.ts
@@ -8,6 +8,8 @@ import {
   collectCandidates,
   type FontMetrics,
   LATIN_SAMPLE,
+  LATIN_TEXT_SAMPLE,
+  parseArgs,
   parseFont,
   renderReport,
   type SnapshotSource,
@@ -190,6 +192,29 @@ describe("LATIN_SAMPLE", () => {
   });
 });
 
+describe("LATIN_TEXT_SAMPLE", () => {
+  test("keeps text carriers and excludes symbol outliers", () => {
+    expect(LATIN_TEXT_SAMPLE).toContain(0x41); // 'A'
+    expect(LATIN_TEXT_SAMPLE).toContain(0x39); // '9'
+    expect(LATIN_TEXT_SAMPLE).toContain(0x00e9); // e with acute
+    expect(LATIN_TEXT_SAMPLE).toContain(0x00a0); // no-break space
+    expect(LATIN_TEXT_SAMPLE).toContain(0x2014); // em dash codepoint
+    expect(LATIN_TEXT_SAMPLE).not.toContain(0x00af); // macron
+    expect(LATIN_TEXT_SAMPLE).not.toContain(0x00b5); // micro sign
+    expect(LATIN_TEXT_SAMPLE).not.toContain(0x00b7); // middle dot
+    expect(LATIN_TEXT_SAMPLE).not.toContain(0x00b1); // plus-minus sign
+  });
+
+  test("is a sorted subset of the full Latin sample", () => {
+    const full = new Set(LATIN_SAMPLE);
+    expect(LATIN_TEXT_SAMPLE.every((cp) => full.has(cp))).toBe(true);
+    expect(new Set(LATIN_TEXT_SAMPLE).size).toBe(LATIN_TEXT_SAMPLE.length);
+    expect([...LATIN_TEXT_SAMPLE]).toEqual(
+      [...LATIN_TEXT_SAMPLE].sort((a, b) => a - b),
+    );
+  });
+});
+
 // --- Tiers ------------------------------------------------------------------
 
 describe("classifyTier", () => {
@@ -202,6 +227,18 @@ describe("classifyTier", () => {
     expect(classifyTier(0.01, 0.026)).toBe("visual_only");
     expect(classifyTier(0, 0)).toBe("metric_safe");
   });
+
+  test("monospace model collapses the metric bands to cell_width_only", () => {
+    // What the latin model calls metric_safe or near_metric is only proof of cell width here.
+    expect(classifyTier(0, 0, "monospace")).toBe("cell_width_only");
+    expect(classifyTier(0.005, 0.01, "monospace")).toBe("cell_width_only");
+    expect(classifyTier(0.01, 0.025, "monospace")).toBe("cell_width_only");
+    // Non-matching candidates stay visual_only under both models.
+    expect(classifyTier(0.0101, 0.025, "monospace")).toBe("visual_only");
+    // The latin model is the default and is unchanged.
+    expect(classifyTier(0, 0, "latin")).toBe("metric_safe");
+    expect(classifyTier(0, 0)).toBe("metric_safe");
+  });
 });
 
 // --- Scoring ----------------------------------------------------------------
@@ -260,6 +297,58 @@ describe("scoreAdvances", () => {
     expect(Number.isNaN(score.meanDelta)).toBe(true);
     expect(Number.isNaN(score.maxDelta)).toBe(true);
   });
+
+  test("monospace model downgrades a matching candidate to cell_width_only", () => {
+    const reference = new Map([
+      [0x41, 0.6],
+      [0x42, 0.6],
+      [0x43, 0.6],
+    ]);
+    const matching = new Map([
+      [0x41, 0.6],
+      [0x42, 0.6],
+      [0x43, 0.6],
+    ]);
+    const diverging = new Map([
+      [0x41, 0.6],
+      [0x42, 0.7],
+      [0x43, 0.8],
+    ]);
+    // A matching cell is metric_safe under latin but only cell_width_only under monospace.
+    expect(scoreAdvances(reference, matching, sample).tier).toBe("metric_safe");
+    expect(
+      scoreAdvances(reference, matching, sample, 3, "monospace").tier,
+    ).toBe("cell_width_only");
+    // A non-matching candidate stays visual_only under either model.
+    expect(
+      scoreAdvances(reference, diverging, sample, 3, "monospace").tier,
+    ).toBe("visual_only");
+  });
+
+  test("can rank on text carriers while reporting full-sample outliers", () => {
+    const reportSample = [0x41, 0x00af, 0x00b5, 0x00b7];
+    const tierSample = [0x41];
+    const reference = new Map(reportSample.map((cp) => [cp, 0.5]));
+    const candidate = new Map([
+      [0x41, 0.5],
+      [0x00af, 0.33],
+      [0x00b5, 0.58],
+      [0x00b7, 0.42],
+    ]);
+    const score = scoreAdvances(reference, candidate, {
+      reportSample,
+      tierSample,
+    });
+    expect(score.tier).toBe("metric_safe");
+    expect(score.compared).toBe(1);
+    expect(score.total).toBe(1);
+    expect(score.meanDelta).toBe(0);
+    expect(score.maxDelta).toBe(0);
+    expect(score.over2_5Percent).toBe(3);
+    expect(score.worstGlyphs.map((g) => g.codepoint)).toEqual([
+      0x00af, 0x00b7, 0x00b5,
+    ]);
+  });
 });
 
 // --- SFNT parsing -----------------------------------------------------------
@@ -360,6 +449,37 @@ describe("renderReport", () => {
     expect(lines[2]).toContain("visual_only");
   });
 
+  test("ranks cell_width_only after near_metric and before visual_only", () => {
+    const reference = new Map([[0x41, 0.6]]);
+    const sample = [0x41];
+    // A perfect match scores metric_safe, never near_metric, so near_metric is built from a small
+    // delta, cell_width_only from the monospace model, and visual_only from a large delta.
+    const near = scoreAdvances(reference, new Map([[0x41, 0.607]]), sample);
+    const cell = scoreAdvances(
+      reference,
+      new Map([[0x41, 0.6]]),
+      sample,
+      3,
+      "monospace",
+    );
+    const visual = scoreAdvances(reference, new Map([[0x41, 0.9]]), sample);
+    expect(near.tier).toBe("near_metric");
+    expect(cell.tier).toBe("cell_width_only");
+    expect(visual.tier).toBe("visual_only");
+    const report = renderReport([
+      { sourceId: "visual-src", file: "v.otf", score: visual },
+      { sourceId: "cell-src", file: "c.otf", score: cell },
+      { sourceId: "near-src", file: "n.otf", score: near },
+    ]);
+    const lines = report.split("\n");
+    expect(lines[1]).toContain("near-src");
+    expect(lines[1]).toContain("near_metric");
+    expect(lines[2]).toContain("cell-src");
+    expect(lines[2]).toContain("cell_width_only");
+    expect(lines[3]).toContain("visual-src");
+    expect(lines[3]).toContain("visual_only");
+  });
+
   test("can limit the rendered table to the top rows", () => {
     const reference = sampleMetrics(mockFont(0.5), [0x41]);
     const close = scoreAdvances(
@@ -405,6 +525,27 @@ describe("renderReport", () => {
   });
 });
 
+// --- Argument parsing -------------------------------------------------------
+
+describe("parseArgs", () => {
+  test("defaults to the latin model", () => {
+    expect(parseArgs(["--reference", "ref.otf"]).model).toBe("latin");
+  });
+
+  test("accepts --model monospace", () => {
+    expect(parseArgs(["--model", "monospace"]).model).toBe("monospace");
+    expect(parseArgs(["--model", "latin"]).model).toBe("latin");
+  });
+
+  test("rejects an unknown model", () => {
+    expect(() => parseArgs(["--model", "serif"])).toThrow(/--model requires/);
+  });
+
+  test("rejects --model without a value", () => {
+    expect(() => parseArgs(["--model"])).toThrow(/requires a value/);
+  });
+});
+
 // --- Cached-file candidate collection ---------------------------------------
 
 describe("collectCandidates (GitHub tree sources)", () => {
diff --git a/packages/fallbacks/scripts/compare.ts b/packages/fallbacks/scripts/compare.ts
index ceca031..dd9bf7e 100644
--- a/packages/fallbacks/scripts/compare.ts
+++ b/packages/fallbacks/scripts/compare.ts
@@ -36,25 +36,96 @@ export const LATIN_SAMPLE: readonly number[] = (() => {
   return [...new Set(all)].sort((a, b) => a - b);
 })();
 
+const TEXT_PUNCTUATION = new Set([
+  0x20, // space
+  0x21, // !
+  0x22, // "
+  0x23, // #
+  0x26, // &
+  0x27, // '
+  0x28, // (
+  0x29, // )
+  0x2c, // ,
+  0x2d, // -
+  0x2e, // .
+  0x2f, // /
+  0x3a, // :
+  0x3b, // ;
+  0x3f, // ?
+  0x40, // @
+  0x5b, // [
+  0x5d, // ]
+  0x7b, // {
+  0x7d, // }
+  0x00a0, // no-break space
+  0x2013, // en dash
+  0x2014, // em dash
+  0x2018, // left single quote
+  0x2019, // right single quote
+  0x201c, // left double quote
+  0x201d, // right double quote
+  0x2026, // ellipsis
+]);
+
+const EXCLUDED_TEXT_LETTERS = new Set([
+  0x00b5, // micro sign: Unicode treats it as a letter, but it behaves like a symbol here.
+]);
+
+function isTextLetterOrDigit(codepoint: number): boolean {
+  if (EXCLUDED_TEXT_LETTERS.has(codepoint)) return false; // hand-picked letters that are kept out of the text sample
+  return /^[\p{L}\p{N}]$/u.test(String.fromCodePoint(codepoint)); // NOTE(review): \p{N} covers Nd, Nl and No, so superscripts and vulgar fractions (e.g. U+00B2, U+00BD) also pass - confirm intended, or narrow to \p{Nd}
+}
+
+/**
+ * Text-carrying Latin sample used to rank proportional-font candidates. The full sample still reports
+ * outliers, but rare symbols should not hide a strong body-text lead.
+ */
+export const LATIN_TEXT_SAMPLE: readonly number[] = LATIN_SAMPLE.filter(
+  (cp) => TEXT_PUNCTUATION.has(cp) || isTextLetterOrDigit(cp),
+);
+
 // --- Tiers ------------------------------------------------------------------
 
 /**
  * Advance-fidelity tier. Thresholds mirror the package's verdict language (see `src/types.ts`):
  * metric_safe is the DIRECT band, near_metric the LIKELY band, everything else visual_only.
+ * cell_width_only is the monospace model's verdict for a matching cell: it proves line width, not
+ * glyph-shape fidelity.
  */
-export type CompareTier = "metric_safe" | "near_metric" | "visual_only";
+export type CompareTier =
+  | "metric_safe"
+  | "near_metric"
+  | "cell_width_only"
+  | "visual_only";
+
+/**
+ * Classification model. `latin` is the default proportional comparison. `monospace` treats a matching
+ * advance as proof of cell width only, since every glyph in a monospace cell shares one advance.
+ */
+export type CompareModel = "latin" | "monospace";
 
 const TIER_RANK: Record<CompareTier, number> = {
   metric_safe: 0,
   near_metric: 1,
-  visual_only: 2,
+  cell_width_only: 2,
+  visual_only: 3,
 };
 
-/** Classify a (mean, max) advance-delta pair into a fidelity tier. Deltas are fractions of the em. */
-export function classifyTier(meanDelta: number, maxDelta: number): CompareTier {
-  if (meanDelta <= 0.005 && maxDelta <= 0.01) return "metric_safe";
-  if (meanDelta <= 0.01 && maxDelta <= 0.025) return "near_metric";
-  return "visual_only";
+/**
+ * Classify a (mean, max) advance-delta pair into a fidelity tier. Deltas are fractions of the em. Under
+ * the monospace model a matching cell only vouches for line width, so the metric bands collapse to
+ * cell_width_only while non-matching candidates stay visual_only.
+ */
+export function classifyTier(
+  meanDelta: number,
+  maxDelta: number,
+  model: CompareModel = "latin",
+): CompareTier {
+  let tier: CompareTier = "visual_only";
+  if (meanDelta <= 0.005 && maxDelta <= 0.01) tier = "metric_safe";
+  else if (meanDelta <= 0.01 && maxDelta <= 0.025) tier = "near_metric";
+  if (model === "monospace" && tier !== "visual_only") return "cell_width_only";
+  return tier;
 }
 
 // --- SFNT parsing -----------------------------------------------------------
@@ -260,33 +331,48 @@ export interface GlyphDelta {
 
 /** The advance-parity score of one candidate font against the reference, over a fixed sample. */
 export interface CompareScore {
-  /** codepoints in the sample that both fonts map. */
+  /** codepoints in the tier sample that both fonts map. */
   compared: number;
-  /** sample size. */
+  /** tier sample size. */
   total: number;
-  /** sample codepoints not mapped by both fonts. */
+  /** tier sample codepoints not mapped by both fonts. */
   missing: number;
   meanDelta: number;
   maxDelta: number;
-  /** shared sample codepoints whose advance delta exceeds the metric_safe max threshold. */
+  /** shared report-sample codepoints whose advance delta exceeds the metric_safe max threshold. */
   over1Percent: number;
-  /** shared sample codepoints whose advance delta exceeds the near_metric max threshold. */
+  /** shared report-sample codepoints whose advance delta exceeds the near_metric max threshold. */
   over2_5Percent: number;
   tier: CompareTier;
   worstGlyphs: GlyphDelta[];
 }
 
-/**
- * Score one candidate against the reference over the sample. Both inputs are normalized advance maps
- * (codepoint -> advance/unitsPerEm); only codepoints present in both are compared. Pure, so it can be
- * tested with mocked metric maps and never needs a real font.
- */
-export function scoreAdvances(
+export interface ScoreOptions {
+  /** Sample used for outlier reporting and worst-glyph display. */
+  reportSample?: readonly number[];
+  /** Sample used for tier classification and mean/max columns. Defaults to `reportSample`. */
+  tierSample?: readonly number[];
+  worstCount?: number;
+  model?: CompareModel;
+}
+
+interface MeasuredDeltas {
+  compared: number;
+  total: number;
+  missing: number;
+  meanDelta: number;
+  maxDelta: number;
+  over1Percent: number;
+  over2_5Percent: number;
+  worstGlyphs: GlyphDelta[];
+}
+
+function measureDeltas(
   reference: ReadonlyMap<number, number>,
   candidate: ReadonlyMap<number, number>,
-  sample: readonly number[] = LATIN_SAMPLE,
-  worstCount = 3,
-): CompareScore {
+  sample: readonly number[],
+  worstCount: number,
+): MeasuredDeltas {
   const deltas: GlyphDelta[] = [];
   let sum = 0;
   let max = 0;
@@ -320,12 +406,78 @@ export function scoreAdvances(
     maxDelta,
     over1Percent,
     over2_5Percent,
-    // With no shared codepoints there is nothing to vouch for: report the floor tier.
-    tier: compared === 0 ? "visual_only" : classifyTier(meanDelta, maxDelta),
     worstGlyphs,
   };
 }
 
+/**
+ * Resolve either call shape of `scoreAdvances` into fully populated options: a `ScoreOptions` object,
+ * or the positional form whose third argument is the sample itself. Explicit option fields win;
+ * positional `worstCount` and `model` fill whatever the object leaves unset instead of being dropped.
+ */
+function normalizeScoreOptions(
+  optionsOrSample: ScoreOptions | readonly number[] | undefined,
+  worstCount: number | undefined,
+  model: CompareModel | undefined,
+): Required<ScoreOptions> {
+  // A bare sample becomes `reportSample`; `undefined` spreads to an empty options object.
+  // Array.isArray cannot narrow a readonly array out of the union, hence the assertion.
+  const options: ScoreOptions = Array.isArray(optionsOrSample)
+    ? { reportSample: optionsOrSample }
+    : { ...(optionsOrSample as ScoreOptions | undefined) };
+  const reportSample = options.reportSample ?? LATIN_SAMPLE;
+  return {
+    reportSample,
+    // Defaulting to the same array instance keeps tier and report on one shared sample.
+    tierSample: options.tierSample ?? reportSample,
+    worstCount: options.worstCount ?? worstCount ?? 3,
+    model: options.model ?? model ?? "latin",
+  };
+}
+
+/**
+ * Score one candidate against the reference. Tier, mean, max, and coverage come from `tierSample`;
+ * outlier counts and worst glyphs come from `reportSample`. Both inputs are normalized advance maps
+ * (codepoint -> advance/unitsPerEm); only codepoints present in both are compared.
+ */
+export function scoreAdvances(
+  reference: ReadonlyMap<number, number>,
+  candidate: ReadonlyMap<number, number>,
+  optionsOrSample?: ScoreOptions | readonly number[], // options object, or a bare sample used for both tier and report
+  worstCount?: number,
+  model?: CompareModel,
+): CompareScore {
+  const options = normalizeScoreOptions(optionsOrSample, worstCount, model);
+  const report = measureDeltas(
+    reference,
+    candidate,
+    options.reportSample,
+    options.worstCount,
+  );
+  const tierMetrics =
+    options.tierSample === options.reportSample
+      ? report
+      : measureDeltas(reference, candidate, options.tierSample, 0); // worst glyphs are taken from the report pass, so none are requested here
+  return {
+    compared: tierMetrics.compared,
+    total: tierMetrics.total,
+    missing: tierMetrics.missing,
+    meanDelta: tierMetrics.meanDelta,
+    maxDelta: tierMetrics.maxDelta,
+    over1Percent: report.over1Percent,
+    over2_5Percent: report.over2_5Percent,
+    tier:
+      tierMetrics.compared === 0
+        ? "visual_only"
+        : classifyTier(
+            tierMetrics.meanDelta,
+            tierMetrics.maxDelta,
+            options.model,
+          ),
+    worstGlyphs: report.worstGlyphs,
+  };
+}
+
 /** Build a font's normalized-advance map over the sample (only codepoints it maps are included). */
 export function sampleMetrics(
   font: FontMetrics,
@@ -506,15 +658,16 @@ interface CompareRow {
   score: CompareScore;
 }
 
-interface ParsedArgs {
+export interface ParsedArgs {
   reference?: string;
   family?: string;
   limit: number | null;
   sources: string[];
+  model: CompareModel;
 }
 
-function parseArgs(argv: string[]): ParsedArgs {
-  const args: ParsedArgs = { limit: 50, sources: [] };
+export function parseArgs(argv: string[]): ParsedArgs {
+  const args: ParsedArgs = { limit: 50, sources: [], model: "latin" };
   const readValue = (flag: string, index: number): string => {
     const value = argv[index + 1];
     if (!value || value.startsWith("--"))
@@ -540,6 +693,14 @@ function parseArgs(argv: string[]): ParsedArgs {
           args.sources.push(id);
         i++;
         break;
+      case "--model": {
+        const value = readValue(flag, i);
+        if (value !== "latin" && value !== "monospace")
+          throw new Error("--model requires 'latin' or 'monospace'");
+        args.model = value;
+        i++;
+        break;
+      }
       case "--limit": {
         const value = readValue(flag, i);
         if (value === "all") {
@@ -683,7 +844,11 @@ function main(): void {
     for (const candidate of collectCandidates(source, cacheDir)) {
       try {
         const font = parseFont(candidate.bytes);
-        const score = scoreAdvances(reference, sampleMetrics(font));
+        const score = scoreAdvances(reference, sampleMetrics(font), {
+          reportSample: LATIN_SAMPLE,
+          tierSample: args.model === "latin" ? LATIN_TEXT_SAMPLE : LATIN_SAMPLE,
+          model: args.model,
+        });
         rows.push({ sourceId: source.sourceId, file: candidate.file, score });
       } catch {
         skipped++;
@@ -695,8 +860,12 @@ function main(): void {
   const shown =
     args.limit === null ? rows.length : Math.min(args.limit, rows.length);
   const skippedText = skipped === 0 ? "" : `; skipped ${skipped} unsupported`;
+  const modelText =
+    args.model === "latin"
+      ? `; tier/mean/max ${LATIN_TEXT_SAMPLE.length} text codepoints`
+      : `; model ${args.model}`;
   console.log(
-    `reference ${basename(args.reference)} as "${label}" vs ${rows.length} candidate(s) over ${LATIN_SAMPLE.length} Latin codepoints; showing ${shown}${skippedText}\n`,
+    `reference ${basename(args.reference)} as "${label}" vs ${rows.length} candidate(s) over ${LATIN_SAMPLE.length} Latin codepoints${modelText}; showing ${shown}${skippedText}\n`,
   );
   console.log(renderReport(rows, { limit: args.limit }));
 }

From e1a309f5b5020144c4ab781c5b1589251741dfd3 Mon Sep 17 00:00:00 2001
From: Caio Pizzol <caiopizzol@icloud.com>
Date: Mon, 8 Jun 2026 17:45:11 -0300
Subject: [PATCH 2/4] refactor: move corpus tooling out of fallbacks package

---
 .gitignore                                    |   1 +
 README.md                                     |  25 ++--
 package.json                                  |   4 +-
 packages/fallbacks/README.md                  | 107 +++++-------------
 packages/fallbacks/package.json               |   2 -
 packages/fallbacks/src/types.ts               |   2 +-
 tools/corpus/README.md                        |  38 +++++++
 .../corpus}/acquire.test.ts                   |   7 +-
 .../scripts => tools/corpus}/acquire.ts       |   6 +-
 .../corpus}/compare.test.ts                   |   2 +-
 .../scripts => tools/corpus}/compare.ts       |  14 +--
 tsconfig.json                                 |   2 +-
 12 files changed, 90 insertions(+), 120 deletions(-)
 create mode 100644 tools/corpus/README.md
 rename {packages/fallbacks => tools/corpus}/acquire.test.ts (98%)
 rename {packages/fallbacks/scripts => tools/corpus}/acquire.ts (99%)
 rename {packages/fallbacks => tools/corpus}/compare.test.ts (99%)
 rename {packages/fallbacks/scripts => tools/corpus}/compare.ts (98%)

diff --git a/.gitignore b/.gitignore
index 7d6ae54..12837e4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,5 +7,6 @@ dev/
 .wrangler/
 .mcp.json
 mockups/
+.cache/
 packages/fallbacks/.cache/
 STATE.md
diff --git a/README.md b/README.md
index 0bfe00e..45737fc 100644
--- a/README.md
+++ b/README.md
@@ -6,32 +6,21 @@
 > Document font substitution, measured.
 
 docfonts publishes `@docfonts/fallbacks`, a small runtime package for document renderers.
-It maps common proprietary document fonts to reviewed open-font fallback decisions.
-
-The package ships no font binaries and no proprietary data. It contains a public evidence snapshot,
-asset-aware lookup helpers, and tests that prove the npm package only includes supported runtime files.
+It maps common proprietary document fonts to reviewed open-font fallback decisions. It ships no font binaries and no proprietary data.
 
 Built by the team behind [SuperDoc](https://github.com/superdoc-dev/superdoc). Standalone and neutral.
 
-## Package
+## Structure
 
 - `packages/fallbacks` - runtime fallback decisions and lookup helpers.
+- `tools/corpus` - local source acquisition and comparison tools.
 
-## Workflows
+## Use
 
 - Runtime: install `@docfonts/fallbacks` and call the lookup helpers.
-- Acquire: run `bun run --cwd packages/fallbacks acquire` to download reviewed open-font source
-  archives into an ignored local cache and write local hash snapshots.
-- Compare: run `bun run --cwd packages/fallbacks compare` to rank acquired open fonts against a
-  licensed local reference. Results stay local unless deliberately published through a curated
-  product surface.
-
-## API
-
-- `getRenderableFallback` - returns the open family to render, or `null` when none is renderable.
-- `getFallbackDecision` - explains the outcome for UI, diagnostics, and reporting.
-- `createFallbackMap` - builds a resolver map from only the font families you can render.
-- `normalizeFamilyName` - normalizes map lookup keys.
+- Acquire: run `bun run corpus:acquire` to download open-font sources into an ignored local cache.
+- Compare: run `bun run corpus:compare` to rank acquired open fonts against a licensed local
+  reference. Results stay local unless deliberately published through a curated product surface.
 
 ## Install
 
diff --git a/package.json b/package.json
index 4b98d59..584935a 100644
--- a/package.json
+++ b/package.json
@@ -11,7 +11,9 @@
     "typecheck": "tsc --noEmit",
     "lint": "biome check .",
     "format": "biome check --write .",
-    "test": "bun test packages/fallbacks",
+    "test": "bun test packages/fallbacks tools/corpus",
+    "corpus:acquire": "bun run tools/corpus/acquire.ts",
+    "corpus:compare": "bun run tools/corpus/compare.ts",
     "check": "bun run typecheck && bun run test && bun run lint && bun run build",
     "check:fast": "bun run typecheck && bun run lint",
     "prepare": "if [ -z \"$CI\" ]; then bunx lefthook install; fi"
diff --git a/packages/fallbacks/README.md b/packages/fallbacks/README.md
index 0553400..25d4502 100644
--- a/packages/fallbacks/README.md
+++ b/packages/fallbacks/README.md
@@ -1,10 +1,8 @@
 # @docfonts/fallbacks
 
-Document font substitution, measured.
+Measured open-font fallbacks for proprietary document fonts.
 
-Measured open-font fallbacks for proprietary document fonts. Use it to decide whether a requested document font can render with an open family you actually ship.
-
-It ships no fonts and no proprietary binaries. It ships decisions: the recommended open family when one exists, the fidelity verdict, and the honest cases where no open family should be used.
+The package ships decisions, not fonts: which open family to render when one is reviewed, how faithful it is, and when no open fallback should be used.
 
 ## Install
 
@@ -12,11 +10,11 @@ It ships no fonts and no proprietary binaries. It ships decisions: the recommend
 npm install @docfonts/fallbacks
 ```
 
-ESM-only. Use `import`, or let your bundler handle it. CommonJS `require()` is not supported.
+ESM-only.
 
-## Render A Font
+## Render a font
 
-Use `getRenderableFallback` when you need one font family to render now. Pass `canRenderFamily` so docfonts only returns families your app can load.
+Use `getRenderableFallback` when you need one family to render now. Pass `canRenderFamily` so the result only includes fonts your app can load.
 
 ```ts
 import { getRenderableFallback } from "@docfonts/fallbacks";
@@ -24,15 +22,13 @@ import { getRenderableFallback } from "@docfonts/fallbacks";
 const fallback = getRenderableFallback("Helvetica", {
   canRenderFamily: (family) => bundledFamilies.has(family),
 });
-
-// { substituteFamily: "Liberation Sans", policyAction: "substitute", verdict: "metric_safe", lineBreakSafe: true, evidenceId: "helvetica", generic: "sans-serif" }
 ```
 
-The result is `null` when there is nothing renderable from your available assets. Use `getFallbackDecision` when you need to know why.
+Returns `null` when docfonts has no renderable fallback from your available assets.
 
-## Explain A Decision
+## Explain a decision
 
-Use `getFallbackDecision` for UI, diagnostics, and reporting. It distinguishes known fonts with no recommended fallback from fonts docfonts has never seen.
+Use `getFallbackDecision` for UI, diagnostics, and reports.
 
 ```ts
 import { getFallbackDecision } from "@docfonts/fallbacks";
@@ -40,31 +36,23 @@ import { getFallbackDecision } from "@docfonts/fallbacks";
 getFallbackDecision("Aptos");
 // { kind: "customer_supplied", evidenceId: "aptos", generic: "sans-serif" }
 
-getFallbackDecision("Tahoma");
-// { kind: "no_recommended_fallback", evidenceId: "tahoma", generic: "sans-serif" }
-
-getFallbackDecision("Made Up Font");
-// { kind: "unknown" }
-
 getFallbackDecision("Georgia", {
   canRenderFamily: (family) => bundledFamilies.has(family),
 });
-// { kind: "asset_missing", substituteFamily: "Gelasio", verdict: "near_metric", evidenceId: "georgia", generic: "serif" }
+// { kind: "asset_missing", substituteFamily: "Gelasio", verdict: "near_metric", ... }
 ```
 
-Decision kinds:
+Important decision kinds:
 
-- `fallback` - render the returned `substituteFamily`.
-- `asset_missing` - docfonts has a fallback, but your app does not load that family.
-- `face_missing` - (face-aware lookups only) the family has a substitute, but not for the requested face. Route that face through your absence handling; do not substitute it.
-- `no_recommended_fallback` - docfonts knows the font but recommends no renderable open family.
-- `customer_supplied` - the real font should come from the customer or environment.
-- `preserve_only` - keep the original family name. Do not substitute.
+- `fallback` - render `fallback.substituteFamily`.
+- `asset_missing` - docfonts has a fallback, but your app does not load it.
+- `face_missing` - the fallback does not provide the requested face.
+- `customer_supplied`, `preserve_only`, or `no_recommended_fallback` - do not substitute.
 - `unknown` - docfonts has no evidence for this family.
 
-## Create A Resolver Map
+## Build a resolver map
 
-Use `createFallbackMap` when wiring a resolver. `canRenderFamily` is required because a resolver map must never point at fonts you cannot load.
+Use `createFallbackMap` when wiring a resolver. `canRenderFamily` is required so the map never points at fonts you cannot load.
 
 ```ts
 import { createFallbackMap, normalizeFamilyName } from "@docfonts/fallbacks";
@@ -73,64 +61,21 @@ const map = createFallbackMap({
   canRenderFamily: (family) => bundledFamilies.has(family),
 });
 
-map[normalizeFamilyName("Times New Roman")]; // { substituteFamily: "Liberation Serif", ... }
-```
-
-Keys are normalized. Use `normalizeFamilyName` for lookups. Rows whose substitute family is not available are omitted. Each entry carries `faces`: a Regular-only entry is only safe in a **face-aware** resolver (one that checks `faces` or uses `getRenderableFallbackForFace`), since applying it to bold/italic would route a face the substitute does not provide.
-
-## What the fields mean
-
-- `substituteFamily` - the open family to render in place of the requested one.
-- `policyAction` - what a renderer should do, not a quality claim. Use `verdict` for fidelity.
-- `verdict` - the measured fidelity. Examples: `metric_safe`, `near_metric`, `cell_width_only`, `visual_only`.
-- `lineBreakSafe` - true when advances preserve line breaks: `metric_safe`, `near_metric`, or monospace `cell_width_only`.
-- `faces` - reviewed face coverage for this evidence row. If any face is `true`, respect it as face-scoped coverage (a row can be Regular-only). If all faces are `false`, the row is **not** face-scoped (e.g. a category fallback whose physical font does have faces) and the face-aware helpers treat it as renderable for any face.
-- `evidenceId` - the stable id for the reviewed evidence row; look the full row up in `SUBSTITUTION_EVIDENCE`.
-- `generic` - the logical font's broad CSS category (`serif`, `sans-serif`, or `monospace`), for a last-resort generic `font-family` keyword when no named substitute renders. Also present on the known (non-`unknown`) decision kinds.
-- `glyphExceptions` - named glyph-level divergences that qualify this fallback (e.g. one codepoint reflows), or omitted when none. A family lookup carries all of the row's; a face lookup (`getRenderableFallbackForFace`) carries only that face's, so Cambria Regular shows none while Bold Italic shows its grave-accent exception.
-
-`cell_width_only` keeps monospace advances stable, but glyph shapes can still differ. A `substitute` can still have a lower-fidelity `verdict` when one face or glyph is qualified. The verdict is the fidelity signal.
-
-## Face-aware routing (Regular-only substitutes)
-
-Some substitutes provide only some faces - e.g. Baskerville Old Face -> Bacasime Antique is Regular-only. The family-level helpers above answer "which family", and every result carries `faces`, so a resolver must route per-face. The face-aware helpers do it for you:
-
-```ts
-import { getRenderableFallbackForFace } from "@docfonts/fallbacks";
-const opts = { canRenderFamily: (family) => bundledFamilies.has(family) };
-
-getRenderableFallbackForFace("Baskerville Old Face", "regular", opts)?.substituteFamily; // "Bacasime Antique"
-getRenderableFallbackForFace("Baskerville Old Face", "bold", opts);                       // null (Regular-only)
+map[normalizeFamilyName("Times New Roman")];
 ```
 
-`getFallbackDecisionForFace(family, face, options)` reports the reason - `face_missing` when the substitute exists but lacks that face. A covered face carries its OWN verdict, not the family's worst-face rollup (e.g. `Cambria` regular is `metric_safe` even though the family rolls up to `visual_only`).
+Some fallbacks are face-scoped. Use `getRenderableFallbackForFace`, or respect the returned `faces` field before applying a fallback to bold or italic text.
 
-The full structured rows are exported as `SUBSTITUTION_EVIDENCE` for richer reporting (faces, per-face verdicts, glyph exceptions).
+## Fidelity fields
 
-## Local tools
+- `verdict` - measured fidelity, such as `metric_safe`, `near_metric`, `cell_width_only`, or `visual_only`.
+- `lineBreakSafe` - true when advances preserve line breaks.
+- `glyphExceptions` - named glyph-level divergences that can reflow text.
+- `generic` - CSS generic family for last-resort fallback.
+- `evidenceId` - stable id for the reviewed evidence row.
 
-These maintainer tools use ignored `.cache` files and are not shipped in the package.
-
-`bun run acquire` downloads open-font candidates into `.cache/sources`. Sources come in two shapes: release archives (zip or tar.gz) and pinned source trees. Set `DOCFONTS_SOURCE_CACHE` to use another cache directory, or pass `--source google-fonts` to acquire one source.
-
-`bun run compare` checks a private reference font against acquired OTF/TTF candidates and prints a ranked Latin advance-width table. It writes no fonts, paths, or results to the tree.
-
-```sh
-bun run --cwd packages/fallbacks compare -- \
-  --reference /path/to/reference.ttf \
-  --family "Bookman Old Style" \
-  --source tex-gyre-bonum
-```
-
-- `--reference` (required) - path to the font to measure against.
-- `--family` - a label shown in the report header.
-- `--source` - restrict to one or more acquired source ids (repeat the flag or comma-separate). Defaults to every acquired source.
-- `--model` - `latin` by default. Use `monospace` for mono references so matching cells report `cell_width_only`, not `metric_safe`.
-
-The comparison is a lead finder, not an automatic verdict. For proportional Latin fonts, tier, mean, and max use a text-carrying subset while outlier counts and worst glyphs still use the full Latin sample.
+`SUBSTITUTION_EVIDENCE` exposes the full reviewed rows for richer reporting.
 
 ## Provenance
 
-The data comes from reviewed docfonts evidence. Measurements are produced against licensed originals, but this package distributes no proprietary binaries or raw proprietary metrics.
-
-Built by the team behind SuperDoc. Standalone and neutral.
+Measurements are produced against licensed originals. This package distributes no proprietary binaries, raw proprietary metrics, or font files.
diff --git a/packages/fallbacks/package.json b/packages/fallbacks/package.json
index f38938e..6b0fca2 100644
--- a/packages/fallbacks/package.json
+++ b/packages/fallbacks/package.json
@@ -36,8 +36,6 @@
   },
   "scripts": {
     "gen:data": "bun run scripts/generate-data.ts",
-    "acquire": "bun run scripts/acquire.ts",
-    "compare": "bun run scripts/compare.ts",
     "build": "tsc -p tsconfig.build.json",
     "prepack": "bun run build"
   },
diff --git a/packages/fallbacks/src/types.ts b/packages/fallbacks/src/types.ts
index d2407af..aa1150d 100644
--- a/packages/fallbacks/src/types.ts
+++ b/packages/fallbacks/src/types.ts
@@ -33,7 +33,7 @@ export type FaceSlot = "regular" | "bold" | "italic" | "boldItalic";
  */
 export type CssGeneric = "serif" | "sans-serif" | "monospace";
 
-/** Advance-width divergence vs the proprietary oracle, as fractions (0 = identical advances). */
+/** Advance-width divergence vs the licensed reference font, as fractions (0 = identical advances). */
 export interface AdvanceDelta {
   meanDelta: number;
   /** the worst-case delta, not the mean, is what gates line-break fidelity. */
diff --git a/tools/corpus/README.md b/tools/corpus/README.md
new file mode 100644
index 0000000..8c9e4ff
--- /dev/null
+++ b/tools/corpus/README.md
@@ -0,0 +1,38 @@
+# Corpus Tools
+
+Local tools for finding open-font fallback candidates.
+
+They download fonts into an ignored cache, compare a licensed local reference against that cache, and print ranked leads. They do not publish fallback decisions.
+
+## Commands
+
+```sh
+bun run corpus:acquire
+bun run corpus:compare -- --reference /path/to/reference.ttf --family "Verdana"
+```
+
+## Acquire
+
+```sh
+bun run corpus:acquire -- --source google-fonts
+```
+
+Without `--source`, all configured sources are acquired. Use `DOCFONTS_SOURCE_CACHE` to choose a cache directory. The default is `.cache/corpus`.
+
+## Compare
+
+```sh
+bun run corpus:compare -- \
+  --reference /path/to/reference.ttf \
+  --family "Lucida Console" \
+  --source dejavu,noto-sans-mono \
+  --model monospace
+```
+
+- `--reference` is required.
+- `--family` is a report label.
+- `--source` limits the acquired sources to compare. Without it, every acquired source is used.
+- `--model latin` is the default. Proportional Latin ranking uses text-carrying codepoints for tier, mean, and max, while outlier counts and worst glyphs still come from the full Latin sample.
+- `--model monospace` reports matching mono cells as `cell_width_only`, not `metric_safe`.
+
+Comparison output is a lead finder. A public fallback row still needs review, provenance, face-scope checks, and a visual sanity check.
diff --git a/packages/fallbacks/acquire.test.ts b/tools/corpus/acquire.test.ts
similarity index 98%
rename from packages/fallbacks/acquire.test.ts
rename to tools/corpus/acquire.test.ts
index 288c1bd..5d47cec 100644
--- a/packages/fallbacks/acquire.test.ts
+++ b/tools/corpus/acquire.test.ts
@@ -5,7 +5,7 @@ import {
   collectGitHubTreeFonts,
   type GitHubTreeEntry,
   SOURCE_RELEASES,
-} from "./scripts/acquire";
+} from "./acquire";
 
 const joined = (...parts: string[]) => parts.join("");
 
@@ -213,10 +213,7 @@ describe("source acquisition catalog", () => {
   });
 
   test("does not include private paths or measurement environment details", () => {
-    const script = readFileSync(
-      join(import.meta.dir, "scripts", "acquire.ts"),
-      "utf8",
-    );
+    const script = readFileSync(join(import.meta.dir, "acquire.ts"), "utf8");
     for (const needle of [
       joined("/", "Users", "/"),
       joined("/", "Applications", "/"),
diff --git a/packages/fallbacks/scripts/acquire.ts b/tools/corpus/acquire.ts
similarity index 99%
rename from packages/fallbacks/scripts/acquire.ts
rename to tools/corpus/acquire.ts
index 14c5412..a6ff5c5 100644
--- a/packages/fallbacks/scripts/acquire.ts
+++ b/tools/corpus/acquire.ts
@@ -551,8 +551,8 @@ interface GitHubTreeSnapshot extends BaseSnapshot {
 
 type SourceSnapshot = ArchiveSnapshot | GitHubTreeSnapshot;
 
-const PKG_DIR = join(import.meta.dir, "..");
-const DEFAULT_CACHE_DIR = join(PKG_DIR, ".cache", "sources");
+const REPO_DIR = join(import.meta.dir, "..", "..");
+const DEFAULT_CACHE_DIR = join(REPO_DIR, ".cache", "corpus");
 const FONT_EXTENSIONS = [".otf", ".ttf", ".otc", ".ttc", ".woff2", ".woff"];
 
 const sha256 = (bytes: Uint8Array): string =>
@@ -920,7 +920,7 @@ async function main(): Promise<void> {
   const outPath = join(cacheDir, "source-snapshot.json");
   writeFileSync(
     outPath,
-    `${JSON.stringify({ generatedBy: "scripts/acquire.ts", snapshots }, null, 2)}\n`,
+    `${JSON.stringify({ generatedBy: "tools/corpus/acquire.ts", snapshots }, null, 2)}\n`,
   );
   console.log(`wrote ${outPath}`);
 }
diff --git a/packages/fallbacks/compare.test.ts b/tools/corpus/compare.test.ts
similarity index 99%
rename from packages/fallbacks/compare.test.ts
rename to tools/corpus/compare.test.ts
index e67c4c8..f6a6eb8 100644
--- a/packages/fallbacks/compare.test.ts
+++ b/tools/corpus/compare.test.ts
@@ -15,7 +15,7 @@ import {
   type SnapshotSource,
   sampleMetrics,
   scoreAdvances,
-} from "./scripts/compare";
+} from "./compare";
 
 // --- Synthetic SFNT builder -------------------------------------------------
 //
diff --git a/packages/fallbacks/scripts/compare.ts b/tools/corpus/compare.ts
similarity index 98%
rename from packages/fallbacks/scripts/compare.ts
rename to tools/corpus/compare.ts
index dd9bf7e..59a6c07 100644
--- a/packages/fallbacks/scripts/compare.ts
+++ b/tools/corpus/compare.ts
@@ -6,8 +6,8 @@ import { execFileSync } from "node:child_process";
 import { existsSync, readFileSync } from "node:fs";
 import { basename, join } from "node:path";
 
-const PKG_DIR = join(import.meta.dir, "..");
-const DEFAULT_CACHE_DIR = join(PKG_DIR, ".cache", "sources");
+const REPO_DIR = join(import.meta.dir, "..", "..");
+const DEFAULT_CACHE_DIR = join(REPO_DIR, ".cache", "corpus");
 const SNAPSHOT_FILE = "source-snapshot.json";
 const RAW_SFNT_EXTENSIONS = [".otf", ".ttf"];
 
@@ -583,12 +583,12 @@ function readArchiveMember(
 function loadSnapshot(cacheDir: string): SnapshotSource[] {
   if (!existsSync(cacheDir))
     throw new Error(
-      `source cache not found at ${cacheDir}. Run \`bun run acquire\` first.`,
+      `source cache not found at ${cacheDir}. Run \`bun run corpus:acquire\` first.`,
     );
   const snapshotPath = join(cacheDir, SNAPSHOT_FILE);
   if (!existsSync(snapshotPath))
     throw new Error(
-      `${SNAPSHOT_FILE} not found in ${cacheDir}. Run \`bun run acquire\` first.`,
+      `${SNAPSHOT_FILE} not found in ${cacheDir}. Run \`bun run corpus:acquire\` first.`,
     );
   const parsed = JSON.parse(readFileSync(snapshotPath, "utf8")) as {
     snapshots?: SnapshotSource[];
@@ -602,7 +602,7 @@ function loadSnapshot(cacheDir: string): SnapshotSource[] {
 /**
  * Collect the candidate fonts for one source from the cache. GitHub tree sources read each snapshot file
  * entry directly; archive sources list and extract font members from the cached release archive. Throws
- * when an expected cache file is absent so the caller can point the user back at `bun run acquire`.
+ * when an expected cache file is absent so the caller can point the user back at `bun run corpus:acquire`.
  */
 export function collectCandidates(
   source: SnapshotSource,
@@ -616,7 +616,7 @@ export function collectCandidates(
       const filePath = join(cacheDir, entry.path);
       if (!existsSync(filePath))
         throw new Error(
-          `candidate file missing for ${source.sourceId}: ${filePath}. Run \`bun run acquire\` first.`,
+          `candidate file missing for ${source.sourceId}: ${filePath}. Run \`bun run corpus:acquire\` first.`,
         );
       return { file: entry.name, bytes: readFileSync(filePath) };
     });
@@ -629,7 +629,7 @@ export function collectCandidates(
   );
   if (!existsSync(archivePath))
     throw new Error(
-      `candidate archive missing for ${source.sourceId}: ${archivePath}. Run \`bun run acquire\` first.`,
+      `candidate archive missing for ${source.sourceId}: ${archivePath}. Run \`bun run corpus:acquire\` first.`,
     );
   const members = listFontMembers(archivePath, format);
   if (members.length === 0)
diff --git a/tsconfig.json b/tsconfig.json
index 2e8aaec..f82275f 100644
--- a/tsconfig.json
+++ b/tsconfig.json
@@ -13,6 +13,6 @@
       "@docfonts/fallbacks": ["./packages/fallbacks/src/index.ts"]
     }
   },
-  "include": ["packages/fallbacks/**/*.ts"],
+  "include": ["packages/fallbacks/**/*.ts", "tools/**/*.ts"],
   "exclude": ["node_modules", "dist", "**/dist"]
 }

From 2a68da0425aa13cae8cbc1b3b5b54a41ca0175ce Mon Sep 17 00:00:00 2001
From: Caio Pizzol <caiopizzol@icloud.com>
Date: Mon, 8 Jun 2026 17:57:31 -0300
Subject: [PATCH 3/4] refactor: split corpus compare tool

---
 tools/corpus/compare.ts     | 842 ++++--------------------------------
 tools/corpus/src/cache.ts   | 171 ++++++++
 tools/corpus/src/font.ts    | 203 +++++++++
 tools/corpus/src/report.ts  |  83 ++++
 tools/corpus/src/samples.ts |  70 +++
 tools/corpus/src/score.ts   | 157 +++++++
 tools/corpus/src/tiers.ts   |  41 ++
 7 files changed, 809 insertions(+), 758 deletions(-)
 create mode 100644 tools/corpus/src/cache.ts
 create mode 100644 tools/corpus/src/font.ts
 create mode 100644 tools/corpus/src/report.ts
 create mode 100644 tools/corpus/src/samples.ts
 create mode 100644 tools/corpus/src/score.ts
 create mode 100644 tools/corpus/src/tiers.ts

diff --git a/tools/corpus/compare.ts b/tools/corpus/compare.ts
index 59a6c07..c5a4b30 100644
--- a/tools/corpus/compare.ts
+++ b/tools/corpus/compare.ts
@@ -1,656 +1,46 @@
 /**
- * Local maintainer tool: compare a private reference font with acquired open-font archives.
+ * Local maintainer tool: compare a licensed reference font with acquired open-font archives.
  * Reads ignored cache files, prints to stdout, and writes nothing to the tree.
  */
-import { execFileSync } from "node:child_process";
 import { existsSync, readFileSync } from "node:fs";
 import { basename, join } from "node:path";
+import {
+  archiveFormatOf,
+  collectCandidates,
+  loadSnapshot,
+  requireArchiveTool,
+  type SnapshotSource,
+} from "./src/cache";
+import { parseFont, sampleMetrics } from "./src/font";
+import { renderReport } from "./src/report";
+import { LATIN_SAMPLE, LATIN_TEXT_SAMPLE } from "./src/samples";
+import { type CompareScore, scoreAdvances } from "./src/score";
+import type { CompareModel } from "./src/tiers";
+
+export {
+  archiveFormatOf,
+  collectCandidates,
+  loadSnapshot,
+  requireArchiveTool,
+  type SnapshotSource,
+} from "./src/cache";
+export { type FontMetrics, parseFont, sampleMetrics } from "./src/font";
+export { renderReport } from "./src/report";
+export { LATIN_SAMPLE, LATIN_TEXT_SAMPLE } from "./src/samples";
+export {
+  type CompareScore,
+  type GlyphDelta,
+  type ScoreOptions,
+  scoreAdvances,
+} from "./src/score";
+export {
+  type CompareModel,
+  type CompareTier,
+  classifyTier,
+} from "./src/tiers";
 
 const REPO_DIR = join(import.meta.dir, "..", "..");
 const DEFAULT_CACHE_DIR = join(REPO_DIR, ".cache", "corpus");
-const SNAPSHOT_FILE = "source-snapshot.json";
-const RAW_SFNT_EXTENSIONS = [".otf", ".ttf"];
-
-// --- Latin sample -----------------------------------------------------------
-
-/** Inclusive codepoint range helper for building the sample. */
-function codepointRange(start: number, end: number): number[] {
-  const out: number[] = [];
-  for (let cp = start; cp <= end; cp++) out.push(cp);
-  return out;
-}
-
-/**
- * Fixed Latin sample for advance comparison: every printable ASCII codepoint (U+0020 space through
- * U+007E tilde), Latin-1 letters with diacritics, and common punctuation/symbols a document is likely
- * to use. Named and tested so the metric is reproducible. Stored as numeric codepoints, sorted and
- * unique.
- */
-export const LATIN_SAMPLE: readonly number[] = (() => {
-  const latin1 = codepointRange(0x00a0, 0x00ff).filter((cp) => cp !== 0x00ad);
-  const generalPunctuation = [
-    0x2013, 0x2014, 0x2018, 0x2019, 0x201c, 0x201d, 0x2020, 0x2021, 0x2022,
-    0x2026, 0x2030, 0x2039, 0x203a, 0x20ac, 0x2122,
-  ];
-  const all = [...codepointRange(0x20, 0x7e), ...latin1, ...generalPunctuation];
-  return [...new Set(all)].sort((a, b) => a - b);
-})();
-
-const TEXT_PUNCTUATION = new Set([
-  0x20, // space
-  0x21, // !
-  0x22, // "
-  0x23, // #
-  0x26, // &
-  0x27, // '
-  0x28, // (
-  0x29, // )
-  0x2c, // ,
-  0x2d, // -
-  0x2e, // .
-  0x2f, // /
-  0x3a, // :
-  0x3b, // ;
-  0x3f, // ?
-  0x40, // @
-  0x5b, // [
-  0x5d, // ]
-  0x7b, // {
-  0x7d, // }
-  0x00a0, // no-break space
-  0x2013, // en dash
-  0x2014, // em dash
-  0x2018, // left single quote
-  0x2019, // right single quote
-  0x201c, // left double quote
-  0x201d, // right double quote
-  0x2026, // ellipsis
-]);
-
-const EXCLUDED_TEXT_LETTERS = new Set([
-  0x00b5, // micro sign: Unicode treats it as a letter, but it behaves like a symbol here.
-]);
-
-function isTextLetterOrDigit(codepoint: number): boolean {
-  if (EXCLUDED_TEXT_LETTERS.has(codepoint)) return false;
-  return /^[\p{L}\p{N}]$/u.test(String.fromCodePoint(codepoint));
-}
-
-/**
- * Text-carrying Latin sample used to rank proportional-font candidates. The full sample still reports
- * outliers, but rare symbols should not hide a strong body-text lead.
- */
-export const LATIN_TEXT_SAMPLE: readonly number[] = LATIN_SAMPLE.filter(
-  (cp) => TEXT_PUNCTUATION.has(cp) || isTextLetterOrDigit(cp),
-);
-
-// --- Tiers ------------------------------------------------------------------
-
-/**
- * Advance-fidelity tier. Thresholds mirror the package's verdict language (see `src/types.ts`):
- * metric_safe is the DIRECT band, near_metric the LIKELY band, everything else visual_only.
- * cell_width_only is the monospace model's verdict for a matching cell: it proves line width, not
- * glyph-shape fidelity.
- */
-export type CompareTier =
-  | "metric_safe"
-  | "near_metric"
-  | "cell_width_only"
-  | "visual_only";
-
-/**
- * Classification model. `latin` is the default proportional comparison. `monospace` treats a matching
- * advance as proof of cell width only, since every glyph in a monospace cell shares one advance.
- */
-export type CompareModel = "latin" | "monospace";
-
-const TIER_RANK: Record<CompareTier, number> = {
-  metric_safe: 0,
-  near_metric: 1,
-  cell_width_only: 2,
-  visual_only: 3,
-};
-
-/**
- * Classify a (mean, max) advance-delta pair into a fidelity tier. Deltas are fractions of the em. Under
- * the monospace model a matching cell only vouches for line width, so the metric bands collapse to
- * cell_width_only while non-matching candidates stay visual_only.
- */
-export function classifyTier(
-  meanDelta: number,
-  maxDelta: number,
-  model: CompareModel = "latin",
-): CompareTier {
-  let tier: CompareTier = "visual_only";
-  if (meanDelta <= 0.005 && maxDelta <= 0.01) tier = "metric_safe";
-  else if (meanDelta <= 0.01 && maxDelta <= 0.025) tier = "near_metric";
-  if (model === "monospace" && tier !== "visual_only") return "cell_width_only";
-  return tier;
-}
-
-// --- SFNT parsing -----------------------------------------------------------
-
-const REQUIRED_TABLES = ["head", "maxp", "hhea", "hmtx", "cmap"] as const;
-
-/** A parsed font's em size plus a normalized advance lookup over its Unicode `cmap`. */
-export interface FontMetrics {
-  unitsPerEm: number;
-  /** Advance width of a codepoint as a fraction of the em, or undefined when the font does not map it. */
-  normalizedAdvance(codepoint: number): number | undefined;
-}
-
-function tagAt(view: DataView, offset: number): string {
-  return String.fromCharCode(
-    view.getUint8(offset),
-    view.getUint8(offset + 1),
-    view.getUint8(offset + 2),
-    view.getUint8(offset + 3),
-  );
-}
-
-/** Resolve a codepoint to a glyph id within one `cmap` subtable, for the formats we support (4, 6, 12). */
-function makeCmapLookup(
-  view: DataView,
-  subOffset: number,
-): (codepoint: number) => number | undefined {
-  const format = view.getUint16(subOffset);
-
-  if (format === 4) {
-    const segX2 = view.getUint16(subOffset + 6);
-    const segCount = segX2 / 2;
-    const endOffset = subOffset + 14;
-    const startOffset = endOffset + segX2 + 2; // skip reservedPad
-    const deltaOffset = startOffset + segX2;
-    const rangeOffsetBase = deltaOffset + segX2;
-    return (cp) => {
-      if (cp > 0xffff) return undefined;
-      for (let i = 0; i < segCount; i++) {
-        const end = view.getUint16(endOffset + i * 2);
-        if (cp > end) continue;
-        const start = view.getUint16(startOffset + i * 2);
-        if (cp < start) return undefined;
-        const delta = view.getInt16(deltaOffset + i * 2);
-        const rangeOffset = view.getUint16(rangeOffsetBase + i * 2);
-        if (rangeOffset === 0) {
-          const gid = (cp + delta) & 0xffff;
-          return gid === 0 ? undefined : gid;
-        }
-        const glyphOffset =
-          rangeOffsetBase + i * 2 + rangeOffset + (cp - start) * 2;
-        const raw = view.getUint16(glyphOffset);
-        if (raw === 0) return undefined;
-        const gid = (raw + delta) & 0xffff;
-        return gid === 0 ? undefined : gid;
-      }
-      return undefined;
-    };
-  }
-
-  if (format === 6) {
-    const firstCode = view.getUint16(subOffset + 6);
-    const entryCount = view.getUint16(subOffset + 8);
-    return (cp) => {
-      if (cp < firstCode || cp >= firstCode + entryCount) return undefined;
-      const gid = view.getUint16(subOffset + 10 + (cp - firstCode) * 2);
-      return gid === 0 ? undefined : gid;
-    };
-  }
-
-  if (format === 12) {
-    const numGroups = view.getUint32(subOffset + 12);
-    const groupsOffset = subOffset + 16;
-    return (cp) => {
-      let lo = 0;
-      let hi = numGroups - 1;
-      while (lo <= hi) {
-        const mid = (lo + hi) >> 1;
-        const g = groupsOffset + mid * 12;
-        const start = view.getUint32(g);
-        const end = view.getUint32(g + 4);
-        if (cp < start) hi = mid - 1;
-        else if (cp > end) lo = mid + 1;
-        else {
-          const gid = view.getUint32(g + 8) + (cp - start);
-          return gid === 0 ? undefined : gid;
-        }
-      }
-      return undefined;
-    };
-  }
-
-  throw new Error(`unsupported cmap subtable format: ${format}`);
-}
-
-/** Pick the best Unicode `cmap` subtable and return its glyph lookup. */
-function readCmap(
-  view: DataView,
-  cmapOffset: number,
-): (codepoint: number) => number | undefined {
-  const numSubtables = view.getUint16(cmapOffset + 2);
-  const candidates: { score: number; offset: number }[] = [];
-  for (let i = 0; i < numSubtables; i++) {
-    const recordOffset = cmapOffset + 4 + i * 8;
-    const platformId = view.getUint16(recordOffset);
-    const encodingId = view.getUint16(recordOffset + 2);
-    const score = cmapPreference(platformId, encodingId);
-    // Skip non-Unicode subtables (Macintosh, Windows symbol, ...): their codepoints are not Unicode,
-    // so reading Latin advances through them would be wrong. We never fall back to one.
-    if (score === null) continue;
-    candidates.push({
-      score,
-      offset: cmapOffset + view.getUint32(recordOffset + 4),
-    });
-  }
-  candidates.sort((a, b) => b.score - a.score);
-
-  for (const candidate of candidates) {
-    const format = view.getUint16(candidate.offset);
-    if (format === 4 || format === 6 || format === 12)
-      return makeCmapLookup(view, candidate.offset);
-  }
-  throw new Error("unsupported font: no readable Unicode cmap subtable");
-}
-
-/** Rank Unicode `cmap` subtables (full Unicode first, then BMP); null for non-Unicode subtables. */
-function cmapPreference(platformId: number, encodingId: number): number | null {
-  if (platformId === 3 && encodingId === 10) return 4; // Windows Unicode UCS-4
-  if (platformId === 0 && (encodingId === 4 || encodingId === 6)) return 3; // Unicode full
-  if (platformId === 3 && encodingId === 1) return 2; // Windows Unicode BMP
-  if (platformId === 0) return 1; // Unicode BMP and earlier
-  return null; // Macintosh, Windows symbol, and anything else: not a Unicode cmap
-}
-
-/**
- * Parse just enough of an SFNT font (TrueType or CFF/OTF) to read normalized advance widths by
- * codepoint. Throws an explicit error when the container is a collection or a required table is missing.
- */
-export function parseFont(bytes: Uint8Array): FontMetrics {
-  const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
-  if (bytes.byteLength < 12)
-    throw new Error("unsupported font: file is too small to be an SFNT");
-
-  const sfntVersion = view.getUint32(0);
-  if (sfntVersion === 0x74746366)
-    throw new Error("unsupported font: TrueType/OpenType collections (ttcf)");
-  const isSfnt =
-    sfntVersion === 0x00010000 || // TrueType outlines
-    sfntVersion === 0x4f54544f || // 'OTTO' - CFF outlines
-    sfntVersion === 0x74727565; // 'true'
-  if (!isSfnt)
-    throw new Error(
-      `unsupported font: not an SFNT (sfntVersion 0x${sfntVersion.toString(16)})`,
-    );
-
-  const numTables = view.getUint16(4);
-  const tables = new Map<string, number>();
-  for (let i = 0; i < numTables; i++) {
-    const recordOffset = 12 + i * 16;
-    tables.set(tagAt(view, recordOffset), view.getUint32(recordOffset + 8));
-  }
-
-  const missing = REQUIRED_TABLES.filter((tag) => !tables.has(tag));
-  if (missing.length > 0)
-    throw new Error(
-      `unsupported font: missing required table(s): ${missing.join(", ")}`,
-    );
-
-  const headOffset = tables.get("head") as number;
-  const unitsPerEm = view.getUint16(headOffset + 18);
-  if (unitsPerEm === 0)
-    throw new Error("unsupported font: head.unitsPerEm is zero");
-
-  const numberOfHMetrics = view.getUint16((tables.get("hhea") as number) + 34);
-  if (numberOfHMetrics === 0)
-    throw new Error("unsupported font: hhea.numberOfHMetrics is zero");
-
-  const hmtxOffset = tables.get("hmtx") as number;
-  const advanceOfGlyph = (glyphId: number): number => {
-    const index = glyphId < numberOfHMetrics ? glyphId : numberOfHMetrics - 1;
-    return view.getUint16(hmtxOffset + index * 4);
-  };
-
-  const lookup = readCmap(view, tables.get("cmap") as number);
-
-  return {
-    unitsPerEm,
-    normalizedAdvance(codepoint: number): number | undefined {
-      const glyphId = lookup(codepoint);
-      if (glyphId === undefined) return undefined;
-      return advanceOfGlyph(glyphId) / unitsPerEm;
-    },
-  };
-}
-
-// --- Scoring ----------------------------------------------------------------
-
-/** One codepoint whose advance diverges, for the "worst glyphs" column. */
-export interface GlyphDelta {
-  codepoint: number;
-  delta: number;
-}
-
-/** The advance-parity score of one candidate font against the reference, over a fixed sample. */
-export interface CompareScore {
-  /** codepoints in the tier sample that both fonts map. */
-  compared: number;
-  /** tier sample size. */
-  total: number;
-  /** tier sample codepoints not mapped by both fonts. */
-  missing: number;
-  meanDelta: number;
-  maxDelta: number;
-  /** shared report-sample codepoints whose advance delta exceeds the metric_safe max threshold. */
-  over1Percent: number;
-  /** shared report-sample codepoints whose advance delta exceeds the near_metric max threshold. */
-  over2_5Percent: number;
-  tier: CompareTier;
-  worstGlyphs: GlyphDelta[];
-}
-
-export interface ScoreOptions {
-  /** Sample used for outlier reporting and worst-glyph display. */
-  reportSample?: readonly number[];
-  /** Sample used for tier classification and mean/max columns. Defaults to `reportSample`. */
-  tierSample?: readonly number[];
-  worstCount?: number;
-  model?: CompareModel;
-}
-
-interface MeasuredDeltas {
-  compared: number;
-  total: number;
-  missing: number;
-  meanDelta: number;
-  maxDelta: number;
-  over1Percent: number;
-  over2_5Percent: number;
-  worstGlyphs: GlyphDelta[];
-}
-
-function measureDeltas(
-  reference: ReadonlyMap<number, number>,
-  candidate: ReadonlyMap<number, number>,
-  sample: readonly number[],
-  worstCount: number,
-): MeasuredDeltas {
-  const deltas: GlyphDelta[] = [];
-  let sum = 0;
-  let max = 0;
-  let over1Percent = 0;
-  let over2_5Percent = 0;
-  for (const cp of sample) {
-    const a = reference.get(cp);
-    const b = candidate.get(cp);
-    if (a === undefined || b === undefined) continue;
-    const delta = Math.abs(a - b);
-    deltas.push({ codepoint: cp, delta });
-    sum += delta;
-    if (delta > max) max = delta;
-    if (delta > 0.01) over1Percent++;
-    if (delta > 0.025) over2_5Percent++;
-  }
-
-  const compared = deltas.length;
-  const meanDelta = compared === 0 ? Number.NaN : sum / compared;
-  const maxDelta = compared === 0 ? Number.NaN : max;
-  const worstGlyphs = [...deltas]
-    .sort((x, y) => y.delta - x.delta)
-    .slice(0, worstCount)
-    .filter((g) => g.delta > 0);
-
-  return {
-    compared,
-    total: sample.length,
-    missing: sample.length - compared,
-    meanDelta,
-    maxDelta,
-    over1Percent,
-    over2_5Percent,
-    worstGlyphs,
-  };
-}
-
-function normalizeScoreOptions(
-  optionsOrSample: ScoreOptions | readonly number[] | undefined,
-  worstCount: number | undefined,
-  model: CompareModel | undefined,
-): Required<ScoreOptions> {
-  if (!optionsOrSample || Array.isArray(optionsOrSample)) {
-    const reportSample = optionsOrSample ?? LATIN_SAMPLE;
-    return {
-      reportSample,
-      tierSample: reportSample,
-      worstCount: worstCount ?? 3,
-      model: model ?? "latin",
-    };
-  }
-
-  const options = optionsOrSample as ScoreOptions;
-  const reportSample = options.reportSample ?? LATIN_SAMPLE;
-  return {
-    reportSample,
-    tierSample: options.tierSample ?? reportSample,
-    worstCount: options.worstCount ?? 3,
-    model: options.model ?? "latin",
-  };
-}
-
-/**
- * Score one candidate against the reference. The tier can use a narrower text sample while the report
- * still surfaces full-sample outliers. Both inputs are normalized advance maps (codepoint ->
- * advance/unitsPerEm); only codepoints present in both are compared.
- */
-export function scoreAdvances(
-  reference: ReadonlyMap<number, number>,
-  candidate: ReadonlyMap<number, number>,
-  optionsOrSample?: ScoreOptions | readonly number[],
-  worstCount?: number,
-  model?: CompareModel,
-): CompareScore {
-  const options = normalizeScoreOptions(optionsOrSample, worstCount, model);
-  const report = measureDeltas(
-    reference,
-    candidate,
-    options.reportSample,
-    options.worstCount,
-  );
-  const tierMetrics =
-    options.tierSample === options.reportSample
-      ? report
-      : measureDeltas(reference, candidate, options.tierSample, 0);
-  return {
-    compared: tierMetrics.compared,
-    total: tierMetrics.total,
-    missing: tierMetrics.missing,
-    meanDelta: tierMetrics.meanDelta,
-    maxDelta: tierMetrics.maxDelta,
-    over1Percent: report.over1Percent,
-    over2_5Percent: report.over2_5Percent,
-    tier:
-      tierMetrics.compared === 0
-        ? "visual_only"
-        : classifyTier(
-            tierMetrics.meanDelta,
-            tierMetrics.maxDelta,
-            options.model,
-          ),
-    worstGlyphs: report.worstGlyphs,
-  };
-}
-
-/** Build a font's normalized-advance map over the sample (only codepoints it maps are included). */
-export function sampleMetrics(
-  font: FontMetrics,
-  sample: readonly number[] = LATIN_SAMPLE,
-): Map<number, number> {
-  const map = new Map<number, number>();
-  for (const cp of sample) {
-    const advance = font.normalizedAdvance(cp);
-    if (advance !== undefined) map.set(cp, advance);
-  }
-  return map;
-}
-
-// --- Source cache + candidates ---------------------------------------------
-
-/** One snapshot file entry: a font member by path, with its display name. */
-interface SnapshotFile {
-  name: string;
-  path: string;
-}
-
-type ArchiveFormat = "zip" | "tar.gz";
-
-/**
- * A source as recorded in `source-snapshot.json`. Archive sources extract their candidate fonts from a
- * cached release archive; GitHub tree sources read each `files[].path` directly from the cache. `kind` is
- * optional so older snapshots (archive-only) still load and default to archive behavior.
- */
-export interface SnapshotSource {
-  sourceId: string;
-  family: string;
-  targetFamilies: string[];
-  kind?: "archive" | "github-tree";
-  archiveFormat?: ArchiveFormat;
-  files?: SnapshotFile[];
-}
-
-/** A candidate font ready to score: its display name and raw bytes. */
-export interface CandidateFile {
-  file: string;
-  bytes: Uint8Array;
-}
-
-const archiveFormatOf = (source: SnapshotSource): ArchiveFormat =>
-  source.archiveFormat ?? "zip";
-
-const archiveExtensions: Record<ArchiveFormat, string> = {
-  zip: "zip",
-  "tar.gz": "tar.gz",
-};
-
-function requireArchiveTool(format: ArchiveFormat): void {
-  const tool = format === "tar.gz" ? "tar" : "unzip";
-  const probe = format === "tar.gz" ? "--version" : "-v";
-  try {
-    execFileSync(tool, [probe], { stdio: "ignore" });
-  } catch {
-    throw new Error(`\`${tool}\` is required on PATH.`);
-  }
-}
-
-function isFontFile(path: string): boolean {
-  return RAW_SFNT_EXTENSIONS.some((ext) => path.toLowerCase().endsWith(ext));
-}
-
-/** Font members inside a source archive, by their in-archive path. */
-function listFontMembers(archivePath: string, format: ArchiveFormat): string[] {
-  const out =
-    format === "tar.gz"
-      ? execFileSync("tar", ["-tzf", archivePath], { encoding: "utf8" })
-      : execFileSync("unzip", ["-Z1", archivePath], { encoding: "utf8" });
-  return out
-    .split("\n")
-    .map((line) => line.trim())
-    .filter(Boolean)
-    .filter(isFontFile);
-}
-
-// `unzip -p` matches its member argument as a glob, so members with literal glob
-// metacharacters (e.g. variable-font names like `NotoSans-Italic[wdth,wght].ttf`)
-// must be escaped to extract by exact name.
-const escapeArchiveMember = (name: string): string =>
-  name.replace(/[\\*?[\]]/g, "\\$&");
-
-function readArchiveMember(
-  archivePath: string,
-  member: string,
-  format: ArchiveFormat,
-): Uint8Array {
-  const opts = { maxBuffer: 256 * 1024 * 1024 };
-  return new Uint8Array(
-    format === "tar.gz"
-      ? execFileSync("tar", ["-xzOf", archivePath, "--", member], opts)
-      : execFileSync(
-          "unzip",
-          ["-p", archivePath, escapeArchiveMember(member)],
-          opts,
-        ),
-  );
-}
-
-/** Load the acquire snapshot, failing explicitly when the cache or snapshot is absent. */
-function loadSnapshot(cacheDir: string): SnapshotSource[] {
-  if (!existsSync(cacheDir))
-    throw new Error(
-      `source cache not found at ${cacheDir}. Run \`bun run corpus:acquire\` first.`,
-    );
-  const snapshotPath = join(cacheDir, SNAPSHOT_FILE);
-  if (!existsSync(snapshotPath))
-    throw new Error(
-      `${SNAPSHOT_FILE} not found in ${cacheDir}. Run \`bun run corpus:acquire\` first.`,
-    );
-  const parsed = JSON.parse(readFileSync(snapshotPath, "utf8")) as {
-    snapshots?: SnapshotSource[];
-  };
-  const snapshots = parsed.snapshots ?? [];
-  if (snapshots.length === 0)
-    throw new Error(`${SNAPSHOT_FILE} lists no acquired sources.`);
-  return snapshots;
-}
-
-/**
- * Collect the candidate fonts for one source from the cache. GitHub tree sources read each snapshot file
- * entry directly; archive sources list and extract font members from the cached release archive. Throws
- * when an expected cache file is absent so the caller can point the user back at `bun run corpus:acquire`.
- */
-export function collectCandidates(
-  source: SnapshotSource,
-  cacheDir: string,
-): CandidateFile[] {
-  if (source.kind === "github-tree") {
-    const files = source.files ?? [];
-    if (files.length === 0)
-      throw new Error(`no candidate files listed for ${source.sourceId}`);
-    return files.map((entry) => {
-      const filePath = join(cacheDir, entry.path);
-      if (!existsSync(filePath))
-        throw new Error(
-          `candidate file missing for ${source.sourceId}: ${filePath}. Run \`bun run corpus:acquire\` first.`,
-        );
-      return { file: entry.name, bytes: readFileSync(filePath) };
-    });
-  }
-
-  const format = archiveFormatOf(source);
-  const archivePath = join(
-    cacheDir,
-    `${source.sourceId}.${archiveExtensions[format]}`,
-  );
-  if (!existsSync(archivePath))
-    throw new Error(
-      `candidate archive missing for ${source.sourceId}: ${archivePath}. Run \`bun run corpus:acquire\` first.`,
-    );
-  const members = listFontMembers(archivePath, format);
-  if (members.length === 0)
-    throw new Error(`no candidate font files in ${archivePath}`);
-
-  const basenameCounts = new Map<string, number>();
-  for (const member of members) {
-    const file = basename(member);
-    basenameCounts.set(file, (basenameCounts.get(file) ?? 0) + 1);
-  }
-  const duplicateBasenames = new Set(
-    [...basenameCounts].filter(([, count]) => count > 1).map(([file]) => file),
-  );
-
-  return members.map((member) => ({
-    file: displayNameForMember(member, duplicateBasenames),
-    bytes: readArchiveMember(archivePath, member, format),
-  }));
-}
-
-// --- CLI --------------------------------------------------------------------
 
 interface CompareRow {
   sourceId: string;
@@ -721,87 +111,53 @@ export function parseArgs(argv: string[]): ParsedArgs {
   return args;
 }
 
-function formatCodepoint(cp: number): string {
-  return `U+${cp.toString(16).toUpperCase().padStart(4, "0")}`;
-}
+function selectSources(
+  snapshot: SnapshotSource[],
+  requestedIds: string[],
+): SnapshotSource[] {
+  if (requestedIds.length === 0) return snapshot;
 
-function formatDelta(value: number): string {
-  return Number.isNaN(value) ? "n/a" : value.toFixed(4);
-}
-
-function formatWorst(worst: GlyphDelta[]): string {
-  if (worst.length === 0) return "-";
-  return worst
-    .map((g) => `${formatCodepoint(g.codepoint)} ${g.delta.toFixed(4)}`)
-    .join("; ");
+  const byId = new Map(snapshot.map((source) => [source.sourceId, source]));
+  const unknown = requestedIds.filter((id) => !byId.has(id));
+  if (unknown.length > 0)
+    throw new Error(
+      `source(s) not in cache: ${unknown.join(", ")}. Acquired: ${[...byId.keys()].join(", ")}`,
+    );
+  return requestedIds.map((id) => byId.get(id) as SnapshotSource);
 }
 
-interface RenderOptions {
-  limit?: number | null;
+function scoreSources(
+  reference: ReadonlyMap<number, number>,
+  selected: SnapshotSource[],
+  cacheDir: string,
+  model: CompareModel,
+): { rows: CompareRow[]; skipped: number } {
+  const rows: CompareRow[] = [];
+  let skipped = 0;
+  for (const source of selected) {
+    for (const candidate of collectCandidates(source, cacheDir)) {
+      try {
+        const font = parseFont(candidate.bytes);
+        const score = scoreAdvances(reference, sampleMetrics(font), {
+          reportSample: LATIN_SAMPLE,
+          tierSample: model === "latin" ? LATIN_TEXT_SAMPLE : LATIN_SAMPLE,
+          model,
+        });
+        rows.push({ sourceId: source.sourceId, file: candidate.file, score });
+      } catch {
+        skipped++;
+      }
+    }
+  }
+  return { rows, skipped };
 }
 
-/** Render the ranked table. Returned as a string so it can be tested without capturing stdout. */
-export function renderReport(
-  rows: CompareRow[],
-  options: RenderOptions = {},
-): string {
-  const ranked = [...rows].sort((a, b) => {
-    const tierDiff = TIER_RANK[a.score.tier] - TIER_RANK[b.score.tier];
-    if (tierDiff !== 0) return tierDiff;
-    const aMean = Number.isNaN(a.score.meanDelta)
-      ? Infinity
-      : a.score.meanDelta;
-    const bMean = Number.isNaN(b.score.meanDelta)
-      ? Infinity
-      : b.score.meanDelta;
-    return aMean - bMean;
-  });
-
-  const visible =
-    options.limit === null ? ranked : ranked.slice(0, options.limit);
-
-  const header = [
-    "source",
-    "file",
-    "mean",
-    "max",
-    "tier",
-    "coverage",
-    "missing",
-    "over1",
-    "over2.5",
-    "worst",
-  ];
-  const body = visible.map((row) => [
-    row.sourceId,
-    row.file,
-    formatDelta(row.score.meanDelta),
-    formatDelta(row.score.maxDelta),
-    row.score.tier,
-    `${row.score.compared}/${row.score.total}`,
-    String(row.score.missing),
-    String(row.score.over1Percent),
-    String(row.score.over2_5Percent),
-    formatWorst(row.score.worstGlyphs),
-  ]);
-
-  const widths = header.map((h, col) =>
-    Math.max(h.length, ...body.map((r) => r[col].length)),
+function requireArchiveTools(selected: SnapshotSource[]): void {
+  const archiveSources = selected.filter(
+    (source) => source.kind !== "github-tree",
   );
-  const line = (cells: string[]) =>
-    cells
-      .map((cell, col) => cell.padEnd(widths[col]))
-      .join("  ")
-      .trimEnd();
-  return [line(header), ...body.map(line)].join("\n");
-}
-
-function displayNameForMember(
-  member: string,
-  duplicateBasenames: Set<string>,
-): string {
-  const file = basename(member);
-  return duplicateBasenames.has(file) ? member : file;
+  for (const format of new Set(archiveSources.map(archiveFormatOf)))
+    requireArchiveTool(format);
 }
 
 function main(): void {
@@ -815,46 +171,16 @@ function main(): void {
     throw new Error(`reference font not found: ${args.reference}`);
 
   const cacheDir = process.env.DOCFONTS_SOURCE_CACHE ?? DEFAULT_CACHE_DIR;
-  const snapshot = loadSnapshot(cacheDir);
-  const byId = new Map(snapshot.map((source) => [source.sourceId, source]));
-
-  let selected: SnapshotSource[];
-  if (args.sources.length > 0) {
-    const unknown = args.sources.filter((id) => !byId.has(id));
-    if (unknown.length > 0)
-      throw new Error(
-        `source(s) not in cache: ${unknown.join(", ")}. Acquired: ${[...byId.keys()].join(", ")}`,
-      );
-    selected = args.sources.map((id) => byId.get(id) as SnapshotSource);
-  } else {
-    selected = snapshot;
-  }
-
-  const archiveSources = selected.filter(
-    (source) => source.kind !== "github-tree",
-  );
-  for (const format of new Set(archiveSources.map(archiveFormatOf)))
-    requireArchiveTool(format);
+  const selected = selectSources(loadSnapshot(cacheDir), args.sources);
+  requireArchiveTools(selected);
 
   const reference = sampleMetrics(parseFont(readFileSync(args.reference)));
-
-  const rows: CompareRow[] = [];
-  let skipped = 0;
-  for (const source of selected) {
-    for (const candidate of collectCandidates(source, cacheDir)) {
-      try {
-        const font = parseFont(candidate.bytes);
-        const score = scoreAdvances(reference, sampleMetrics(font), {
-          reportSample: LATIN_SAMPLE,
-          tierSample: args.model === "latin" ? LATIN_TEXT_SAMPLE : LATIN_SAMPLE,
-          model: args.model,
-        });
-        rows.push({ sourceId: source.sourceId, file: candidate.file, score });
-      } catch {
-        skipped++;
-      }
-    }
-  }
+  const { rows, skipped } = scoreSources(
+    reference,
+    selected,
+    cacheDir,
+    args.model,
+  );
 
   const label = args.family ?? "(family not specified)";
   const shown =
diff --git a/tools/corpus/src/cache.ts b/tools/corpus/src/cache.ts
new file mode 100644
index 0000000..053d2bc
--- /dev/null
+++ b/tools/corpus/src/cache.ts
@@ -0,0 +1,171 @@
+import { execFileSync } from "node:child_process";
+import { existsSync, readFileSync } from "node:fs";
+import { basename, join } from "node:path";
+
+const RAW_SFNT_EXTENSIONS = [".otf", ".ttf"];
+const SNAPSHOT_FILE = "source-snapshot.json";
+
+/** One snapshot file entry: a font member by path, with its display name. */
+interface SnapshotFile {
+  name: string;
+  path: string;
+}
+
+export type ArchiveFormat = "zip" | "tar.gz";
+
+/**
+ * A source as recorded in `source-snapshot.json`. Archive sources extract their candidate fonts from a
+ * cached release archive; GitHub tree sources read each `files[].path` directly from the cache. `kind` is
+ * optional so older snapshots (archive-only) still load and default to archive behavior.
+ */
+export interface SnapshotSource {
+  sourceId: string;
+  family: string;
+  targetFamilies: string[];
+  kind?: "archive" | "github-tree";
+  archiveFormat?: ArchiveFormat;
+  files?: SnapshotFile[];
+}
+
+/** A candidate font ready to score: its display name and raw bytes. */
+export interface CandidateFile {
+  file: string;
+  bytes: Uint8Array;
+}
+
+export const archiveFormatOf = (source: SnapshotSource): ArchiveFormat =>
+  source.archiveFormat ?? "zip";
+
+const archiveExtensions: Record<ArchiveFormat, string> = {
+  zip: "zip",
+  "tar.gz": "tar.gz",
+};
+
+export function requireArchiveTool(format: ArchiveFormat): void {
+  const tool = format === "tar.gz" ? "tar" : "unzip";
+  const probe = format === "tar.gz" ? "--version" : "-v";
+  try {
+    execFileSync(tool, [probe], { stdio: "ignore" });
+  } catch {
+    throw new Error(`\`${tool}\` is required on PATH.`);
+  }
+}
+
+function isFontFile(path: string): boolean {
+  return RAW_SFNT_EXTENSIONS.some((ext) => path.toLowerCase().endsWith(ext));
+}
+
+/** Font members inside a source archive, by their in-archive path. */
+function listFontMembers(archivePath: string, format: ArchiveFormat): string[] {
+  const out =
+    format === "tar.gz"
+      ? execFileSync("tar", ["-tzf", archivePath], { encoding: "utf8" })
+      : execFileSync("unzip", ["-Z1", archivePath], { encoding: "utf8" });
+  return out
+    .split("\n")
+    .map((line) => line.trim())
+    .filter(Boolean)
+    .filter(isFontFile);
+}
+
+// `unzip -p` matches its member argument as a glob, so members with literal glob
+// metacharacters (e.g. variable-font names like `NotoSans-Italic[wdth,wght].ttf`)
+// must be escaped to extract by exact name.
+const escapeArchiveMember = (name: string): string =>
+  name.replace(/[\\*?[\]]/g, "\\$&");
+
+function readArchiveMember(
+  archivePath: string,
+  member: string,
+  format: ArchiveFormat,
+): Uint8Array {
+  const opts = { maxBuffer: 256 * 1024 * 1024 };
+  return new Uint8Array(
+    format === "tar.gz"
+      ? execFileSync("tar", ["-xzOf", archivePath, "--", member], opts)
+      : execFileSync(
+          "unzip",
+          ["-p", archivePath, escapeArchiveMember(member)],
+          opts,
+        ),
+  );
+}
+
+/** Load the acquire snapshot, failing explicitly when the cache or snapshot is absent. */
+export function loadSnapshot(cacheDir: string): SnapshotSource[] {
+  if (!existsSync(cacheDir))
+    throw new Error(
+      `source cache not found at ${cacheDir}. Run \`bun run corpus:acquire\` first.`,
+    );
+  const snapshotPath = join(cacheDir, SNAPSHOT_FILE);
+  if (!existsSync(snapshotPath))
+    throw new Error(
+      `${SNAPSHOT_FILE} not found in ${cacheDir}. Run \`bun run corpus:acquire\` first.`,
+    );
+  const parsed = JSON.parse(readFileSync(snapshotPath, "utf8")) as {
+    snapshots?: SnapshotSource[];
+  };
+  const snapshots = parsed.snapshots ?? [];
+  if (snapshots.length === 0)
+    throw new Error(`${SNAPSHOT_FILE} lists no acquired sources.`);
+  return snapshots;
+}
+
+/**
+ * Collect the candidate fonts for one source from the cache. GitHub tree sources read each snapshot file
+ * entry directly; archive sources list and extract font members from the cached release archive. Throws
+ * when an expected cache file is absent so the caller can point the user back at `bun run corpus:acquire`.
+ */
+export function collectCandidates(
+  source: SnapshotSource,
+  cacheDir: string,
+): CandidateFile[] {
+  if (source.kind === "github-tree") {
+    const files = source.files ?? [];
+    if (files.length === 0)
+      throw new Error(`no candidate files listed for ${source.sourceId}`);
+    return files.map((entry) => {
+      const filePath = join(cacheDir, entry.path);
+      if (!existsSync(filePath))
+        throw new Error(
+          `candidate file missing for ${source.sourceId}: ${filePath}. Run \`bun run corpus:acquire\` first.`,
+        );
+      return { file: entry.name, bytes: readFileSync(filePath) };
+    });
+  }
+
+  const format = archiveFormatOf(source);
+  const archivePath = join(
+    cacheDir,
+    `${source.sourceId}.${archiveExtensions[format]}`,
+  );
+  if (!existsSync(archivePath))
+    throw new Error(
+      `candidate archive missing for ${source.sourceId}: ${archivePath}. Run \`bun run corpus:acquire\` first.`,
+    );
+  const members = listFontMembers(archivePath, format);
+  if (members.length === 0)
+    throw new Error(`no candidate font files in ${archivePath}`);
+
+  const basenameCounts = new Map<string, number>();
+  for (const member of members) {
+    const file = basename(member);
+    basenameCounts.set(file, (basenameCounts.get(file) ?? 0) + 1);
+  }
+  const duplicateBasenames = new Set(
+    [...basenameCounts].filter(([, count]) => count > 1).map(([file]) => file),
+  );
+
+  return members.map((member) => ({
+    file: displayNameForMember(member, duplicateBasenames),
+    bytes: readArchiveMember(archivePath, member, format),
+  }));
+}
+
+function displayNameForMember(
+  member: string,
+  duplicateBasenames: Set<string>,
+): string {
+  const file = basename(member);
+  return duplicateBasenames.has(file) ? member : file;
+}
diff --git a/tools/corpus/src/font.ts b/tools/corpus/src/font.ts
new file mode 100644
index 0000000..05da3d0
--- /dev/null
+++ b/tools/corpus/src/font.ts
@@ -0,0 +1,203 @@
+import { LATIN_SAMPLE } from "./samples";
+
+const REQUIRED_TABLES = ["head", "maxp", "hhea", "hmtx", "cmap"] as const;
+
+/** A parsed font's em size plus a normalized advance lookup over its Unicode `cmap`. */
+export interface FontMetrics {
+  unitsPerEm: number;
+  /** Advance width of a codepoint as a fraction of the em, or undefined when the font does not map it. */
+  normalizedAdvance(codepoint: number): number | undefined;
+}
+
+function tagAt(view: DataView, offset: number): string {
+  // An SFNT tag is four consecutive bytes, each read as one character code.
+  let tag = "";
+  for (let index = 0; index < 4; index++) {
+    tag += String.fromCharCode(view.getUint8(offset + index));
+  }
+  return tag;
+}
+
+/** Resolve a codepoint to a glyph id within one `cmap` subtable, for the formats we support (4, 6, 12). */
+function makeCmapLookup(
+  view: DataView,
+  subOffset: number,
+): (codepoint: number) => number | undefined {
+  const format = view.getUint16(subOffset);
+
+  if (format === 4) {
+    const segX2 = view.getUint16(subOffset + 6);
+    const segCount = segX2 / 2;
+    const endOffset = subOffset + 14;
+    const startOffset = endOffset + segX2 + 2; // skip reservedPad
+    const deltaOffset = startOffset + segX2;
+    const rangeOffsetBase = deltaOffset + segX2;
+    return (cp) => {
+      if (cp > 0xffff) return undefined;
+      for (let i = 0; i < segCount; i++) {
+        const end = view.getUint16(endOffset + i * 2);
+        if (cp > end) continue;
+        const start = view.getUint16(startOffset + i * 2);
+        if (cp < start) return undefined;
+        const delta = view.getInt16(deltaOffset + i * 2);
+        const rangeOffset = view.getUint16(rangeOffsetBase + i * 2);
+        if (rangeOffset === 0) {
+          const gid = (cp + delta) & 0xffff;
+          return gid === 0 ? undefined : gid;
+        }
+        const glyphOffset =
+          rangeOffsetBase + i * 2 + rangeOffset + (cp - start) * 2;
+        const raw = view.getUint16(glyphOffset);
+        if (raw === 0) return undefined;
+        const gid = (raw + delta) & 0xffff;
+        return gid === 0 ? undefined : gid;
+      }
+      return undefined;
+    };
+  }
+
+  if (format === 6) {
+    const firstCode = view.getUint16(subOffset + 6);
+    const entryCount = view.getUint16(subOffset + 8);
+    return (cp) => {
+      if (cp < firstCode || cp >= firstCode + entryCount) return undefined;
+      const gid = view.getUint16(subOffset + 10 + (cp - firstCode) * 2);
+      return gid === 0 ? undefined : gid;
+    };
+  }
+
+  if (format === 12) {
+    const numGroups = view.getUint32(subOffset + 12);
+    const groupsOffset = subOffset + 16;
+    return (cp) => {
+      let lo = 0;
+      let hi = numGroups - 1;
+      while (lo <= hi) {
+        const mid = (lo + hi) >> 1;
+        const g = groupsOffset + mid * 12;
+        const start = view.getUint32(g);
+        const end = view.getUint32(g + 4);
+        if (cp < start) hi = mid - 1;
+        else if (cp > end) lo = mid + 1;
+        else {
+          const gid = view.getUint32(g + 8) + (cp - start);
+          return gid === 0 ? undefined : gid;
+        }
+      }
+      return undefined;
+    };
+  }
+
+  throw new Error(`unsupported cmap subtable format: ${format}`);
+}
+
+/** Pick the best Unicode `cmap` subtable and return its glyph lookup. */
+function readCmap(
+  view: DataView,
+  cmapOffset: number,
+): (codepoint: number) => number | undefined {
+  const numSubtables = view.getUint16(cmapOffset + 2);
+  const candidates: { score: number; offset: number }[] = [];
+  for (let i = 0; i < numSubtables; i++) {
+    const recordOffset = cmapOffset + 4 + i * 8;
+    const platformId = view.getUint16(recordOffset);
+    const encodingId = view.getUint16(recordOffset + 2);
+    const score = cmapPreference(platformId, encodingId);
+    if (score === null) continue;
+    candidates.push({
+      score,
+      offset: cmapOffset + view.getUint32(recordOffset + 4),
+    });
+  }
+  candidates.sort((a, b) => b.score - a.score);
+
+  for (const candidate of candidates) {
+    const format = view.getUint16(candidate.offset);
+    if (format === 4 || format === 6 || format === 12)
+      return makeCmapLookup(view, candidate.offset);
+  }
+  throw new Error("unsupported font: no readable Unicode cmap subtable");
+}
+
+/** Rank Unicode `cmap` subtables (full Unicode first, then BMP); null for non-Unicode subtables. */
+function cmapPreference(platformId: number, encodingId: number): number | null {
+  if (platformId === 3 && encodingId === 10) return 4; // Windows Unicode UCS-4
+  if (platformId === 0 && (encodingId === 4 || encodingId === 6)) return 3; // Unicode full
+  if (platformId === 3 && encodingId === 1) return 2; // Windows Unicode BMP
+  if (platformId === 0) return 1; // Unicode BMP and earlier
+  return null; // Macintosh, Windows symbol, and anything else: not a Unicode cmap
+}
+
+/**
+ * Parse just enough of an SFNT font (TrueType or CFF/OTF) to read normalized advance widths by
+ * codepoint. Throws an explicit error when the container is a collection or a required table is missing.
+ */
+export function parseFont(bytes: Uint8Array): FontMetrics {
+  const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
+  if (bytes.byteLength < 12)
+    throw new Error("unsupported font: file is too small to be an SFNT");
+
+  const sfntVersion = view.getUint32(0);
+  if (sfntVersion === 0x74746366)
+    throw new Error("unsupported font: TrueType/OpenType collections (ttcf)");
+  const isSfnt =
+    sfntVersion === 0x00010000 ||
+    sfntVersion === 0x4f54544f ||
+    sfntVersion === 0x74727565;
+  if (!isSfnt)
+    throw new Error(
+      `unsupported font: not an SFNT (sfntVersion 0x${sfntVersion.toString(16)})`,
+    );
+
+  const numTables = view.getUint16(4);
+  const tables = new Map<string, number>();
+  for (let i = 0; i < numTables; i++) {
+    const recordOffset = 12 + i * 16;
+    tables.set(tagAt(view, recordOffset), view.getUint32(recordOffset + 8));
+  }
+
+  const missing = REQUIRED_TABLES.filter((tag) => !tables.has(tag));
+  if (missing.length > 0)
+    throw new Error(
+      `unsupported font: missing required table(s): ${missing.join(", ")}`,
+    );
+
+  const headOffset = tables.get("head") as number;
+  const unitsPerEm = view.getUint16(headOffset + 18);
+  if (unitsPerEm === 0)
+    throw new Error("unsupported font: head.unitsPerEm is zero");
+
+  const numberOfHMetrics = view.getUint16((tables.get("hhea") as number) + 34);
+  if (numberOfHMetrics === 0)
+    throw new Error("unsupported font: hhea.numberOfHMetrics is zero");
+
+  const hmtxOffset = tables.get("hmtx") as number;
+  const advanceOfGlyph = (glyphId: number): number => {
+    const index = glyphId < numberOfHMetrics ? glyphId : numberOfHMetrics - 1;
+    return view.getUint16(hmtxOffset + index * 4);
+  };
+
+  const lookup = readCmap(view, tables.get("cmap") as number);
+
+  return {
+    unitsPerEm,
+    normalizedAdvance(codepoint: number): number | undefined {
+      const glyphId = lookup(codepoint);
+      if (glyphId === undefined) return undefined;
+      return advanceOfGlyph(glyphId) / unitsPerEm;
+    },
+  };
+}
+
+/** Map each sample codepoint the font covers to its normalized advance; unmapped ones are left out. */
+export function sampleMetrics(
+  font: FontMetrics,
+  sample: readonly number[] = LATIN_SAMPLE,
+): Map<number, number> {
+  const advances = new Map<number, number>();
+  sample.forEach((codepoint) => {
+    const width = font.normalizedAdvance(codepoint);
+    if (width !== undefined) advances.set(codepoint, width);
+  });
+  return advances;
+}
diff --git a/tools/corpus/src/report.ts b/tools/corpus/src/report.ts
new file mode 100644
index 0000000..6fa75c8
--- /dev/null
+++ b/tools/corpus/src/report.ts
@@ -0,0 +1,83 @@
+import type { CompareScore, GlyphDelta } from "./score";
+import { TIER_RANK } from "./tiers";
+
+interface CompareRow {
+  sourceId: string;
+  file: string;
+  score: CompareScore;
+}
+
+interface RenderOptions {
+  limit?: number | null;
+}
+
+function formatCodepoint(cp: number): string {
+  return `U+${cp.toString(16).toUpperCase().padStart(4, "0")}`;
+}
+
+function formatDelta(value: number): string {
+  return Number.isNaN(value) ? "n/a" : value.toFixed(4);
+}
+
+function formatWorst(worst: GlyphDelta[]): string {
+  if (worst.length === 0) return "-";
+  return worst
+    .map((g) => `${formatCodepoint(g.codepoint)} ${g.delta.toFixed(4)}`)
+    .join("; ");
+}
+
+/** Render the ranked table. Returned as a string so it can be tested without capturing stdout. */
+export function renderReport(
+  rows: CompareRow[],
+  options: RenderOptions = {},
+): string {
+  const ranked = [...rows].sort((a, b) => {
+    const tierDiff = TIER_RANK[a.score.tier] - TIER_RANK[b.score.tier];
+    if (tierDiff !== 0) return tierDiff;
+    const aMean = Number.isNaN(a.score.meanDelta)
+      ? Infinity
+      : a.score.meanDelta;
+    const bMean = Number.isNaN(b.score.meanDelta)
+      ? Infinity
+      : b.score.meanDelta;
+    return aMean - bMean;
+  });
+
+  const visible =
+    options.limit === null ? ranked : ranked.slice(0, options.limit);
+
+  const header = [
+    "source",
+    "file",
+    "mean",
+    "max",
+    "tier",
+    "coverage",
+    "missing",
+    "over1",
+    "over2.5",
+    "worst",
+  ];
+  const body = visible.map((row) => [
+    row.sourceId,
+    row.file,
+    formatDelta(row.score.meanDelta),
+    formatDelta(row.score.maxDelta),
+    row.score.tier,
+    `${row.score.compared}/${row.score.total}`,
+    String(row.score.missing),
+    String(row.score.over1Percent),
+    String(row.score.over2_5Percent),
+    formatWorst(row.score.worstGlyphs),
+  ]);
+
+  const widths = header.map((h, col) =>
+    Math.max(h.length, ...body.map((r) => r[col].length)),
+  );
+  const line = (cells: string[]) =>
+    cells
+      .map((cell, col) => cell.padEnd(widths[col]))
+      .join("  ")
+      .trimEnd();
+  return [line(header), ...body.map(line)].join("\n");
+}
diff --git a/tools/corpus/src/samples.ts b/tools/corpus/src/samples.ts
new file mode 100644
index 0000000..bc1418e
--- /dev/null
+++ b/tools/corpus/src/samples.ts
@@ -0,0 +1,70 @@
+/** Every codepoint from `start` through `end`, both bounds included (empty when `end < start`). */
+function codepointRange(start: number, end: number): number[] {
+  // `+ 1` keeps `end` itself in the list; the clamp turns an inverted range into an empty one.
+  const count = Math.max(0, end - start + 1);
+  return Array.from({ length: count }, (_, offset) => start + offset);
+}
+
+/**
+ * Fixed Latin sample for advance comparison: every printable ASCII codepoint (U+0020 space through
+ * U+007E tilde), the Latin-1 Supplement block U+00A0-U+00FF except the U+00AD soft hyphen, and common
+ * typographic punctuation/symbols a document is likely to use. Named and tested so the metric is
+ * reproducible. Stored as numeric codepoints, sorted and unique.
+ */
+export const LATIN_SAMPLE: readonly number[] = (() => {
+  const latin1 = codepointRange(0x00a0, 0x00ff).filter((cp) => cp !== 0x00ad);
+  const generalPunctuation = [
+    0x2013, 0x2014, 0x2018, 0x2019, 0x201c, 0x201d, 0x2020, 0x2021, 0x2022,
+    0x2026, 0x2030, 0x2039, 0x203a, 0x20ac, 0x2122,
+  ];
+  const all = [...codepointRange(0x20, 0x7e), ...latin1, ...generalPunctuation];
+  return [...new Set(all)].sort((a, b) => a - b);
+})();
+
+const TEXT_PUNCTUATION = new Set([
+  0x20, // space
+  0x21, // !
+  0x22, // "
+  0x23, // #
+  0x26, // &
+  0x27, // '
+  0x28, // (
+  0x29, // )
+  0x2c, // ,
+  0x2d, // -
+  0x2e, // .
+  0x2f, // /
+  0x3a, // :
+  0x3b, // ;
+  0x3f, // ?
+  0x40, // @
+  0x5b, // [
+  0x5d, // ]
+  0x7b, // {
+  0x7d, // }
+  0x00a0, // no-break space
+  0x2013, // en dash
+  0x2014, // em dash
+  0x2018, // left single quote
+  0x2019, // right single quote
+  0x201c, // left double quote
+  0x201d, // right double quote
+  0x2026, // ellipsis
+]);
+
+const EXCLUDED_TEXT_LETTERS = new Set([
+  0x00b5, // micro sign: Unicode treats it as a letter, but it behaves like a symbol here.
+]);
+
+function isTextLetterOrDigit(codepoint: number): boolean {
+  if (EXCLUDED_TEXT_LETTERS.has(codepoint)) return false; // explicit opt-outs win over the Unicode class test
+  return /^[\p{L}\p{N}]$/u.test(String.fromCodePoint(codepoint)); // NOTE(review): \p{N} is wider than digits (also U+00B2/B3/B9, U+00BC-U+00BE) - confirm intended.
+}
+
+/**
+ * Text-carrying Latin sample used to rank proportional-font candidates. The full sample still reports
+ * outliers, but rare symbols should not hide a strong body-text lead.
+ */
+export const LATIN_TEXT_SAMPLE: readonly number[] = LATIN_SAMPLE.filter(
+  (cp) => TEXT_PUNCTUATION.has(cp) || isTextLetterOrDigit(cp),
+);
diff --git a/tools/corpus/src/score.ts b/tools/corpus/src/score.ts
new file mode 100644
index 0000000..48790f6
--- /dev/null
+++ b/tools/corpus/src/score.ts
@@ -0,0 +1,157 @@
+import { LATIN_SAMPLE } from "./samples";
+import { type CompareModel, type CompareTier, classifyTier } from "./tiers";
+
+/** One codepoint whose advance diverges, for the "worst glyphs" column. */
+export interface GlyphDelta {
+  codepoint: number; // codepoint taken from the scored sample
+  delta: number; // absolute difference between the reference and candidate advances
+}
+
+/** The advance-parity score of one candidate font against the reference, over a fixed sample. */
+export interface CompareScore {
+  /** codepoints in the tier sample that both fonts map. */
+  compared: number;
+  /** tier sample size. */
+  total: number;
+  /** tier sample codepoints not mapped by both fonts. */
+  missing: number;
+  meanDelta: number; // mean absolute delta over the compared tier-sample codepoints; NaN when compared is 0
+  maxDelta: number; // largest absolute delta over the same codepoints; NaN when compared is 0
+  /** shared report-sample codepoints whose advance delta exceeds the metric_safe max threshold. */
+  over1Percent: number;
+  /** shared report-sample codepoints whose advance delta exceeds the near_metric max threshold. */
+  over2_5Percent: number;
+  tier: CompareTier; // "visual_only" when compared is 0, otherwise the classifyTier result for mean/max
+  worstGlyphs: GlyphDelta[]; // largest non-zero deltas from the report sample, biggest first
+}
+
+export interface ScoreOptions {
+  /** Sample used for outlier reporting and worst-glyph display. Defaults to `LATIN_SAMPLE`. */
+  reportSample?: readonly number[];
+  /** Sample used for tier classification and mean/max columns. Defaults to `reportSample`. */
+  tierSample?: readonly number[];
+  worstCount?: number; // upper bound on the worstGlyphs list; defaults to 3
+  model?: CompareModel; // classification model handed to classifyTier; defaults to "latin"
+}
+
+interface MeasuredDeltas { // raw measurements of one sample pass, before any tier is assigned
+  compared: number; // sample codepoints mapped by both fonts
+  total: number; // sample length
+  missing: number; // total minus compared
+  meanDelta: number; // NaN when compared is 0
+  maxDelta: number; // NaN when compared is 0
+  over1Percent: number; // compared codepoints with delta > 0.01
+  over2_5Percent: number; // compared codepoints with delta > 0.025
+  worstGlyphs: GlyphDelta[]; // non-zero deltas, largest first, at most worstCount entries
+}
+
+function measureDeltas( // measures absolute advance deltas between two advance maps over one sample
+  reference: ReadonlyMap<number, number>,
+  candidate: ReadonlyMap<number, number>,
+  sample: readonly number[],
+  worstCount: number, // upper bound on the returned worstGlyphs list
+): MeasuredDeltas {
+  const deltas: GlyphDelta[] = [];
+  let sum = 0;
+  let max = 0;
+  let over1Percent = 0;
+  let over2_5Percent = 0;
+  for (const cp of sample) {
+    const a = reference.get(cp);
+    const b = candidate.get(cp);
+    if (a === undefined || b === undefined) continue; // only codepoints mapped by both fonts are compared
+    const delta = Math.abs(a - b);
+    deltas.push({ codepoint: cp, delta });
+    sum += delta;
+    if (delta > max) max = delta;
+    if (delta > 0.01) over1Percent++; // strict comparison: a delta of exactly 0.01 is not counted
+    if (delta > 0.025) over2_5Percent++; // strict comparison: a delta of exactly 0.025 is not counted
+  }
+
+  const compared = deltas.length;
+  const meanDelta = compared === 0 ? Number.NaN : sum / compared; // NaN rather than 0: nothing was measured
+  const maxDelta = compared === 0 ? Number.NaN : max;
+  const worstGlyphs = [...deltas] // copy so the sort leaves `deltas` in sample order
+    .sort((x, y) => y.delta - x.delta) // largest delta first
+    .slice(0, worstCount)
+    .filter((g) => g.delta > 0); // exact matches are never listed as "worst"
+
+  return {
+    compared,
+    total: sample.length,
+    missing: sample.length - compared,
+    meanDelta,
+    maxDelta,
+    over1Percent,
+    over2_5Percent,
+    worstGlyphs,
+  };
+}
+
+function normalizeScoreOptions( // resolves the positional (sample) call form and the options-object form
+  optionsOrSample: ScoreOptions | readonly number[] | undefined,
+  worstCount: number | undefined,
+  model: CompareModel | undefined,
+): Required<ScoreOptions> {
+  if (!optionsOrSample || Array.isArray(optionsOrSample)) {
+    const reportSample = optionsOrSample ?? LATIN_SAMPLE; // positional form: one sample serves both roles
+    return {
+      reportSample,
+      tierSample: reportSample,
+      worstCount: worstCount ?? 3,
+      model: model ?? "latin",
+    };
+  }
+
+  const options = optionsOrSample as ScoreOptions; // Array.isArray cannot narrow the readonly array away
+  const reportSample = options.reportSample ?? LATIN_SAMPLE;
+  return {
+    reportSample,
+    tierSample: options.tierSample ?? reportSample,
+    worstCount: options.worstCount ?? worstCount ?? 3, // positional value fills the gap instead of being dropped
+    model: options.model ?? model ?? "latin", // same fallback order: options field, positional arg, default
+  };
+}
+
+/**
+ * Score one candidate against the reference. The tier can use a narrower text sample while the report
+ * still surfaces full-sample outliers. Both inputs are normalized advance maps (codepoint ->
+ * advance/unitsPerEm); only codepoints present in both are compared.
+ */
+export function scoreAdvances(
+  reference: ReadonlyMap<number, number>,
+  candidate: ReadonlyMap<number, number>,
+  optionsOrSample?: ScoreOptions | readonly number[], // an options object, or a bare report sample
+  worstCount?: number,
+  model?: CompareModel,
+): CompareScore {
+  const options = normalizeScoreOptions(optionsOrSample, worstCount, model);
+  const report = measureDeltas(
+    reference,
+    candidate,
+    options.reportSample,
+    options.worstCount,
+  );
+  const tierMetrics =
+    options.tierSample === options.reportSample // same array instance: reuse the report pass
+      ? report
+      : measureDeltas(reference, candidate, options.tierSample, 0); // worstCount 0: worst glyphs come from `report`
+  return {
+    compared: tierMetrics.compared, // coverage and mean/max columns describe the tier sample
+    total: tierMetrics.total,
+    missing: tierMetrics.missing,
+    meanDelta: tierMetrics.meanDelta,
+    maxDelta: tierMetrics.maxDelta,
+    over1Percent: report.over1Percent, // outlier counts describe the report sample
+    over2_5Percent: report.over2_5Percent,
+    tier:
+      tierMetrics.compared === 0 // no shared codepoints: mean/max are NaN, so skip classification
+        ? "visual_only"
+        : classifyTier(
+            tierMetrics.meanDelta,
+            tierMetrics.maxDelta,
+            options.model,
+          ),
+    worstGlyphs: report.worstGlyphs,
+  };
+}
diff --git a/tools/corpus/src/tiers.ts b/tools/corpus/src/tiers.ts
new file mode 100644
index 0000000..a8d83e2
--- /dev/null
+++ b/tools/corpus/src/tiers.ts
@@ -0,0 +1,41 @@
+/**
+ * Advance-fidelity tier. Thresholds mirror the package's verdict language (see `src/types.ts`):
+ * metric_safe is the DIRECT band, near_metric the LIKELY band, everything else visual_only.
+ * cell_width_only is the monospace model's verdict for a matching cell: it proves line width, not
+ * glyph-shape fidelity.
+ */
+export type CompareTier =
+  | "metric_safe" // latin model: mean <= 0.005 and max <= 0.01 (fractions of the em)
+  | "near_metric" // latin model: mean <= 0.01 and max <= 0.025
+  | "cell_width_only" // monospace model: either band above matched
+  | "visual_only"; // neither band matched
+
+/**
+ * Classification model. `latin` is the default proportional comparison. `monospace` treats a matching
+ * advance as proof of cell width only, since every glyph in a monospace cell shares one advance.
+ */
+export type CompareModel = "latin" | "monospace"; // classifyTier falls back to "latin" when omitted
+
+export const TIER_RANK: Record<CompareTier, number> = { // presumably a sort key, lower = stronger tier - confirm against callers
+  metric_safe: 0,
+  near_metric: 1,
+  cell_width_only: 2,
+  visual_only: 3,
+};
+
+/**
+ * Classify a (mean, max) advance-delta pair into a fidelity tier. Deltas are fractions of the em. Under
+ * the monospace model a matching cell only vouches for line width, so the metric bands collapse to
+ * cell_width_only while non-matching candidates stay visual_only.
+ */
+export function classifyTier(
+  meanDelta: number,
+  maxDelta: number,
+  model: CompareModel = "latin",
+): CompareTier {
+  let tier: CompareTier = "visual_only"; // stays visual_only when no band matches, including NaN inputs
+  if (meanDelta <= 0.005 && maxDelta <= 0.01) tier = "metric_safe"; // both bounds are inclusive
+  else if (meanDelta <= 0.01 && maxDelta <= 0.025) tier = "near_metric"; // both bounds are inclusive
+  if (model === "monospace" && tier !== "visual_only") return "cell_width_only"; // collapse both metric bands
+  return tier;
+}

From 0461f70b2df4041e8a6ea4ecb02431ec09d654fa Mon Sep 17 00:00:00 2001
From: Caio Pizzol <caiopizzol@icloud.com>
Date: Mon, 8 Jun 2026 18:00:21 -0300
Subject: [PATCH 4/4] ci: skip fallbacks release without package changes

---
 .github/workflows/release.yml | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 4f27535..1e778db 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -25,17 +25,37 @@ jobs:
       - uses: actions/checkout@v4
         with:
           fetch-depth: 0
+      - name: Check package changes # writes the `changed` output that the steps below test in their `if:`
+        id: package_changes
+        run: | # no reachable v* tag => changed=true; otherwise diff packages/fallbacks against the latest such tag
+          latest_tag="$(git describe --tags --match 'v*' --abbrev=0 2>/dev/null || true)"
+          if [ -z "$latest_tag" ]; then
+            echo "changed=true" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          if git diff --quiet "$latest_tag"..HEAD -- packages/fallbacks; then
+            echo "changed=false" >> "$GITHUB_OUTPUT"
+            echo "No packages/fallbacks changes since $latest_tag."
+          else
+            echo "changed=true" >> "$GITHUB_OUTPUT"
+          fi
       - uses: oven-sh/setup-bun@v2
+        if: steps.package_changes.outputs.changed == 'true'
         with:
           bun-version: 1.3.12
       - run: bun install --frozen-lockfile
+        if: steps.package_changes.outputs.changed == 'true'
       - run: bun run build
+        if: steps.package_changes.outputs.changed == 'true'
       - name: Clear Bun install tree before npm publish
+        if: steps.package_changes.outputs.changed == 'true'
         run: rm -rf node_modules packages/*/node_modules
       - env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
           ANTHROPIC_API_KEY_RELEASE_NOTES: ${{ secrets.ANTHROPIC_API_KEY_RELEASE_NOTES }}
+        if: steps.package_changes.outputs.changed == 'true'
         run: >
           npx --yes
           --package semantic-release@24