Skip to content

Commit 52e22eb

Browse files
committed
better pricing estimate
1 parent fe4e3c5 commit 52e22eb

2 files changed

Lines changed: 142 additions & 32 deletions

File tree

evals/README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,21 @@ table; another reason to use a large-context endpoint). Use `--repeat >= 3` to
162162
average out model nondeterminism. Per-run detail is written to
163163
`out/schema_fields_eval.jsonl`.
164164

165+
> Authoritative billed cost: when `--base-url` is the Copilot API
166+
> (`https://api.githubcopilot.com`), every response carries a vendor
167+
> `copilot_usage.token_details` block with the **real per-type prices** the
168+
> billing system uses (input / cache_read / cache_write / output) plus a summed
169+
> `total_nano_aiu`. The harness reads it straight off each response and reports an
170+
> **authoritative billed cost in AIU** (AI credits — the native billing unit),
171+
> including the `cache_write` bucket that OpenAI-style usage never exposes. This is
172+
> the same source the Copilot agent runtime benchmarks use, so no hand-typed
173+
> prices are involved. For other endpoints (GitHub Models, OpenAI) that don't
174+
> return `copilot_usage`, the harness falls back to a flat per-1M-token estimate from
175+
> `--price-prompt` / `--price-cached` / `--price-completion`. A credit→USD rate is
176+
> account-specific and non-public, so the cost is reported in AIU by default; pass
177+
> `--aiu-to-usd <rate>` only if you know yours and want the billed-cost tables in
178+
> dollars.
179+
165180
> Task design matters: the default tasks are intentionally **neutral** (they do
166181
> not tell the model to "return only X"). Biasing prompts toward terse answers
167182
> would inflate the filtering arms. Keep a balanced mix of narrow/full/neutral.

evals/schema_fields_eval.py

Lines changed: 127 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,9 @@ def run_task(
234234
]
235235
prompt_tokens = completion_tokens = turns = tool_calls = fields_calls = 0
236236
cached_tokens = 0
237+
cache_write_tokens = 0
238+
billed_aiu_nano = 0.0
239+
saw_billing = False
237240
tool_errors = 0
238241
final_text = ""
239242
error = ""
@@ -264,6 +267,35 @@ def run_task(
264267
if details is not None:
265268
cached_tokens += getattr(details, "cached_tokens", 0) or 0
266269

270+
# AUTHORITATIVE billed cost (Copilot API only). The Copilot endpoint returns
271+
# a vendor `copilot_usage` block alongside the standard usage. The OpenAI SDK
272+
# drops unknown fields from the typed object but preserves them on
273+
# `model_extra`, so we read it there (it lives at the response root, not under
274+
# `usage`). `token_details` carries the real per-type prices the billing
275+
# system uses -- input / cache_read / cache_write / output -- as
276+
# cost_per_batch nano-AIU per batch_size tokens, plus a pre-summed
277+
# `total_nano_aiu`. Using these gives the exact billed cost (no hand-typed
278+
# prices) AND the cache_write bucket that OpenAI-style usage never reports.
279+
copilot_usage = (getattr(resp, "model_extra", None) or {}).get("copilot_usage")
280+
if isinstance(copilot_usage, dict):
281+
details_list = copilot_usage.get("token_details") or []
282+
for d in details_list:
283+
if d.get("token_type") == "cache_write":
284+
cache_write_tokens += d.get("token_count", 0) or 0
285+
total = copilot_usage.get("total_nano_aiu")
286+
if total is not None:
287+
billed_aiu_nano += total
288+
saw_billing = True
289+
elif details_list:
290+
# Fall back to summing per-type cost if the endpoint omits the total.
291+
for d in details_list:
292+
batch = d.get("batch_size", 0) or 0
293+
if batch:
294+
billed_aiu_nano += (d.get("token_count", 0) or 0) * (
295+
d.get("cost_per_batch", 0) or 0
296+
) / batch
297+
saw_billing = True
298+
267299
msg = resp.choices[0].message
268300
if not msg.tool_calls:
269301
# A text-only assistant message. Models like Claude often emit a
@@ -342,7 +374,9 @@ def run_task(
342374
return {
343375
"prompt_tokens": prompt_tokens,
344376
"cached_prompt_tokens": cached_tokens,
377+
"cache_write_tokens": cache_write_tokens,
345378
"completion_tokens": completion_tokens,
379+
"billed_aiu_nano": billed_aiu_nano if saw_billing else None,
346380
"turns": turns,
347381
"tool_calls": tool_calls,
348382
"fields_calls": fields_calls,
@@ -361,14 +395,18 @@ def summarize(
361395
price_prompt: float = 3.0,
362396
price_cached: float = 0.30,
363397
price_completion: float = 15.0,
398+
aiu_to_usd: float | None = None,
364399
) -> None:
365400
"""Print a 3-scenario comparison plus a per-task-type breakdown.
366401
367402
A task-run only counts toward the token comparison if ALL three arms
368403
succeeded for it, so every comparison is apples-to-apples.
369404
370405
Token types are billed differently, so we report prompt vs completion
371-
separately and apply per-type prices (per 1M tokens) for a true cost view.
406+
separately. When the endpoint is the Copilot API it returns the real per-type
407+
prices on every response (`copilot_usage.token_details`); we use those for an
408+
AUTHORITATIVE billed cost (in AIU, the actual billing unit). For other
409+
endpoints we fall back to the flat per-1M `--price-*` estimate.
372410
"""
373411
from collections import defaultdict
374412

@@ -434,10 +472,24 @@ def arm_prompt(arm: str) -> int:
434472
def arm_sum(arm: str, field: str) -> int:
435473
return sum(by_key[k][arm].get(field, 0) for k in valid)
436474

437-
def record_cost(r: dict) -> float:
438-
# Per-run USD cost from the three token types. OpenAI-style usage reports
439-
# cached tokens as a SUBSET of prompt_tokens, so bill the uncached remainder
440-
# at the input price and the cached part at the cheaper cache-read price.
475+
# AUTHORITATIVE vs ESTIMATED cost. If any run carries `billed_aiu_nano`, the
476+
# Copilot API gave us the real per-type billed cost, so we report that (in AIU,
477+
# the actual billing unit -- or USD if a credit rate is supplied). Otherwise we
478+
# fall back to the flat per-1M `--price-*` estimate for endpoints (GitHub Models,
479+
# OpenAI) that don't return `copilot_usage`.
480+
billing_mode = any(
481+
by_key[k][a].get("billed_aiu_nano") is not None for k in valid for a in ARM_ORDER
482+
)
483+
cost_unit = ("$" if aiu_to_usd else "AIU") if billing_mode else "$"
484+
cdp = 4 if cost_unit == "$" else 6 # cost decimal places
485+
486+
def cost_of(r: dict) -> float:
487+
if billing_mode:
488+
aiu = (r.get("billed_aiu_nano") or 0.0) / 1e9 # nano-AIU -> AIU
489+
return aiu * aiu_to_usd if aiu_to_usd else aiu
490+
# Flat-price fallback. OpenAI-style usage reports cached tokens as a SUBSET
491+
# of prompt_tokens, so bill the uncached remainder at the input price and the
492+
# cached part at the cheaper cache-read price.
441493
prompt = r.get("prompt_tokens", 0)
442494
cached = r.get("cached_prompt_tokens", 0)
443495
uncached = max(prompt - cached, 0)
@@ -447,19 +499,26 @@ def record_cost(r: dict) -> float:
447499
) / 1_000_000.0
448500

449501
def arm_cost(arm: str) -> float:
450-
return sum(record_cost(by_key[k][arm]) for k in valid)
502+
return sum(cost_of(by_key[k][arm]) for k in valid)
451503

452504
any_cached = any(arm_sum(a, "cached_prompt_tokens") for a in ARM_ORDER)
505+
any_cache_write = any(arm_sum(a, "cache_write_tokens") for a in ARM_ORDER)
453506
base_completion = arm_sum("baseline", "completion_tokens")
454507
print("\nPROMPT vs COMPLETION (cumulative over valid runs):")
455-
print(f" {'scenario':<24}{'prompt':>11}{'cached':>10}{'completion':>12}{'comp Δ% vs S1':>15}")
508+
cw_head = f"{'cache_wr':>10}" if any_cache_write else ""
509+
print(
510+
f" {'scenario':<24}{'prompt':>11}{'cached':>10}{cw_head}"
511+
f"{'completion':>12}{'comp Δ% vs S1':>15}"
512+
)
456513
for arm in ARM_ORDER:
457514
prompt = arm_sum(arm, "prompt_tokens")
458515
cached = arm_sum(arm, "cached_prompt_tokens")
459516
completion = arm_sum(arm, "completion_tokens")
460517
cpct = (100.0 * (completion - base_completion) / base_completion) if base_completion else 0.0
518+
cw_cell = f"{arm_sum(arm, 'cache_write_tokens'):>10}" if any_cache_write else ""
461519
print(
462-
f" {SCENARIO_LABEL[arm]:<24}{prompt:>11}{cached:>10}{completion:>12}{cpct:>+15.1f}"
520+
f" {SCENARIO_LABEL[arm]:<24}{prompt:>11}{cached:>10}{cw_cell}"
521+
f"{completion:>12}{cpct:>+15.1f}"
463522
)
464523
if not any_cached:
465524
print(
@@ -469,25 +528,43 @@ def arm_cost(arm: str) -> float:
469528
)
470529

471530
base_cost = arm_cost("baseline")
472-
print(
473-
f"\nESTIMATED COST (USD; prices per 1M tok -- prompt={price_prompt}, "
474-
f"cached={price_cached}, completion={price_completion}):"
475-
)
476-
print(f" {'scenario':<24}{'cost($)':>11}{'Δ vs S1':>12}{'Δ%':>9}")
531+
if billing_mode:
532+
unit_note = (
533+
f"USD via --aiu-to-usd={aiu_to_usd}"
534+
if aiu_to_usd
535+
else "AIU = AI credits; pass --aiu-to-usd to convert"
536+
)
537+
print(f"\nBILLED COST (authoritative, from Copilot token_details; {unit_note}):")
538+
else:
539+
print(
540+
f"\nESTIMATED COST (USD; prices per 1M tok -- prompt={price_prompt}, "
541+
f"cached={price_cached}, completion={price_completion}):"
542+
)
543+
print(f" {'scenario':<24}{'cost(' + cost_unit + ')':>11}{'Δ vs S1':>12}{'Δ%':>9}")
477544
for arm in ARM_ORDER:
478545
cost = arm_cost(arm)
479546
delta = cost - base_cost
480547
cpct = (100.0 * delta / base_cost) if base_cost else 0.0
481548
sign = "+" if delta >= 0 else ""
482549
print(
483-
f" {SCENARIO_LABEL[arm]:<24}{cost:>11.4f}{sign + f'{delta:.4f}':>12}{sign + f'{cpct:.1f}':>9}"
550+
f" {SCENARIO_LABEL[arm]:<24}{cost:>11.{cdp}f}"
551+
f"{sign + f'{delta:.{cdp}f}':>12}{sign + f'{cpct:.1f}':>9}"
552+
)
553+
if billing_mode:
554+
print(
555+
" (Real per-type prices the billing system used -- input, cache_read,\n"
556+
" cache_write, output -- summed straight from each response's\n"
557+
" copilot_usage.token_details. AIU is the native unit (1 AIU = 1e9\n"
558+
" nano-AIU); a credit->USD rate is account-specific, so pass --aiu-to-usd\n"
559+
" only if you know yours.)"
560+
)
561+
else:
562+
print(
563+
" (Input/prompt tokens are far cheaper than output/completion tokens, and\n"
564+
" cached prompt tokens cheaper still. `fields` saves mostly PROMPT tokens,\n"
565+
" so the COST win is real but smaller than the raw token-count delta. Pass\n"
566+
" your model's real prices via --price-prompt/--price-cached/--price-completion.)"
484567
)
485-
print(
486-
" (Input/prompt tokens are far cheaper than output/completion tokens, and\n"
487-
" cached prompt tokens cheaper still. `fields` saves mostly PROMPT tokens,\n"
488-
" so the COST win is real but smaller than the raw token-count delta. Pass\n"
489-
" your model's real prices via --price-prompt/--price-cached/--price-completion.)"
490-
)
491568

492569
# Where does the benefit live? Mean prompt tokens per task-run, by task type.
493570
tags = sorted({by_key[k]["baseline"]["tag"] for k in valid})
@@ -523,7 +600,7 @@ def task_arm_mean(task: str, arm: str) -> float:
523600

524601
def task_arm_cost(task: str, arm: str) -> float:
525602
ks = per_task[task]
526-
return sum(record_cost(by_key[k][arm]) for k in ks) / len(ks)
603+
return sum(cost_of(by_key[k][arm]) for k in ks) / len(ks)
527604

528605
print("\nPER-TASK (mean prompt tokens per run; Δ% vs S1, negative = cheaper):")
529606
print(f" {'tag':<8}{'S1':>9}{'S3':>9}{'S2':>9}{'S3 Δ%':>8}{'S2 Δ%':>8} task")
@@ -558,25 +635,32 @@ def task_arm_cost(task: str, arm: str) -> float:
558635
" frequently-run ones -- counts equally, better reflecting a typical mix.)"
559636
)
560637

561-
# ---- Cost views (USD) ----------------------------------------------------
562-
# The cumulative ESTIMATED COST table above is dominated by a few heavy tasks.
563-
# These per-task and equal-weight cost views mirror the token views but in
564-
# dollars, so a small frequently-run task carries the same weight as a giant one.
565-
print("\nPER-TASK COST (mean USD per run; Δ% vs S1, negative = cheaper):")
566-
print(f" {'tag':<8}{'S1$':>9}{'S3$':>9}{'S2$':>9}{'S3 Δ%':>8}{'S2 Δ%':>8} task")
638+
# ---- Cost views ----------------------------------------------------------
639+
# The cumulative cost table above is dominated by a few heavy tasks. These
640+
# per-task and equal-weight cost views mirror the token views but in money
641+
# (authoritative AIU/USD when available, else the flat estimate), so a small
642+
# frequently-run task carries the same weight as a giant one.
643+
print(f"\nPER-TASK COST (mean {cost_unit} per run; Δ% vs S1, negative = cheaper):")
644+
print(
645+
f" {'tag':<8}{'S1' + cost_unit:>11}{'S3' + cost_unit:>11}{'S2' + cost_unit:>11}"
646+
f"{'S3 Δ%':>8}{'S2 Δ%':>8} task"
647+
)
567648
for task in sorted(per_task, key=lambda t: task_arm_cost(t, "baseline"), reverse=True):
568649
c1 = task_arm_cost(task, "baseline")
569650
c3 = task_arm_cost(task, "fields_only")
570651
c2 = task_arm_cost(task, "schema_fields")
571652
c3p = 100.0 * (c3 - c1) / c1 if c1 else 0.0
572653
c2p = 100.0 * (c2 - c1) / c1 if c1 else 0.0
573654
print(
574-
f" {task_tag[task]:<8}{c1:>9.4f}{c3:>9.4f}{c2:>9.4f}"
655+
f" {task_tag[task]:<8}{c1:>11.{cdp}f}{c3:>11.{cdp}f}{c2:>11.{cdp}f}"
575656
f"{c3p:>+8.1f}{c2p:>+8.1f} {task[:60]}"
576657
)
577658

578-
print("\nEQUAL-WEIGHT COST ACROSS TASKS (each task counts once, USD):")
579-
print(f" {'scenario':<24}{'mean$/task':>11}{'median$/task':>14}{'mean Δ% vs S1':>15}")
659+
print(f"\nEQUAL-WEIGHT COST ACROSS TASKS (each task counts once, {cost_unit}):")
660+
print(
661+
f" {'scenario':<24}{'mean ' + cost_unit + '/task':>14}"
662+
f"{'median ' + cost_unit + '/task':>16}{'mean Δ% vs S1':>15}"
663+
)
580664
for arm in ARM_ORDER:
581665
per_task_costs = [task_arm_cost(t, arm) for t in per_task]
582666
eq_mean = mean(per_task_costs)
@@ -587,9 +671,9 @@ def task_arm_cost(task: str, arm: str) -> float:
587671
if task_arm_cost(t, "baseline")
588672
]
589673
eq_pct = mean(pct) if pct else 0.0
590-
print(f" {SCENARIO_LABEL[arm]:<24}{eq_mean:>11.4f}{eq_med:>14.4f}{eq_pct:>+15.1f}")
674+
print(f" {SCENARIO_LABEL[arm]:<24}{eq_mean:>14.{cdp}f}{eq_med:>16.{cdp}f}{eq_pct:>+15.1f}")
591675
print(
592-
" (Dollar view of the equal-weight table: every task counts once, so the\n"
676+
" (Cost view of the equal-weight table: every task counts once, so the\n"
593677
" figure reflects a typical task mix rather than the few heaviest tasks.)"
594678
)
595679

@@ -859,6 +943,16 @@ def main() -> int:
859943
help="USD per 1M completion (output) tokens. Default is a Claude-class "
860944
"placeholder (~5x input); pass your model's real output price.",
861945
)
946+
parser.add_argument(
947+
"--aiu-to-usd",
948+
type=float,
949+
default=None,
950+
help="Optional AIU->USD rate. The Copilot endpoint returns the real billed "
951+
"cost in AIU (AI credits); the eval reports that authoritative cost directly. "
952+
"Pass this only if you know your account's credit->USD rate and want the "
953+
"billed-cost tables converted to dollars. Ignored for non-Copilot endpoints, "
954+
"which use the flat --price-* estimate instead.",
955+
)
862956
parser.add_argument(
863957
"--rate-limit-retries",
864958
type=int,
@@ -1001,6 +1095,7 @@ def build_client():
10011095
price_prompt=args.price_prompt,
10021096
price_cached=args.price_cached,
10031097
price_completion=args.price_completion,
1098+
aiu_to_usd=args.aiu_to_usd,
10041099
)
10051100
return 0
10061101

0 commit comments

Comments
 (0)