@@ -234,6 +234,9 @@ def run_task(
234234 ]
235235 prompt_tokens = completion_tokens = turns = tool_calls = fields_calls = 0
236236 cached_tokens = 0
237+ cache_write_tokens = 0
238+ billed_aiu_nano = 0.0
239+ saw_billing = False
237240 tool_errors = 0
238241 final_text = ""
239242 error = ""
@@ -264,6 +267,35 @@ def run_task(
264267 if details is not None :
265268 cached_tokens += getattr (details , "cached_tokens" , 0 ) or 0
266269
270+ # AUTHORITATIVE billed cost (Copilot API only). The Copilot endpoint returns
271+ # a vendor `copilot_usage` block alongside the standard usage. The OpenAI SDK
272+ # drops unknown fields from the typed object but preserves them on
273+ # `model_extra`, so we read it there (it lives at the response root, not under
274+ # `usage`). `token_details` carries the real per-type prices the billing
275+ # system uses -- input / cache_read / cache_write / output -- as
276+ # cost_per_batch nano-AIU per batch_size tokens, plus a pre-summed
277+ # `total_nano_aiu`. Using these gives the exact billed cost (no hand-typed
278+ # prices) AND the cache_write bucket that OpenAI-style usage never reports.
279+ copilot_usage = (getattr (resp , "model_extra" , None ) or {}).get ("copilot_usage" )
280+ if isinstance (copilot_usage , dict ):
281+ details_list = copilot_usage .get ("token_details" ) or []
282+ for d in details_list :
283+ if d .get ("token_type" ) == "cache_write" :
284+ cache_write_tokens += d .get ("token_count" , 0 ) or 0
285+ total = copilot_usage .get ("total_nano_aiu" )
286+ if total is not None :
287+ billed_aiu_nano += total
288+ saw_billing = True
289+ elif details_list :
290+ # Fall back to summing per-type cost if the endpoint omits the total.
291+ for d in details_list :
292+ batch = d .get ("batch_size" , 0 ) or 0
293+ if batch :
294+ billed_aiu_nano += (d .get ("token_count" , 0 ) or 0 ) * (
295+ d .get ("cost_per_batch" , 0 ) or 0
296+ ) / batch
297+ saw_billing = True
298+
267299 msg = resp .choices [0 ].message
268300 if not msg .tool_calls :
269301 # A text-only assistant message. Models like Claude often emit a
@@ -342,7 +374,9 @@ def run_task(
342374 return {
343375 "prompt_tokens" : prompt_tokens ,
344376 "cached_prompt_tokens" : cached_tokens ,
377+ "cache_write_tokens" : cache_write_tokens ,
345378 "completion_tokens" : completion_tokens ,
379+ "billed_aiu_nano" : billed_aiu_nano if saw_billing else None ,
346380 "turns" : turns ,
347381 "tool_calls" : tool_calls ,
348382 "fields_calls" : fields_calls ,
@@ -361,14 +395,18 @@ def summarize(
361395 price_prompt : float = 3.0 ,
362396 price_cached : float = 0.30 ,
363397 price_completion : float = 15.0 ,
398+ aiu_to_usd : float | None = None ,
364399) -> None :
365400 """Print a 3-scenario comparison plus a per-task-type breakdown.
366401
367402 A task-run only counts toward the token comparison if ALL three arms
368403 succeeded for it, so every comparison is apples-to-apples.
369404
370405 Token types are billed differently, so we report prompt vs completion
371- separately and apply per-type prices (per 1M tokens) for a true cost view.
406+ separately. When the endpoint is the Copilot API it returns the real per-type
407+ prices on every response (`copilot_usage.token_details`); we use those for an
408+ AUTHORITATIVE billed cost (in AIU, the actual billing unit). For other
409+ endpoints we fall back to the flat per-1M `--price-*` estimate.
372410 """
373411 from collections import defaultdict
374412
@@ -434,10 +472,24 @@ def arm_prompt(arm: str) -> int:
434472 def arm_sum (arm : str , field : str ) -> int :
435473 return sum (by_key [k ][arm ].get (field , 0 ) for k in valid )
436474
437- def record_cost (r : dict ) -> float :
438- # Per-run USD cost from the three token types. OpenAI-style usage reports
439- # cached tokens as a SUBSET of prompt_tokens, so bill the uncached remainder
440- # at the input price and the cached part at the cheaper cache-read price.
475+ # AUTHORITATIVE vs ESTIMATED cost. If ANY valid run carries `billed_aiu_nano`, the
476+ # Copilot API gave us the real per-type billed cost, so we report that (in AIU,
477+ # the actual billing unit -- or USD if a credit rate is supplied); a run lacking
478+ # the field then counts as zero. Otherwise we fall back to the flat per-1M
479+ # `--price-*` estimate for endpoints (GitHub Models, OpenAI) without `copilot_usage`.
480+ billing_mode = any (
481+ by_key [k ][a ].get ("billed_aiu_nano" ) is not None for k in valid for a in ARM_ORDER
482+ )
483+ cost_unit = ("$" if aiu_to_usd else "AIU" ) if billing_mode else "$"
484+ cdp = 4 if cost_unit == "$" else 6 # cost decimal places
485+
486+ def cost_of (r : dict ) -> float :
487+ if billing_mode :
488+ aiu = (r .get ("billed_aiu_nano" ) or 0.0 ) / 1e9 # nano-AIU -> AIU
489+ return aiu * aiu_to_usd if aiu_to_usd else aiu
490+ # Flat-price fallback. OpenAI-style usage reports cached tokens as a SUBSET
491+ # of prompt_tokens, so bill the uncached remainder at the input price and the
492+ # cached part at the cheaper cache-read price.
441493 prompt = r .get ("prompt_tokens" , 0 )
442494 cached = r .get ("cached_prompt_tokens" , 0 )
443495 uncached = max (prompt - cached , 0 )
@@ -447,19 +499,26 @@ def record_cost(r: dict) -> float:
447499 ) / 1_000_000.0
448500
449501 def arm_cost (arm : str ) -> float :
450- return sum (record_cost (by_key [k ][arm ]) for k in valid )
502+ return sum (cost_of (by_key [k ][arm ]) for k in valid )
451503
452504 any_cached = any (arm_sum (a , "cached_prompt_tokens" ) for a in ARM_ORDER )
505+ any_cache_write = any (arm_sum (a , "cache_write_tokens" ) for a in ARM_ORDER )
453506 base_completion = arm_sum ("baseline" , "completion_tokens" )
454507 print ("\n PROMPT vs COMPLETION (cumulative over valid runs):" )
455- print (f" { 'scenario' :<24} { 'prompt' :>11} { 'cached' :>10} { 'completion' :>12} { 'comp Δ% vs S1' :>15} " )
508+ cw_head = f"{ 'cache_wr' :>10} " if any_cache_write else ""
509+ print (
510+ f" { 'scenario' :<24} { 'prompt' :>11} { 'cached' :>10} { cw_head } "
511+ f"{ 'completion' :>12} { 'comp Δ% vs S1' :>15} "
512+ )
456513 for arm in ARM_ORDER :
457514 prompt = arm_sum (arm , "prompt_tokens" )
458515 cached = arm_sum (arm , "cached_prompt_tokens" )
459516 completion = arm_sum (arm , "completion_tokens" )
460517 cpct = (100.0 * (completion - base_completion ) / base_completion ) if base_completion else 0.0
518+ cw_cell = f"{ arm_sum (arm , 'cache_write_tokens' ):>10} " if any_cache_write else ""
461519 print (
462- f" { SCENARIO_LABEL [arm ]:<24} { prompt :>11} { cached :>10} { completion :>12} { cpct :>+15.1f} "
520+ f" { SCENARIO_LABEL [arm ]:<24} { prompt :>11} { cached :>10} { cw_cell } "
521+ f"{ completion :>12} { cpct :>+15.1f} "
463522 )
464523 if not any_cached :
465524 print (
@@ -469,25 +528,43 @@ def arm_cost(arm: str) -> float:
469528 )
470529
471530 base_cost = arm_cost ("baseline" )
472- print (
473- f"\n ESTIMATED COST (USD; prices per 1M tok -- prompt={ price_prompt } , "
474- f"cached={ price_cached } , completion={ price_completion } ):"
475- )
476- print (f" { 'scenario' :<24} { 'cost($)' :>11} { 'Δ vs S1' :>12} { 'Δ%' :>9} " )
531+ if billing_mode :
532+ unit_note = (
533+ f"USD via --aiu-to-usd={ aiu_to_usd } "
534+ if aiu_to_usd
535+ else "AIU = AI credits; pass --aiu-to-usd to convert"
536+ )
537+ print (f"\n BILLED COST (authoritative, from Copilot token_details; { unit_note } ):" )
538+ else :
539+ print (
540+ f"\n ESTIMATED COST (USD; prices per 1M tok -- prompt={ price_prompt } , "
541+ f"cached={ price_cached } , completion={ price_completion } ):"
542+ )
543+ print (f" { 'scenario' :<24} { 'cost(' + cost_unit + ')' :>11} { 'Δ vs S1' :>12} { 'Δ%' :>9} " )
477544 for arm in ARM_ORDER :
478545 cost = arm_cost (arm )
479546 delta = cost - base_cost
480547 cpct = (100.0 * delta / base_cost ) if base_cost else 0.0
481548 sign = "+" if delta >= 0 else ""
482549 print (
483- f" { SCENARIO_LABEL [arm ]:<24} { cost :>11.4f} { sign + f'{ delta :.4f} ' :>12} { sign + f'{ cpct :.1f} ' :>9} "
550+ f" { SCENARIO_LABEL [arm ]:<24} { cost :>11.{cdp }f} "
551+ f"{ sign + f'{ delta :.{cdp }f} ' :>12} { sign + f'{ cpct :.1f} ' :>9} "
552+ )
553+ if billing_mode :
554+ print (
555+ " (Real per-type prices the billing system used -- input, cache_read,\n "
556+ " cache_write, output -- summed straight from each response's\n "
557+ " copilot_usage.token_details. AIU is the native unit (1 AIU = 1e9\n "
558+ " nano-AIU); a credit->USD rate is account-specific, so pass --aiu-to-usd\n "
559+ " only if you know yours.)"
560+ )
561+ else :
562+ print (
563+ " (Input/prompt tokens are far cheaper than output/completion tokens, and\n "
564+ " cached prompt tokens cheaper still. `fields` saves mostly PROMPT tokens,\n "
565+ " so the COST win is real but smaller than the raw token-count delta. Pass\n "
566+ " your model's real prices via --price-prompt/--price-cached/--price-completion.)"
484567 )
485- print (
486- " (Input/prompt tokens are far cheaper than output/completion tokens, and\n "
487- " cached prompt tokens cheaper still. `fields` saves mostly PROMPT tokens,\n "
488- " so the COST win is real but smaller than the raw token-count delta. Pass\n "
489- " your model's real prices via --price-prompt/--price-cached/--price-completion.)"
490- )
491568
492569 # Where does the benefit live? Mean prompt tokens per task-run, by task type.
493570 tags = sorted ({by_key [k ]["baseline" ]["tag" ] for k in valid })
@@ -523,7 +600,7 @@ def task_arm_mean(task: str, arm: str) -> float:
523600
524601 def task_arm_cost (task : str , arm : str ) -> float :
525602 ks = per_task [task ]
526- return sum (record_cost (by_key [k ][arm ]) for k in ks ) / len (ks )
603+ return sum (cost_of (by_key [k ][arm ]) for k in ks ) / len (ks )
527604
528605 print ("\n PER-TASK (mean prompt tokens per run; Δ% vs S1, negative = cheaper):" )
529606 print (f" { 'tag' :<8} { 'S1' :>9} { 'S3' :>9} { 'S2' :>9} { 'S3 Δ%' :>8} { 'S2 Δ%' :>8} task" )
@@ -558,25 +635,32 @@ def task_arm_cost(task: str, arm: str) -> float:
558635 " frequently-run ones -- counts equally, better reflecting a typical mix.)"
559636 )
560637
561- # ---- Cost views (USD) ----------------------------------------------------
562- # The cumulative ESTIMATED COST table above is dominated by a few heavy tasks.
563- # These per-task and equal-weight cost views mirror the token views but in
564- # dollars, so a small frequently-run task carries the same weight as a giant one.
565- print ("\n PER-TASK COST (mean USD per run; Δ% vs S1, negative = cheaper):" )
566- print (f" { 'tag' :<8} { 'S1$' :>9} { 'S3$' :>9} { 'S2$' :>9} { 'S3 Δ%' :>8} { 'S2 Δ%' :>8} task" )
638+ # ---- Cost views ----------------------------------------------------------
639+ # The cumulative cost table above is dominated by a few heavy tasks. These
640+ # per-task and equal-weight cost views mirror the token views but in money
641+ # (authoritative AIU/USD when available, else the flat estimate), so a small
642+ # frequently-run task carries the same weight as a giant one.
643+ print (f"\n PER-TASK COST (mean { cost_unit } per run; Δ% vs S1, negative = cheaper):" )
644+ print (
645+ f" { 'tag' :<8} { 'S1' + cost_unit :>11} { 'S3' + cost_unit :>11} { 'S2' + cost_unit :>11} "
646+ f"{ 'S3 Δ%' :>8} { 'S2 Δ%' :>8} task"
647+ )
567648 for task in sorted (per_task , key = lambda t : task_arm_cost (t , "baseline" ), reverse = True ):
568649 c1 = task_arm_cost (task , "baseline" )
569650 c3 = task_arm_cost (task , "fields_only" )
570651 c2 = task_arm_cost (task , "schema_fields" )
571652 c3p = 100.0 * (c3 - c1 ) / c1 if c1 else 0.0
572653 c2p = 100.0 * (c2 - c1 ) / c1 if c1 else 0.0
573654 print (
574- f" { task_tag [task ]:<8} { c1 :>9.4f } { c3 :>9.4f } { c2 :>9.4f } "
655+ f" { task_tag [task ]:<8} { c1 :>11.{ cdp }f } { c3 :>11.{ cdp }f } { c2 :>11.{ cdp }f } "
575656 f"{ c3p :>+8.1f} { c2p :>+8.1f} { task [:60 ]} "
576657 )
577658
578- print ("\n EQUAL-WEIGHT COST ACROSS TASKS (each task counts once, USD):" )
579- print (f" { 'scenario' :<24} { 'mean$/task' :>11} { 'median$/task' :>14} { 'mean Δ% vs S1' :>15} " )
659+ print (f"\n EQUAL-WEIGHT COST ACROSS TASKS (each task counts once, { cost_unit } ):" )
660+ print (
661+ f" { 'scenario' :<24} { 'mean ' + cost_unit + '/task' :>14} "
662+ f"{ 'median ' + cost_unit + '/task' :>16} { 'mean Δ% vs S1' :>15} "
663+ )
580664 for arm in ARM_ORDER :
581665 per_task_costs = [task_arm_cost (t , arm ) for t in per_task ]
582666 eq_mean = mean (per_task_costs )
@@ -587,9 +671,9 @@ def task_arm_cost(task: str, arm: str) -> float:
587671 if task_arm_cost (t , "baseline" )
588672 ]
589673 eq_pct = mean (pct ) if pct else 0.0
590- print (f" { SCENARIO_LABEL [arm ]:<24} { eq_mean :>11.4f } { eq_med :>14.4f } { eq_pct :>+15.1f} " )
674+ print (f" { SCENARIO_LABEL [arm ]:<24} { eq_mean :>14.{ cdp }f } { eq_med :>16.{ cdp }f } { eq_pct :>+15.1f} " )
591675 print (
592- " (Dollar view of the equal-weight table: every task counts once, so the\n "
676+ " (Cost view of the equal-weight table: every task counts once, so the\n "
593677 " figure reflects a typical task mix rather than the few heaviest tasks.)"
594678 )
595679
@@ -859,6 +943,16 @@ def main() -> int:
859943 help = "USD per 1M completion (output) tokens. Default is a Claude-class "
860944 "placeholder (~5x input); pass your model's real output price." ,
861945 )
946+ parser .add_argument (
947+ "--aiu-to-usd" ,
948+ type = float ,
949+ default = None ,
950+ help = "Optional AIU->USD rate. The Copilot endpoint returns the real billed "
951+ "cost in AIU (AI credits); the eval reports that authoritative cost directly. "
952+ "Pass this only if you know your account's credit->USD rate and want the "
953+ "billed-cost tables converted to dollars. Ignored for non-Copilot endpoints, "
954+ "which use the flat --price-* estimate instead." ,
955+ )
862956 parser .add_argument (
863957 "--rate-limit-retries" ,
864958 type = int ,
@@ -1001,6 +1095,7 @@ def build_client():
10011095 price_prompt = args .price_prompt ,
10021096 price_cached = args .price_cached ,
10031097 price_completion = args .price_completion ,
1098+ aiu_to_usd = args .aiu_to_usd ,
10041099 )
10051100 return 0
10061101
0 commit comments