 from codeflash.result.critic import coverage_critic, performance_gain, quantity_of_tests_critic, speedup_critic
 from codeflash.result.explanation import Explanation
 from codeflash.telemetry.posthog_cf import ph
+from codeflash.verification.bayesian_analysis import compare_function_runtime_distributions
 from codeflash.verification.concolic_testing import generate_concolic_tests
 from codeflash.verification.equivalence import compare_test_results
 from codeflash.verification.parse_test_output import parse_test_results
@@ -77,6 +78,9 @@
 if TYPE_CHECKING:
     from argparse import Namespace
 
+    import numpy as np
+    import numpy.typing as npt
+
     from codeflash.either import Result
     from codeflash.models.models import CoverageData, FunctionSource, OptimizedCandidate
 
@@ -352,7 +356,12 @@ def optimize_function(
             cleanup_paths(paths_to_cleanup)
             return Failure(baseline_result.failure())
 
-        original_code_baseline, test_functions_to_remove = baseline_result.unwrap()
+        (
+            original_code_baseline,
+            original_code_runtime_distribution,
+            original_code_runtime_statistics,
+            test_functions_to_remove,
+        ) = baseline_result.unwrap()
         if isinstance(original_code_baseline, OriginalCodeBaseline) and not coverage_critic(
             original_code_baseline.coverage_results, self.args.test_framework
         ):
@@ -371,6 +380,7 @@ def optimize_function(
                 function_to_optimize=function_to_optimize,
                 original_code=validated_original_code[function_to_optimize.file_path].source_code,
                 original_code_baseline=original_code_baseline,
+                original_code_runtime_distribution=original_code_runtime_distribution,
                 original_helper_code=original_helper_code,
                 function_trace_id=function_trace_id[:-4] + f"EXP{u}" if should_run_experiment else function_trace_id,
             )
@@ -480,11 +490,13 @@ def determine_best_candidate(
         function_to_optimize: FunctionToOptimize,
         original_code: str,
         original_code_baseline: OriginalCodeBaseline,
+        original_code_runtime_distribution: npt.NDArray[np.float64],
         original_helper_code: dict[Path, str],
         function_trace_id: str,
     ) -> BestOptimization | None:
         best_optimization: BestOptimization | None = None
         best_runtime_until_now = original_code_baseline.runtime
+        best_speedup_ratio_until_now = 1.0
 
         speedup_ratios: dict[str, float | None] = {}
         optimized_runtimes: dict[str, float | None] = {}
@@ -528,7 +540,9 @@ def determine_best_candidate(
                 is_correct[candidate.optimization_id] = False
                 speedup_ratios[candidate.optimization_id] = None
             else:
-                candidate_result: OptimizedCandidateResult = run_results.unwrap()
+                candidate_result, candidate_runtime_distribution, candidate_runtime_statistics = (
+                    run_results.unwrap()
+                )
                 best_test_runtime = candidate_result.best_test_runtime
                 optimized_runtimes[candidate.optimization_id] = best_test_runtime
                 is_correct[candidate.optimization_id] = True
@@ -537,18 +551,23 @@ def determine_best_candidate(
                 )
                 speedup_ratios[candidate.optimization_id] = perf_gain
 
-                tree = Tree(f"Candidate #{candidate_index} - Runtime Information")
+                speedup_stats = compare_function_runtime_distributions(
+                    original_code_runtime_distribution, candidate_runtime_distribution
+                )
+
+                tree = Tree(f"Candidate #{candidate_index} - Sum of Minimum Runtimes")
                 if speedup_critic(
                     candidate_result, original_code_baseline.runtime, best_runtime_until_now
                 ) and quantity_of_tests_critic(candidate_result):
                     tree.add("This candidate is faster than the previous best candidate. 🚀")
-                    tree.add(f"Original runtime: {humanize_runtime(original_code_baseline.runtime)}")
+                    tree.add(f"Original summed runtime: {humanize_runtime(original_code_baseline.runtime)}")
                     tree.add(
-                        f"Best test runtime: {humanize_runtime(candidate_result.best_test_runtime)} "
+                        f"Best summed runtime: {humanize_runtime(candidate_result.best_test_runtime)} "
                         f"(measured over {candidate_result.max_loop_count} "
                         f"loop{'s' if candidate_result.max_loop_count > 1 else ''})"
                     )
-                    tree.add(f"Speedup ratio: {perf_gain:.3f}")
+                    tree.add(f"Speedup percentage: {perf_gain * 100:.1f}%")
+                    tree.add(f"Speedup ratio: {perf_gain + 1:.1f}X")
 
                     best_optimization = BestOptimization(
                         candidate=candidate,
@@ -560,11 +579,35 @@ def determine_best_candidate(
                     best_runtime_until_now = best_test_runtime
                 else:
                     tree.add(
-                        f"Runtime: {humanize_runtime(best_test_runtime)} "
+                        f"Summed runtime: {humanize_runtime(best_test_runtime)} "
                         f"(measured over {candidate_result.max_loop_count} "
                         f"loop{'s' if candidate_result.max_loop_count > 1 else ''})"
                     )
-                    tree.add(f"Speedup ratio: {perf_gain:.3f}")
+                    tree.add(f"Speedup percentage: {perf_gain * 100:.1f}%")
+                    tree.add(f"Speedup ratio: {perf_gain + 1:.3f}X")
+                console.print(tree)
+                console.rule()
+
+                tree = Tree(f"Candidate #{candidate_index} - Bayesian Bootstrapping Nonparametric Analysis")
+                tree.add(
+                    f"Expected candidate runtime (95% Credible Interval) = ["
+                    f"{humanize_runtime(candidate_runtime_statistics['credible_interval_lower_bound'])}, "
+                    f"{humanize_runtime(candidate_runtime_statistics['credible_interval_upper_bound'])}], "
+                    f"\nmedian = {humanize_runtime(candidate_runtime_statistics['median'])}"
+                    f"\nSpeedup ratio of candidate vs original:"
+                    f"\n95% Credible Interval = [{speedup_stats['credible_interval_lower_bound']:.3f}X, "
+                    f"{speedup_stats['credible_interval_upper_bound']:.3f}X]"
+                    f"\nmedian = {speedup_stats['median']:.3f}X"
+                )
+                if speedup_stats["credible_interval_lower_bound"] > 1.0:
+                    tree.add("The candidate is faster than the original code with a 95% probability.")
+                    if speedup_stats["median"] > best_speedup_ratio_until_now:
+                        best_speedup_ratio_until_now = speedup_stats["median"]
+                        tree.add("This candidate is the best candidate so far.")
+                    else:
+                        tree.add("This candidate is not faster than the current fastest candidate.")
+                else:
+                    tree.add("It is inconclusive whether the candidate is faster than the original code.")
                 console.print(tree)
                 console.rule()
 
@@ -941,7 +984,7 @@ def generate_tests_and_optimizations(
 
     def establish_original_code_baseline(
         self, function_name: str, function_file_path: Path, code_context: CodeOptimizationContext
-    ) -> Result[tuple[OriginalCodeBaseline, list[str]], str]:
+    ) -> Result[tuple[OriginalCodeBaseline, npt.NDArray[np.float64], dict[str, np.float64], list[str]], str]:
         # For the original function - run the tests and get the runtime, plus coverage
         with progress_bar(f"Establishing original code baseline for {function_name}"):
             assert (test_framework := self.args.test_framework) in ["pytest", "unittest"]
@@ -1011,7 +1054,9 @@ def establish_original_code_baseline(
             console.rule()
 
             total_timing = benchmarking_results.total_passed_runtime()  # caution: doesn't handle the loop index
-
+            runtime_distribution, runtime_statistics = benchmarking_results.bayesian_nonparametric_bootstrap_analysis(
+                100_000
+            )
             functions_to_remove = [
                 result.id.test_function_name
                 for result in behavioral_results
@@ -1025,7 +1070,9 @@ def establish_original_code_baseline(
                 console.rule()
                 success = False
             if total_timing == 0:
-                logger.warning("The overall test runtime of the original function is 0, couldn't run tests.")
+                logger.warning(
+                    "The overall summed benchmark runtime of the original function is 0, couldn't run tests."
+                )
                 console.rule()
                 success = False
             if not total_timing:
@@ -1037,11 +1084,20 @@ def establish_original_code_baseline(
 
             loop_count = max([int(result.loop_index) for result in benchmarking_results.test_results])
             logger.info(
-                f"Original code runtime measured over {loop_count} loop{'s' if loop_count > 1 else ''}: "
+                f"Original code summed runtime measured over {loop_count} loop{'s' if loop_count > 1 else ''}: "
                 f"{humanize_runtime(total_timing)} per full loop"
             )
             console.rule()
-            logger.debug(f"Total original code runtime (ns): {total_timing}")
+            logger.debug(f"Total original code summed runtime (ns): {total_timing}")
+            console.rule()
+            logger.info(
+                f"Bayesian Bootstrapping Nonparametric Analysis"
+                f"\nExpected original code runtime (95% Credible Interval) = ["
+                f"{humanize_runtime(round(runtime_statistics['credible_interval_lower_bound']))}, "
+                f"{humanize_runtime(round(runtime_statistics['credible_interval_upper_bound']))}], "
+                f"\nmedian: {humanize_runtime(round(runtime_statistics['median']))}"
+            )
+
             return Success(
                 (
                     OriginalCodeBaseline(
@@ -1050,13 +1106,15 @@ def establish_original_code_baseline(
                        runtime=total_timing,
                        coverage_results=coverage_results,
                    ),
+                    runtime_distribution,
+                    runtime_statistics,
                     functions_to_remove,
                 )
             )
 
     def run_optimized_candidate(
         self, *, optimization_candidate_index: int, baseline_results: OriginalCodeBaseline
-    ) -> Result[OptimizedCandidateResult, str]:
+    ) -> Result[tuple[OptimizedCandidateResult, npt.NDArray[np.float64], dict[str, np.float64]], str]:
         assert (test_framework := self.args.test_framework) in ["pytest", "unittest"]
 
         with progress_bar("Testing optimization candidate"):
@@ -1138,16 +1196,30 @@ def run_optimized_candidate(
             if (total_candidate_timing := candidate_benchmarking_results.total_passed_runtime()) == 0:
                 logger.warning("The overall test runtime of the optimized function is 0, couldn't run tests.")
                 console.rule()
+            runtime_distribution, runtime_statistics = (
+                candidate_benchmarking_results.bayesian_nonparametric_bootstrap_analysis(100_000)
+            )
 
             logger.debug(f"Total optimized code {optimization_candidate_index} runtime (ns): {total_candidate_timing}")
+            console.rule()
+            logger.debug(
+                f"Overall code runtime (95% Credible Interval) = ["
+                f"{humanize_runtime(round(runtime_statistics['credible_interval_lower_bound']))}, "
+                f"{humanize_runtime(round(runtime_statistics['credible_interval_upper_bound']))}], median: "
+                f"{humanize_runtime(round(runtime_statistics['median']))}"
+            )
             return Success(
-                OptimizedCandidateResult(
-                    max_loop_count=loop_count,
-                    best_test_runtime=total_candidate_timing,
-                    behavior_test_results=candidate_behavior_results,
-                    benchmarking_test_results=candidate_benchmarking_results,
-                    optimization_candidate_index=optimization_candidate_index,
-                    total_candidate_timing=total_candidate_timing,
+                (
+                    OptimizedCandidateResult(
+                        max_loop_count=loop_count,
+                        best_test_runtime=total_candidate_timing,
+                        behavior_test_results=candidate_behavior_results,
+                        benchmarking_test_results=candidate_benchmarking_results,
+                        optimization_candidate_index=optimization_candidate_index,
+                        total_candidate_timing=total_candidate_timing,
+                    ),
+                    runtime_distribution,
+                    runtime_statistics,
                 )
             )
 
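A note on the reporting in the "Sum of Minimum Runtimes" tree: `performance_gain` evidently returns a fractional gain, i.e. (original − candidate) / candidate, which is why the diff prints `perf_gain * 100` as a percentage and `perf_gain + 1` as the X ratio (a gain of 0.5 renders as "50.0%" and "1.5X").

The `bayesian_nonparametric_bootstrap_analysis` method and `compare_function_runtime_distributions` are added in `codeflash.verification.bayesian_analysis`, whose body is not part of this diff. For orientation only, here is a minimal sketch of how such a routine could produce the `runtime_distribution` array and the `credible_interval_lower_bound` / `credible_interval_upper_bound` / `median` statistics consumed above, using the classic Bayesian bootstrap (Rubin, 1981). The Dirichlet weighting, the 2.5/97.5 percentile bounds, and the scaling back to a summed runtime are assumptions, not the commit's actual implementation.

import numpy as np
import numpy.typing as npt


def bayesian_nonparametric_bootstrap_analysis(
    timings_ns: npt.NDArray[np.float64], n_samples: int = 100_000
) -> tuple[npt.NDArray[np.float64], dict[str, np.float64]]:
    """Sketch (assumed, not the commit's code) of a Bayesian bootstrap
    over observed per-test runtimes in nanoseconds.

    Rather than resampling with replacement, draw Dirichlet(1, ..., 1)
    weights over the observations; each weighted total is one posterior
    sample of the expected summed runtime.
    """
    rng = np.random.default_rng()
    n = timings_ns.shape[0]
    # One uniform-prior Dirichlet weight vector per posterior sample: (n_samples, n).
    weights = rng.dirichlet(np.ones(n), size=n_samples)
    # Weighted means scaled by n give posterior samples of the summed runtime.
    distribution = (weights @ timings_ns) * n
    statistics = {
        "credible_interval_lower_bound": np.percentile(distribution, 2.5),
        "credible_interval_upper_bound": np.percentile(distribution, 97.5),
        "median": np.median(distribution),
    }
    return distribution, statistics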
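Similarly, a plausible sketch of `compare_function_runtime_distributions`, which turns the original and candidate posterior runtime distributions into the `speedup_stats` read in `determine_best_candidate`. Treating the element-wise ratio of independently shuffled posterior samples as the speedup posterior is an assumption about the implementation.

import numpy as np
import numpy.typing as npt


def compare_function_runtime_distributions(
    original: npt.NDArray[np.float64], candidate: npt.NDArray[np.float64]
) -> dict[str, np.float64]:
    """Sketch (assumed) of the posterior of the speedup ratio,
    original runtime / candidate runtime.

    The two posteriors are independent, so pair their samples at
    random before taking the ratio.
    """
    rng = np.random.default_rng()
    size = min(original.shape[0], candidate.shape[0])
    ratio = rng.permutation(original)[:size] / rng.permutation(candidate)[:size]
    return {
        "credible_interval_lower_bound": np.percentile(ratio, 2.5),
        "credible_interval_upper_bound": np.percentile(ratio, 97.5),
        "median": np.median(ratio),
    }

Under this reading, a lower credible bound above 1.0 is exactly what backs the "faster than the original code with a 95% probability" message in the candidate tree, and the posterior median is what the diff compares against best_speedup_ratio_until_now to pick the best candidate.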