
Commit e750273

Merge branch 'main' into catch-a-unicodedecode-exception

2 parents 103c23d + 0b2ff7c

File tree: 7 files changed, +468 -34 lines


codeflash/optimization/optimizer.py

Lines changed: 93 additions & 21 deletions
@@ -66,6 +66,7 @@
 from codeflash.result.critic import coverage_critic, performance_gain, quantity_of_tests_critic, speedup_critic
 from codeflash.result.explanation import Explanation
 from codeflash.telemetry.posthog_cf import ph
+from codeflash.verification.bayesian_analysis import compare_function_runtime_distributions
 from codeflash.verification.concolic_testing import generate_concolic_tests
 from codeflash.verification.equivalence import compare_test_results
 from codeflash.verification.parse_test_output import parse_test_results
@@ -77,6 +78,9 @@
 if TYPE_CHECKING:
     from argparse import Namespace

+    import numpy as np
+    import numpy.typing as npt
+
     from codeflash.either import Result
     from codeflash.models.models import CoverageData, FunctionSource, OptimizedCandidate

@@ -352,7 +356,12 @@ def optimize_function(
             cleanup_paths(paths_to_cleanup)
             return Failure(baseline_result.failure())

-        original_code_baseline, test_functions_to_remove = baseline_result.unwrap()
+        (
+            original_code_baseline,
+            original_code_runtime_distribution,
+            original_code_runtime_statistics,
+            test_functions_to_remove,
+        ) = baseline_result.unwrap()
         if isinstance(original_code_baseline, OriginalCodeBaseline) and not coverage_critic(
             original_code_baseline.coverage_results, self.args.test_framework
         ):
@@ -371,6 +380,7 @@ def optimize_function(
                 function_to_optimize=function_to_optimize,
                 original_code=validated_original_code[function_to_optimize.file_path].source_code,
                 original_code_baseline=original_code_baseline,
+                original_code_runtime_distribution=original_code_runtime_distribution,
                 original_helper_code=original_helper_code,
                 function_trace_id=function_trace_id[:-4] + f"EXP{u}" if should_run_experiment else function_trace_id,
             )
@@ -480,11 +490,13 @@ def determine_best_candidate(
         function_to_optimize: FunctionToOptimize,
         original_code: str,
         original_code_baseline: OriginalCodeBaseline,
+        original_code_runtime_distribution: npt.NDArray[np.float64],
         original_helper_code: dict[Path, str],
         function_trace_id: str,
     ) -> BestOptimization | None:
         best_optimization: BestOptimization | None = None
         best_runtime_until_now = original_code_baseline.runtime
+        best_speedup_ratio_until_now = 1.0

         speedup_ratios: dict[str, float | None] = {}
         optimized_runtimes: dict[str, float | None] = {}
@@ -528,7 +540,9 @@ def determine_best_candidate(
                 is_correct[candidate.optimization_id] = False
                 speedup_ratios[candidate.optimization_id] = None
             else:
-                candidate_result: OptimizedCandidateResult = run_results.unwrap()
+                candidate_result, candidate_runtime_distribution, candidate_runtime_statistics = (
+                    run_results.unwrap()
+                )
                 best_test_runtime = candidate_result.best_test_runtime
                 optimized_runtimes[candidate.optimization_id] = best_test_runtime
                 is_correct[candidate.optimization_id] = True
@@ -537,18 +551,23 @@
                 )
                 speedup_ratios[candidate.optimization_id] = perf_gain

-                tree = Tree(f"Candidate #{candidate_index} - Runtime Information")
+                speedup_stats = compare_function_runtime_distributions(
+                    original_code_runtime_distribution, candidate_runtime_distribution
+                )
+
+                tree = Tree(f"Candidate #{candidate_index} - Sum of Minimum Runtimes")
                 if speedup_critic(
                     candidate_result, original_code_baseline.runtime, best_runtime_until_now
                 ) and quantity_of_tests_critic(candidate_result):
                     tree.add("This candidate is faster than the previous best candidate. 🚀")
-                    tree.add(f"Original runtime: {humanize_runtime(original_code_baseline.runtime)}")
+                    tree.add(f"Original summed runtime: {humanize_runtime(original_code_baseline.runtime)}")
                     tree.add(
-                        f"Best test runtime: {humanize_runtime(candidate_result.best_test_runtime)} "
+                        f"Best summed runtime: {humanize_runtime(candidate_result.best_test_runtime)} "
                         f"(measured over {candidate_result.max_loop_count} "
                         f"loop{'s' if candidate_result.max_loop_count > 1 else ''})"
                     )
-                    tree.add(f"Speedup ratio: {perf_gain:.3f}")
+                    tree.add(f"Speedup percentage: {perf_gain * 100:.1f}%")
+                    tree.add(f"Speedup ratio: {perf_gain + 1:.1f}X")

                     best_optimization = BestOptimization(
                         candidate=candidate,
@@ -560,11 +579,35 @@
                     best_runtime_until_now = best_test_runtime
                 else:
                     tree.add(
-                        f"Runtime: {humanize_runtime(best_test_runtime)} "
+                        f"Summed runtime: {humanize_runtime(best_test_runtime)} "
                         f"(measured over {candidate_result.max_loop_count} "
                         f"loop{'s' if candidate_result.max_loop_count > 1 else ''})"
                     )
-                    tree.add(f"Speedup ratio: {perf_gain:.3f}")
+                    tree.add(f"Speedup percentage: {perf_gain * 100:.1f}%")
+                    tree.add(f"Speedup ratio: {perf_gain + 1:.3f}X")
+                console.print(tree)
+                console.rule()
+
+                tree = Tree(f"Candidate #{candidate_index} - Bayesian Bootstrapping Nonparametric Analysis")
+                tree.add(
+                    f"Expected candidate runtime (95% Credible Interval) = ["
+                    f"{humanize_runtime(candidate_runtime_statistics['credible_interval_lower_bound'])}, "
+                    f"{humanize_runtime(candidate_runtime_statistics['credible_interval_upper_bound'])}], "
+                    f"\nmedian = {humanize_runtime(candidate_runtime_statistics['median'])}"
+                    f"\nSpeedup ratio of candidate vs original:"
+                    f"\n95% Credible Interval = [{speedup_stats['credible_interval_lower_bound']:.3f}X, "
+                    f"{speedup_stats['credible_interval_upper_bound']:.3f}X]"
+                    f"\nmedian = {speedup_stats['median']:.3f}X"
+                )
+                if speedup_stats["credible_interval_lower_bound"] > 1.0:
+                    tree.add("The candidate is faster than the original code with a 95% probability.")
+                    if speedup_stats["median"] > best_speedup_ratio_until_now:
+                        best_speedup_ratio_until_now = speedup_stats["median"]
+                        tree.add("This candidate is the best candidate so far.")
+                    else:
+                        tree.add("This candidate is not faster than the current fastest candidate.")
+                else:
+                    tree.add("It is inconclusive whether the candidate is faster than the original code.")
                 console.print(tree)
                 console.rule()

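Note on the two new tree lines above: they report one measurement in two forms. Judging from the format strings, perf_gain is a fractional gain, so the printed ratio equals perf_gain + 1, i.e. original runtime divided by candidate runtime. A minimal sketch with invented timings (perf_gain's formula is inferred from this diff; performance_gain itself is not shown in it):

    # Illustrative values only; perf_gain's definition is assumed from the
    # format strings above (performance_gain's body is not in this diff).
    original_runtime_ns = 100_000_000  # 100 ms, made up
    candidate_runtime_ns = 80_000_000  # 80 ms, made up
    perf_gain = (original_runtime_ns - candidate_runtime_ns) / candidate_runtime_ns
    print(f"Speedup percentage: {perf_gain * 100:.1f}%")  # 25.0%
    print(f"Speedup ratio: {perf_gain + 1:.3f}X")         # 1.250X = original / candidate
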
@@ -941,7 +984,7 @@ def generate_tests_and_optimizations(

     def establish_original_code_baseline(
         self, function_name: str, function_file_path: Path, code_context: CodeOptimizationContext
-    ) -> Result[tuple[OriginalCodeBaseline, list[str]], str]:
+    ) -> Result[tuple[OriginalCodeBaseline, npt.NDArray[np.float64], dict[str, np.float64], list[str]], str]:
        # For the original function - run the tests and get the runtime, plus coverage
        with progress_bar(f"Establishing original code baseline for {function_name}"):
            assert (test_framework := self.args.test_framework) in ["pytest", "unittest"]
@@ -1011,7 +1054,9 @@ def establish_original_code_baseline(
            console.rule()

            total_timing = benchmarking_results.total_passed_runtime()  # caution: doesn't handle the loop index
-
+            runtime_distribution, runtime_statistics = benchmarking_results.bayesian_nonparametric_bootstrap_analysis(
+                100_000
+            )
            functions_to_remove = [
                result.id.test_function_name
                for result in behavioral_results
@@ -1025,7 +1070,9 @@ def establish_original_code_baseline(
                console.rule()
                success = False
            if total_timing == 0:
-                logger.warning("The overall test runtime of the original function is 0, couldn't run tests.")
+                logger.warning(
+                    "The overall summed benchmark runtime of the original function is 0, couldn't run tests."
+                )
                console.rule()
                success = False
            if not total_timing:
@@ -1037,11 +1084,20 @@ def establish_original_code_baseline(

            loop_count = max([int(result.loop_index) for result in benchmarking_results.test_results])
            logger.info(
-                f"Original code runtime measured over {loop_count} loop{'s' if loop_count > 1 else ''}: "
+                f"Original code summed runtime measured over {loop_count} loop{'s' if loop_count > 1 else ''}: "
                f"{humanize_runtime(total_timing)} per full loop"
            )
            console.rule()
-            logger.debug(f"Total original code runtime (ns): {total_timing}")
+            logger.debug(f"Total original code summed runtime (ns): {total_timing}")
+            console.rule()
+            logger.info(
+                f"Bayesian Bootstrapping Nonparametric Analysis"
+                f"\nExpected original code runtime (95% Credible Interval) = ["
+                f"{humanize_runtime(round(runtime_statistics['credible_interval_lower_bound']))}, "
+                f"{humanize_runtime(round(runtime_statistics['credible_interval_upper_bound']))}], "
+                f"\nmedian: {humanize_runtime(round(runtime_statistics['median']))}"
+            )
+
            return Success(
                (
                    OriginalCodeBaseline(
@@ -1050,13 +1106,15 @@ def establish_original_code_baseline(
                        runtime=total_timing,
                        coverage_results=coverage_results,
                    ),
+                    runtime_distribution,
+                    runtime_statistics,
                    functions_to_remove,
                )
            )

    def run_optimized_candidate(
        self, *, optimization_candidate_index: int, baseline_results: OriginalCodeBaseline
-    ) -> Result[OptimizedCandidateResult, str]:
+    ) -> Result[tuple[OptimizedCandidateResult, npt.NDArray[np.float64], dict[str, np.float64]], str]:
        assert (test_framework := self.args.test_framework) in ["pytest", "unittest"]

        with progress_bar("Testing optimization candidate"):
@@ -1138,16 +1196,30 @@ def run_optimized_candidate(
            if (total_candidate_timing := candidate_benchmarking_results.total_passed_runtime()) == 0:
                logger.warning("The overall test runtime of the optimized function is 0, couldn't run tests.")
                console.rule()
+            runtime_distribution, runtime_statistics = (
+                candidate_benchmarking_results.bayesian_nonparametric_bootstrap_analysis(100_000)
+            )

            logger.debug(f"Total optimized code {optimization_candidate_index} runtime (ns): {total_candidate_timing}")
+            console.rule()
+            logger.debug(
+                f"Overall code runtime (95% Credible Interval) = ["
+                f"{humanize_runtime(round(runtime_statistics['credible_interval_lower_bound']))}, "
+                f"{humanize_runtime(round(runtime_statistics['credible_interval_upper_bound']))}], median: "
+                f"{humanize_runtime(round(runtime_statistics['median']))}"
+            )
            return Success(
-                OptimizedCandidateResult(
-                    max_loop_count=loop_count,
-                    best_test_runtime=total_candidate_timing,
-                    behavior_test_results=candidate_behavior_results,
-                    benchmarking_test_results=candidate_benchmarking_results,
-                    optimization_candidate_index=optimization_candidate_index,
-                    total_candidate_timing=total_candidate_timing,
+                (
+                    OptimizedCandidateResult(
+                        max_loop_count=loop_count,
+                        best_test_runtime=total_candidate_timing,
+                        behavior_test_results=candidate_behavior_results,
+                        benchmarking_test_results=candidate_benchmarking_results,
+                        optimization_candidate_index=optimization_candidate_index,
+                        total_candidate_timing=total_candidate_timing,
+                    ),
+                    runtime_distribution,
+                    runtime_statistics,
                )
            )

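Both the baseline and candidate paths now call bayesian_nonparametric_bootstrap_analysis(100_000) on their benchmarking results. That method is defined in one of the changed files not shown on this page; given the new module below, it presumably collects the measured runtimes per test input and hands them to analyze_function_runtime_data. A sketch of the analysis entry point with invented nanosecond timings (the per-input grouping is an assumption):

    from codeflash.verification.bayesian_analysis import analyze_function_runtime_data

    # Two test inputs with repeated measurements each (made-up numbers).
    runtimes_per_input = [
        [105_000, 98_000, 101_000, 99_500],
        [205_000, 198_000, 202_000, 201_000],
    ]
    # 100_000 posterior draws of the mean runtime, plus the median and the
    # 95% credible-interval bounds consumed by the logging code above.
    distribution, statistics = analyze_function_runtime_data(runtimes_per_input, 100_000)
    print(statistics["median"], statistics["credible_interval_lower_bound"])
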
codeflash/verification/bayesian_analysis.py (new file; path inferred from the import in optimizer.py)

Lines changed: 112 additions & 0 deletions

@@ -0,0 +1,112 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numba as nb
+import numpy as np
+
+if TYPE_CHECKING:
+    import numpy.typing as npt
+
+
+@nb.njit(parallel=True, fastmath=True, cache=True)
+def bayesian_bootstrap_runtime_means(
+    runtimes: list[int], rngs: tuple[np.random.Generator, ...], bootstrap_size: int
+) -> npt.NDArray[np.float64]:
+    """Bayesian bootstrap for the mean of the runtimes list.
+
+    Returns an array of shape (bootstrap_size,) with draws from the posterior of the mean.
+    We draw random weights from Dirichlet(1,1,...,1) using the rngs random generators
+    (one per computation thread), and compute the weighted mean.
+    """
+    num_timings = len(runtimes)
+    np_runtimes = np.array(runtimes).astype(np.float64)
+    draws = np.empty(bootstrap_size, dtype=np.float64)
+
+    num_threads = len(rngs)
+    thread_remainder = bootstrap_size % num_threads
+    num_bootstraps_per_thread = np.array([bootstrap_size // num_threads] * num_threads) + np.array(
+        [1] * thread_remainder + [0] * (num_threads - thread_remainder)
+    )
+    thread_idx = [0, *list(np.cumsum(num_bootstraps_per_thread))]
+
+    for thread_id in nb.prange(num_threads):
+        thread_draws = draws[thread_idx[thread_id] : thread_idx[thread_id + 1]]
+        for bootstrap_id in range(num_bootstraps_per_thread[thread_id]):
+            # Dirichlet(1,...,1) is the normalized Gamma(1,1) distribution
+            weights = rngs[thread_id].gamma(1.0, 1.0, size=num_timings)
+            thread_draws[bootstrap_id] = np.dot(np_runtimes, weights / np.sum(weights))
+    return draws
+
+
+def compute_function_runtime_posterior_means(
+    function_runtime_data: list[list[int]], bootstrap_size: int
+) -> list[npt.NDArray[np.float64]]:
+    """For each list of runtimes associated to a function input, do a Bayesian bootstrap to get a posterior of the mean.
+
+    Returns an array of shape (bootstrap_size,) for each function input.
+    """
+    rng = np.random.default_rng()
+    return [
+        bayesian_bootstrap_runtime_means(input_runtime_data, tuple(rng.spawn(nb.get_num_threads())), bootstrap_size)
+        for input_runtime_data in function_runtime_data
+    ]
+
+
+@nb.njit(parallel=True, fastmath=True, cache=True)
+def bootstrap_combined_function_input_runtime_means(
+    posterior_means: list[npt.NDArray[np.float64]], rngs: tuple[np.random.Generator, ...], bootstrap_size: int
+) -> npt.NDArray[np.float64]:
+    """Given a function, we have posterior draws for each input, and get an overall expected time across these inputs.
+
+    We make random draws from each input's distribution using the rngs random generators (one per computation thread),
+    and compute their arithmetic mean.
+    Returns an array of shape (bootstrap_size,).
+    """
+    num_inputs = len(posterior_means)
+    num_input_means = max([len(posterior_mean) for posterior_mean in posterior_means])
+    draws = np.empty(bootstrap_size, dtype=np.float64)
+
+    num_threads = len(rngs)
+    thread_remainder = bootstrap_size % num_threads
+    num_bootstraps_per_thread = np.array([bootstrap_size // num_threads] * num_threads) + np.array(
+        [1] * thread_remainder + [0] * (num_threads - thread_remainder)
+    )
+    thread_idx = [0, *list(np.cumsum(num_bootstraps_per_thread))]
+
+    for thread_id in nb.prange(num_threads):
+        thread_draws = draws[thread_idx[thread_id] : thread_idx[thread_id + 1]]
+        for bootstrap_id in range(num_bootstraps_per_thread[thread_id]):
+            thread_draws[bootstrap_id] = (
+                sum([input_means[rngs[thread_id].integers(0, num_input_means)] for input_means in posterior_means])
+                / num_inputs
+            )
+    return draws
+
+
+def compute_statistics(distribution: npt.NDArray[np.float64], gamma: float = 0.95) -> dict[str, np.float64]:
+    lower_p = (1.0 - gamma) / 2 * 100
+    return {
+        "median": np.median(distribution),
+        "credible_interval_lower_bound": np.percentile(distribution, lower_p),
+        "credible_interval_upper_bound": np.percentile(distribution, 100 - lower_p),
+    }
+
+
+def analyze_function_runtime_data(
+    function_runtime_data: list[list[int]], bootstrap_size: int
+) -> tuple[npt.NDArray[np.float64], dict[str, np.float64]]:
+    rng = np.random.default_rng()
+    function_runtime_distribution = bootstrap_combined_function_input_runtime_means(
+        compute_function_runtime_posterior_means(function_runtime_data, bootstrap_size),
+        tuple(rng.spawn(nb.get_num_threads())),
+        bootstrap_size,
+    )
+    return function_runtime_distribution, compute_statistics(function_runtime_distribution)
+
+
+def compare_function_runtime_distributions(
+    function1_runtime_distribution: npt.NDArray[np.float64], function2_runtime_distribution: npt.NDArray[np.float64]
+) -> dict[str, np.float64]:
+    speedup_distribution = function1_runtime_distribution / function2_runtime_distribution
+    return compute_statistics(speedup_distribution)

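To see how determine_best_candidate consumes these helpers end to end, here is a self-contained sketch comparing two synthetic runtime distributions; the normal draws are stand-ins for real bootstrap output:

    import numpy as np

    from codeflash.verification.bayesian_analysis import compare_function_runtime_distributions

    rng = np.random.default_rng(0)
    # Synthetic posterior draws of mean runtime (ns): original ~100 us, candidate ~80 us.
    original_distribution = rng.normal(100_000, 2_000, size=100_000)
    candidate_distribution = rng.normal(80_000, 2_000, size=100_000)

    # Element-wise original/candidate ratios, summarized as median plus 95% credible interval.
    speedup_stats = compare_function_runtime_distributions(original_distribution, candidate_distribution)
    print(f"Median speedup: {speedup_stats['median']:.3f}X")  # about 1.25X
    if speedup_stats["credible_interval_lower_bound"] > 1.0:
        print("Candidate is faster than the original with high posterior probability.")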