 from codeflash.result.critic import coverage_critic, performance_gain, quantity_of_tests_critic, speedup_critic
 from codeflash.result.explanation import Explanation
 from codeflash.telemetry.posthog_cf import ph
+from codeflash.verification.bayesian_analysis import compare_function_runtime_distributions
 from codeflash.verification.concolic_testing import generate_concolic_tests
 from codeflash.verification.equivalence import compare_test_results
 from codeflash.verification.parse_test_output import parse_test_results
@@ -77,6 +78,9 @@
 if TYPE_CHECKING:
     from argparse import Namespace
 
+    import numpy as np
+    import numpy.typing as npt
+
     from codeflash.either import Result
     from codeflash.models.models import CoverageData, FunctionSource, OptimizedCandidate
 
@@ -352,7 +356,12 @@ def optimize_function(
             cleanup_paths(paths_to_cleanup)
             return Failure(baseline_result.failure())
 
-        original_code_baseline, test_functions_to_remove = baseline_result.unwrap()
+        (
+            original_code_baseline,
+            original_code_runtime_distribution,
+            original_code_runtime_statistics,
+            test_functions_to_remove,
+        ) = baseline_result.unwrap()
         if isinstance(original_code_baseline, OriginalCodeBaseline) and not coverage_critic(
             original_code_baseline.coverage_results, self.args.test_framework
         ):
@@ -371,6 +380,7 @@ def optimize_function(
                 function_to_optimize=function_to_optimize,
                 original_code=validated_original_code[function_to_optimize.file_path].source_code,
                 original_code_baseline=original_code_baseline,
+                original_code_runtime_distribution=original_code_runtime_distribution,
                 original_helper_code=original_helper_code,
                 function_trace_id=function_trace_id[:-4] + f"EXP{u}" if should_run_experiment else function_trace_id,
             )
@@ -480,11 +490,13 @@ def determine_best_candidate(
         function_to_optimize: FunctionToOptimize,
         original_code: str,
         original_code_baseline: OriginalCodeBaseline,
+        original_code_runtime_distribution: npt.NDArray[np.float64],
         original_helper_code: dict[Path, str],
         function_trace_id: str,
     ) -> BestOptimization | None:
         best_optimization: BestOptimization | None = None
         best_runtime_until_now = original_code_baseline.runtime
+        best_speedup_ratio_until_now = 1.0
 
         speedup_ratios: dict[str, float | None] = {}
         optimized_runtimes: dict[str, float | None] = {}
@@ -528,7 +540,9 @@ def determine_best_candidate(
                 is_correct[candidate.optimization_id] = False
                 speedup_ratios[candidate.optimization_id] = None
             else:
-                candidate_result: OptimizedCandidateResult = run_results.unwrap()
+                candidate_result, candidate_runtime_distribution, candidate_runtime_statistics = (
+                    run_results.unwrap()
+                )
                 best_test_runtime = candidate_result.best_test_runtime
                 optimized_runtimes[candidate.optimization_id] = best_test_runtime
                 is_correct[candidate.optimization_id] = True
@@ -537,18 +551,23 @@ def determine_best_candidate(
                 )
                 speedup_ratios[candidate.optimization_id] = perf_gain
 
-                tree = Tree(f"Candidate #{candidate_index} - Runtime Information")
+                speedup_stats = compare_function_runtime_distributions(
+                    original_code_runtime_distribution, candidate_runtime_distribution
+                )
+
+                tree = Tree(f"Candidate #{candidate_index} - Sum of Minimum Runtimes")
                 if speedup_critic(
                     candidate_result, original_code_baseline.runtime, best_runtime_until_now
                 ) and quantity_of_tests_critic(candidate_result):
                     tree.add("This candidate is faster than the previous best candidate. 🚀")
-                    tree.add(f"Original runtime: {humanize_runtime(original_code_baseline.runtime)}")
+                    tree.add(f"Original summed runtime: {humanize_runtime(original_code_baseline.runtime)}")
                     tree.add(
-                        f"Best test runtime: {humanize_runtime(candidate_result.best_test_runtime)} "
+                        f"Best summed runtime: {humanize_runtime(candidate_result.best_test_runtime)} "
                         f"(measured over {candidate_result.max_loop_count} "
                         f"loop{'s' if candidate_result.max_loop_count > 1 else ''})"
                     )
-                    tree.add(f"Speedup ratio: {perf_gain:.3f}")
+                    tree.add(f"Speedup percentage: {perf_gain * 100:.1f}%")
+                    tree.add(f"Speedup ratio: {perf_gain + 1:.1f}X")
 
                     best_optimization = BestOptimization(
                         candidate=candidate,
@@ -560,11 +579,35 @@ def determine_best_candidate(
                     best_runtime_until_now = best_test_runtime
                 else:
                     tree.add(
-                        f"Runtime: {humanize_runtime(best_test_runtime)} "
+                        f"Summed runtime: {humanize_runtime(best_test_runtime)} "
                         f"(measured over {candidate_result.max_loop_count} "
                         f"loop{'s' if candidate_result.max_loop_count > 1 else ''})"
                     )
-                    tree.add(f"Speedup ratio: {perf_gain:.3f}")
+                    tree.add(f"Speedup percentage: {perf_gain * 100:.1f}%")
+                    tree.add(f"Speedup ratio: {perf_gain + 1:.3f}X")
+                console.print(tree)
+                console.rule()
+
+                tree = Tree(f"Candidate #{candidate_index} - Bayesian Bootstrapping Nonparametric Analysis")
+                tree.add(
+                    f"Expected candidate runtime (95% Credible Interval) = ["
+                    f"{humanize_runtime(candidate_runtime_statistics['credible_interval_lower_bound'])}, "
+                    f"{humanize_runtime(candidate_runtime_statistics['credible_interval_upper_bound'])}], "
+                    f"\nmedian = {humanize_runtime(candidate_runtime_statistics['median'])}"
+                    f"\nSpeedup ratio of candidate vs original:"
+                    f"\n95% Credible Interval = [{speedup_stats['credible_interval_lower_bound']:.3f}X, "
+                    f"{speedup_stats['credible_interval_upper_bound']:.3f}X]"
+                    f"\nmedian = {speedup_stats['median']:.3f}X"
+                )
+                if speedup_stats["credible_interval_lower_bound"] > 1.0:
+                    tree.add("The candidate is faster than the original code with a 95% probability.")
+                    if speedup_stats["median"] > best_speedup_ratio_until_now:
+                        best_speedup_ratio_until_now = speedup_stats["median"]
+                        tree.add("This candidate is the best candidate so far.")
+                    else:
+                        tree.add("This candidate is not faster than the current fastest candidate.")
+                else:
+                    tree.add("It is inconclusive whether the candidate is faster than the original code.")
                 console.print(tree)
                 console.rule()
 
@@ -941,7 +984,7 @@ def generate_tests_and_optimizations(
 
     def establish_original_code_baseline(
         self, function_name: str, function_file_path: Path, code_context: CodeOptimizationContext
-    ) -> Result[tuple[OriginalCodeBaseline, list[str]], str]:
+    ) -> Result[tuple[OriginalCodeBaseline, npt.NDArray[np.float64], dict[str, np.float64], list[str]], str]:
         # For the original function - run the tests and get the runtime, plus coverage
         with progress_bar(f"Establishing original code baseline for {function_name}"):
             assert (test_framework := self.args.test_framework) in ["pytest", "unittest"]
@@ -1011,7 +1054,9 @@ def establish_original_code_baseline(
             console.rule()
 
             total_timing = benchmarking_results.total_passed_runtime()  # caution: doesn't handle the loop index
-
+            runtime_distribution, runtime_statistics = benchmarking_results.bayesian_nonparametric_bootstrap_analysis(
+                100_000
+            )
             functions_to_remove = [
                 result.id.test_function_name
                 for result in behavioral_results
@@ -1025,7 +1070,9 @@ def establish_original_code_baseline(
                 console.rule()
                 success = False
             if total_timing == 0:
-                logger.warning("The overall test runtime of the original function is 0, couldn't run tests.")
+                logger.warning(
+                    "The overall summed benchmark runtime of the original function is 0, couldn't run tests."
+                )
                 console.rule()
                 success = False
             if not total_timing:
@@ -1037,11 +1084,20 @@ def establish_original_code_baseline(
 
             loop_count = max([int(result.loop_index) for result in benchmarking_results.test_results])
             logger.info(
-                f"Original code runtime measured over {loop_count} loop{'s' if loop_count > 1 else ''}: "
+                f"Original code summed runtime measured over {loop_count} loop{'s' if loop_count > 1 else ''}: "
                 f"{humanize_runtime(total_timing)} per full loop"
             )
             console.rule()
-            logger.debug(f"Total original code runtime (ns): {total_timing}")
+            logger.debug(f"Total original code summed runtime (ns): {total_timing}")
+            console.rule()
+            logger.info(
+                f"Bayesian Bootstrapping Nonparametric Analysis"
+                f"\nExpected original code runtime (95% Credible Interval) = ["
+                f"{humanize_runtime(round(runtime_statistics['credible_interval_lower_bound']))}, "
+                f"{humanize_runtime(round(runtime_statistics['credible_interval_upper_bound']))}], "
+                f"\nmedian: {humanize_runtime(round(runtime_statistics['median']))}"
+            )
+
             return Success(
                 (
                     OriginalCodeBaseline(
@@ -1050,13 +1106,15 @@ def establish_original_code_baseline(
                        runtime=total_timing,
                        coverage_results=coverage_results,
                    ),
+                    runtime_distribution,
+                    runtime_statistics,
                     functions_to_remove,
                 )
             )
 
     def run_optimized_candidate(
         self, *, optimization_candidate_index: int, baseline_results: OriginalCodeBaseline
-    ) -> Result[OptimizedCandidateResult, str]:
+    ) -> Result[tuple[OptimizedCandidateResult, npt.NDArray[np.float64], dict[str, np.float64]], str]:
         assert (test_framework := self.args.test_framework) in ["pytest", "unittest"]
 
         with progress_bar("Testing optimization candidate"):
@@ -1138,16 +1196,30 @@ def run_optimized_candidate(
             if (total_candidate_timing := candidate_benchmarking_results.total_passed_runtime()) == 0:
                 logger.warning("The overall test runtime of the optimized function is 0, couldn't run tests.")
                 console.rule()
+            runtime_distribution, runtime_statistics = (
+                candidate_benchmarking_results.bayesian_nonparametric_bootstrap_analysis(100_000)
+            )
 
             logger.debug(f"Total optimized code {optimization_candidate_index} runtime (ns): {total_candidate_timing}")
+            console.rule()
+            logger.debug(
+                f"Overall code runtime (95% Credible Interval) = ["
+                f"{humanize_runtime(round(runtime_statistics['credible_interval_lower_bound']))}, "
+                f"{humanize_runtime(round(runtime_statistics['credible_interval_upper_bound']))}], median: "
+                f"{humanize_runtime(round(runtime_statistics['median']))}"
+            )
             return Success(
-                OptimizedCandidateResult(
-                    max_loop_count=loop_count,
-                    best_test_runtime=total_candidate_timing,
-                    behavior_test_results=candidate_behavior_results,
-                    benchmarking_test_results=candidate_benchmarking_results,
-                    optimization_candidate_index=optimization_candidate_index,
-                    total_candidate_timing=total_candidate_timing,
+                (
+                    OptimizedCandidateResult(
+                        max_loop_count=loop_count,
+                        best_test_runtime=total_candidate_timing,
+                        behavior_test_results=candidate_behavior_results,
+                        benchmarking_test_results=candidate_benchmarking_results,
+                        optimization_candidate_index=optimization_candidate_index,
+                        total_candidate_timing=total_candidate_timing,
+                    ),
+                    runtime_distribution,
+                    runtime_statistics,
                 )
             )
 
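A note on the reporting in the "Sum of Minimum Runtimes" tree: `performance_gain` evidently returns a fractional gain, i.e. (original − candidate) / candidate, which is why the diff prints `perf_gain * 100` as a percentage and `perf_gain + 1` as the X ratio (a gain of 0.5 renders as "50.0%" and "1.5X").

The `bayesian_nonparametric_bootstrap_analysis` method and `compare_function_runtime_distributions` are added in `codeflash.verification.bayesian_analysis`, whose body is not part of this diff. For orientation only, here is a minimal sketch of how such a routine could produce the `runtime_distribution` array and the `credible_interval_lower_bound` / `credible_interval_upper_bound` / `median` statistics consumed above, using the classic Bayesian bootstrap (Rubin, 1981). The Dirichlet weighting, the 2.5/97.5 percentile bounds, and the scaling back to a summed runtime are assumptions, not the commit's actual implementation.

import numpy as np
import numpy.typing as npt


def bayesian_nonparametric_bootstrap_analysis(
    timings_ns: npt.NDArray[np.float64], n_samples: int = 100_000
) -> tuple[npt.NDArray[np.float64], dict[str, np.float64]]:
    """Sketch (assumed, not the commit's code) of a Bayesian bootstrap
    over observed per-test runtimes in nanoseconds.

    Rather than resampling with replacement, draw Dirichlet(1, ..., 1)
    weights over the observations; each weighted total is one posterior
    sample of the expected summed runtime.
    """
    rng = np.random.default_rng()
    n = timings_ns.shape[0]
    # One uniform-prior Dirichlet weight vector per posterior sample: (n_samples, n).
    weights = rng.dirichlet(np.ones(n), size=n_samples)
    # Weighted means scaled by n give posterior samples of the summed runtime.
    distribution = (weights @ timings_ns) * n
    statistics = {
        "credible_interval_lower_bound": np.percentile(distribution, 2.5),
        "credible_interval_upper_bound": np.percentile(distribution, 97.5),
        "median": np.median(distribution),
    }
    return distribution, statistics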
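Similarly, a plausible sketch of `compare_function_runtime_distributions`, which turns the original and candidate posterior runtime distributions into the `speedup_stats` read in `determine_best_candidate`. Treating the element-wise ratio of independently shuffled posterior samples as the speedup posterior is an assumption about the implementation.

import numpy as np
import numpy.typing as npt


def compare_function_runtime_distributions(
    original: npt.NDArray[np.float64], candidate: npt.NDArray[np.float64]
) -> dict[str, np.float64]:
    """Sketch (assumed) of the posterior of the speedup ratio,
    original runtime / candidate runtime.

    The two posteriors are independent, so pair their samples at
    random before taking the ratio.
    """
    rng = np.random.default_rng()
    size = min(original.shape[0], candidate.shape[0])
    ratio = rng.permutation(original)[:size] / rng.permutation(candidate)[:size]
    return {
        "credible_interval_lower_bound": np.percentile(ratio, 2.5),
        "credible_interval_upper_bound": np.percentile(ratio, 97.5),
        "median": np.median(ratio),
    }

Under this reading, a lower credible bound above 1.0 is exactly what backs the "faster than the original code with a 95% probability" message in the candidate tree, and the posterior median is what the diff compares against best_speedup_ratio_until_now to pick the best candidate.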