Skip to content

Commit d165a15

Browse files
committed
Merge remote-tracking branch 'origin/main' into feat/feedback-loop-for-unmatched-test-results
2 parents 46522d8 + 872ec28 commit d165a15

File tree

5 files changed

+132
-76
lines changed

5 files changed

+132
-76
lines changed

codeflash/api/aiservice.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -259,20 +259,18 @@ def optimize_python_code_refinement(self, request: list[AIServiceRefinerRequest]
259259
"original_source_code": opt.original_source_code,
260260
"read_only_dependency_code": opt.read_only_dependency_code,
261261
"original_line_profiler_results": opt.original_line_profiler_results,
262-
"original_code_runtime": opt.original_code_runtime,
262+
"original_code_runtime": humanize_runtime(opt.original_code_runtime),
263263
"optimized_source_code": opt.optimized_source_code,
264264
"optimized_explanation": opt.optimized_explanation,
265265
"optimized_line_profiler_results": opt.optimized_line_profiler_results,
266-
"optimized_code_runtime": opt.optimized_code_runtime,
266+
"optimized_code_runtime": humanize_runtime(opt.optimized_code_runtime),
267267
"speedup": opt.speedup,
268268
"trace_id": opt.trace_id,
269269
"function_references": opt.function_references,
270270
"python_version": platform.python_version(),
271271
}
272272
for opt in request
273273
]
274-
logger.debug(f"Refining {len(request)} optimizations…")
275-
console.rule()
276274
try:
277275
response = self.make_ai_service_request("/refinement", payload=payload, timeout=120)
278276
except requests.exceptions.RequestException as e:
@@ -282,8 +280,6 @@ def optimize_python_code_refinement(self, request: list[AIServiceRefinerRequest]
282280

283281
if response.status_code == 200:
284282
refined_optimizations = response.json()["refinements"]
285-
logger.debug(f"Generated {len(refined_optimizations)} candidate refinements.")
286-
console.rule()
287283

288284
return self._get_valid_candidates(refined_optimizations, OptimizedCandidateSource.REFINE)
289285

codeflash/code_utils/code_utils.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,63 @@ def unified_diff_strings(code1: str, code2: str, fromfile: str = "original", tof
4545
return "".join(diff)
4646

4747

48+
def choose_weights(**importance: float) -> list[float]:
49+
"""Choose normalized weights from relative importance values.
50+
51+
Example:
52+
choose_weights(runtime=3, diff=1)
53+
-> [0.75, 0.25]
54+
55+
Args:
56+
**importance: keyword args of metric=importance (relative numbers).
57+
58+
Returns:
59+
A list of weights in the same order as the arguments.
60+
61+
"""
62+
total = sum(importance.values())
63+
if total == 0:
64+
raise ValueError("At least one importance value must be > 0")
65+
66+
return [v / total for v in importance.values()]
67+
68+
69+
def normalize_by_max(values: list[float]) -> list[float]:
70+
mx = max(values)
71+
if mx == 0:
72+
return [0.0] * len(values)
73+
return [v / mx for v in values]
74+
75+
76+
def create_score_dictionary_from_metrics(weights: list[float], *metrics: list[float]) -> dict[int, float]:
77+
"""Combine multiple metrics into a single weighted score dictionary.
78+
79+
Each metric is a list of values (smaller = better).
80+
The total score for each index is the weighted sum of its values
81+
across all metrics:
82+
83+
score[index] = Σ (value * weight)
84+
85+
Args:
86+
weights: A list of weights, one per metric. Larger weight = more influence.
87+
*metrics: Lists of values (one list per metric, aligned by index).
88+
89+
Returns:
90+
A dictionary mapping each index to its combined weighted score.
91+
92+
"""
93+
if len(weights) != len(metrics):
94+
raise ValueError("Number of weights must match number of metrics")
95+
96+
combined: dict[int, float] = {}
97+
98+
for weight, metric in zip(weights, metrics):
99+
for idx, value in enumerate(metric):
100+
combined[idx] = combined.get(idx, 0) + value * weight
101+
102+
return combined
103+
104+
48105
def diff_length(a: str, b: str) -> int:
49106
"""Compute the length (in characters) of the unified diff between two strings.
50107

codeflash/code_utils/config_consts.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@
1414
DEFAULT_IMPORTANCE_THRESHOLD = 0.001
1515
N_CANDIDATES_LP = 6
1616

17+
# Refinement
18+
REFINE_ALL_THRESHOLD = 2 # when the number of valid optimizations is 2 or fewer, refine all of them
19+
REFINED_CANDIDATE_RANKING_WEIGHTS = (2, 1) # (runtime, diff), runtime is more important than diff by a factor of 2
20+
TOP_N_REFINEMENTS = 0.45 # top 45% of valid optimizations (based on the weighted score) are refined
21+
1722
# LSP-specific
1823
N_CANDIDATES_LSP = 3
1924
N_TESTS_TO_GENERATE_LSP = 2

codeflash/models/models.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,10 @@ class AIServiceRefinerRequest:
3737
optimization_id: str
3838
original_source_code: str
3939
read_only_dependency_code: str
40-
original_code_runtime: str
40+
original_code_runtime: int
4141
optimized_source_code: str
4242
optimized_explanation: str
43-
optimized_code_runtime: str
43+
optimized_code_runtime: int
4444
speedup: str
4545
trace_id: str
4646
original_line_profiler_results: str

codeflash/optimization/function_optimizer.py

Lines changed: 66 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,16 @@
2929
replace_function_definitions_in_module,
3030
)
3131
from codeflash.code_utils.code_utils import (
32+
choose_weights,
3233
cleanup_paths,
3334
create_rank_dictionary_compact,
35+
create_score_dictionary_from_metrics,
3436
diff_length,
3537
extract_unique_errors,
3638
file_name_from_test_module_name,
3739
get_run_tmp_file,
3840
module_name_from_file_path,
41+
normalize_by_max,
3942
restore_conftest,
4043
unified_diff_strings,
4144
)
@@ -46,8 +49,11 @@
4649
N_CANDIDATES_EFFECTIVE,
4750
N_CANDIDATES_LP_EFFECTIVE,
4851
N_TESTS_TO_GENERATE_EFFECTIVE,
52+
REFINE_ALL_THRESHOLD,
53+
REFINED_CANDIDATE_RANKING_WEIGHTS,
4954
REPAIR_UNMATCHED_PERCENTAGE_LIMIT,
5055
REPEAT_OPTIMIZATION_PROBABILITY,
56+
TOP_N_REFINEMENTS,
5157
TOTAL_LOOPING_TIME_EFFECTIVE,
5258
)
5359
from codeflash.code_utils.deduplicate_code import normalize_code
@@ -129,20 +135,24 @@ def __init__(
129135
self,
130136
initial_candidates: list,
131137
future_line_profile_results: concurrent.futures.Future,
132-
future_all_refinements: list[concurrent.futures.Future],
138+
all_refinements_data: list[AIServiceRefinerRequest],
139+
ai_service_client: AiServiceClient,
140+
executor: concurrent.futures.ThreadPoolExecutor,
133141
future_all_code_repair: list[concurrent.futures.Future],
134142
) -> None:
135143
self.candidate_queue = queue.Queue()
136144
self.line_profiler_done = False
137145
self.refinement_done = False
138146
self.candidate_len = len(initial_candidates)
147+
self.ai_service_client = ai_service_client
148+
self.executor = executor
139149

140150
# Initialize queue with initial candidates
141151
for candidate in initial_candidates:
142152
self.candidate_queue.put(candidate)
143153

144154
self.future_line_profile_results = future_line_profile_results
145-
self.future_all_refinements = future_all_refinements
155+
self.all_refinements_data = all_refinements_data
146156
self.future_all_code_repair = future_all_code_repair
147157

148158
def get_next_candidate(self) -> OptimizedCandidate | None:
@@ -177,15 +187,45 @@ def _process_line_profiler_results(self) -> OptimizedCandidate | None:
177187

178188
return self.get_next_candidate()
179189

190+
def refine_optimizations(self, request: list[AIServiceRefinerRequest]) -> concurrent.futures.Future:
191+
return self.executor.submit(self.ai_service_client.optimize_python_code_refinement, request=request)
192+
180193
def _process_refinement_results(self) -> OptimizedCandidate | None:
181-
"""Process refinement results and add to queue."""
182-
if self.future_all_refinements:
194+
"""Process refinement results and add to queue. We generate a weighted ranking based on the runtime and diff lines and select the best (round of 45%) of valid optimizations to be refined."""
195+
future_refinements: list[concurrent.futures.Future] = []
196+
197+
if len(self.all_refinements_data) <= REFINE_ALL_THRESHOLD:
198+
for data in self.all_refinements_data:
199+
future_refinements.append(self.refine_optimizations([data])) # noqa: PERF401
200+
else:
201+
diff_lens_list = []
202+
runtimes_list = []
203+
for c in self.all_refinements_data:
204+
diff_lens_list.append(diff_length(c.original_source_code, c.optimized_source_code))
205+
runtimes_list.append(c.optimized_code_runtime)
206+
207+
runtime_w, diff_w = REFINED_CANDIDATE_RANKING_WEIGHTS
208+
weights = choose_weights(runtime=runtime_w, diff=diff_w)
209+
210+
runtime_norm = normalize_by_max(runtimes_list)
211+
diffs_norm = normalize_by_max(diff_lens_list)
212+
# the lower the better
213+
score_dict = create_score_dictionary_from_metrics(weights, runtime_norm, diffs_norm)
214+
top_n_candidates = int((TOP_N_REFINEMENTS * len(runtimes_list)) + 0.5)
215+
top_indices = sorted(score_dict, key=score_dict.get)[:top_n_candidates]
216+
217+
for idx in top_indices:
218+
data = self.all_refinements_data[idx]
219+
future_refinements.append(self.refine_optimizations([data]))
220+
221+
if future_refinements:
183222
logger.info("loading|Refining generated code for improved quality and performance...")
184-
concurrent.futures.wait(self.future_all_refinements)
223+
224+
concurrent.futures.wait(future_refinements)
185225
refinement_response = []
186226

187-
for future_refinement in self.future_all_refinements:
188-
possible_refinement = future_refinement.result()
227+
for f in future_refinements:
228+
possible_refinement = f.result()
189229
if len(possible_refinement) > 0:
190230
refinement_response.append(possible_refinement[0])
191231

@@ -197,7 +237,6 @@ def _process_refinement_results(self) -> OptimizedCandidate | None:
197237
logger.info(
198238
f"Added {len(refinement_response)} candidates from refinement, total candidates now: {self.candidate_len}"
199239
)
200-
self.future_all_refinements = []
201240
self.refinement_done = True
202241

203242
return self.get_next_candidate()
@@ -278,7 +317,6 @@ def __init__(
278317
max_workers=n_tests + 3 if self.experiment_id is None else n_tests + 4
279318
)
280319
self.optimization_review = ""
281-
self.future_all_refinements: list[concurrent.futures.Future] = []
282320
self.future_all_code_repair: list[concurrent.futures.Future] = []
283321
self.repair_counter = 0 # track how many repairs we did for each function
284322

@@ -724,14 +762,15 @@ def process_single_candidate(
724762
original_helper_code: dict[Path, str],
725763
file_path_to_helper_classes: dict[Path, set[str]],
726764
eval_ctx: CandidateEvaluationContext,
765+
all_refinements_data: list[AIServiceRefinerRequest],
727766
ai_service_client: AiServiceClient,
728767
exp_type: str,
729768
function_references: str,
730769
) -> BestOptimization | None:
731770
"""Process a single optimization candidate.
732771
733772
Returns the BestOptimization if the candidate is successful, None otherwise.
734-
Updates eval_ctx with results and may append to future_all_refinements.
773+
Updates eval_ctx with results and may append to all_refinements_data.
735774
"""
736775
# Cleanup temp files
737776
get_run_tmp_file(Path(f"test_return_values_{candidate_index}.bin")).unlink(missing_ok=True)
@@ -829,17 +868,23 @@ def process_single_candidate(
829868

830869
# Queue refinement for non-refined candidates
831870
if candidate.source != OptimizedCandidateSource.REFINE:
832-
self.future_all_refinements.append(
833-
self.refine_optimizations(
834-
valid_optimizations=[best_optimization],
835-
original_code_baseline=original_code_baseline,
836-
code_context=code_context,
871+
all_refinements_data.append(
872+
AIServiceRefinerRequest(
873+
optimization_id=best_optimization.candidate.optimization_id,
874+
original_source_code=code_context.read_writable_code.markdown,
875+
read_only_dependency_code=code_context.read_only_context_code,
876+
original_code_runtime=original_code_baseline.runtime,
877+
optimized_source_code=best_optimization.candidate.source_code.markdown,
878+
optimized_explanation=best_optimization.candidate.explanation,
879+
optimized_code_runtime=best_optimization.runtime,
880+
speedup=f"{int(performance_gain(original_runtime_ns=original_code_baseline.runtime, optimized_runtime_ns=best_optimization.runtime) * 100)}%",
837881
trace_id=self.get_trace_id(exp_type),
838-
ai_service_client=ai_service_client,
839-
executor=self.executor,
882+
original_line_profiler_results=original_code_baseline.line_profile_results["str_out"],
883+
optimized_line_profiler_results=best_optimization.line_profiler_test_results["str_out"],
840884
function_references=function_references,
841885
)
842-
)
843888

844889
# Display runtime information
845890
if is_LSP_enabled():
@@ -872,7 +917,7 @@ def determine_best_candidate(
872917

873918
# Initialize evaluation context and async tasks
874919
eval_ctx = CandidateEvaluationContext()
875-
self.future_all_refinements.clear()
920+
all_refinements_data: list[AIServiceRefinerRequest] = []
876921
self.future_all_code_repair.clear()
877922
self.repair_counter = 0
878923

@@ -894,7 +939,7 @@ def determine_best_candidate(
894939
)
895940

896941
processor = CandidateProcessor(
897-
candidates, future_line_profile_results, self.future_all_refinements, self.future_all_code_repair
942+
candidates, future_line_profile_results, all_refinements_data, self.future_all_code_repair, self.aiservice_client, self.executor
898943
)
899944
candidate_index = 0
900945

@@ -916,6 +961,7 @@ def determine_best_candidate(
916961
original_helper_code=original_helper_code,
917962
file_path_to_helper_classes=file_path_to_helper_classes,
918963
eval_ctx=eval_ctx,
964+
all_refinements_data=all_refinements_data,
919965
ai_service_client=ai_service_client,
920966
exp_type=exp_type,
921967
function_references=function_references,
@@ -949,54 +995,6 @@ def determine_best_candidate(
949995

950996
return best_optimization
951997

952-
def refine_optimizations(
953-
self,
954-
valid_optimizations: list[BestOptimization],
955-
original_code_baseline: OriginalCodeBaseline,
956-
code_context: CodeOptimizationContext,
957-
trace_id: str,
958-
ai_service_client: AiServiceClient,
959-
executor: concurrent.futures.ThreadPoolExecutor,
960-
function_references: str | None = None,
961-
) -> concurrent.futures.Future:
962-
request = [
963-
AIServiceRefinerRequest(
964-
optimization_id=opt.candidate.optimization_id,
965-
original_source_code=code_context.read_writable_code.markdown,
966-
read_only_dependency_code=code_context.read_only_context_code,
967-
original_code_runtime=humanize_runtime(original_code_baseline.runtime),
968-
optimized_source_code=opt.candidate.source_code.markdown,
969-
optimized_explanation=opt.candidate.explanation,
970-
optimized_code_runtime=humanize_runtime(opt.runtime),
971-
speedup=f"{int(performance_gain(original_runtime_ns=original_code_baseline.runtime, optimized_runtime_ns=opt.runtime) * 100)}%",
972-
trace_id=trace_id,
973-
original_line_profiler_results=original_code_baseline.line_profile_results["str_out"],
974-
optimized_line_profiler_results=opt.line_profiler_test_results["str_out"],
975-
function_references=function_references,
976-
)
977-
for opt in valid_optimizations
978-
]
979-
return executor.submit(ai_service_client.optimize_python_code_refinement, request=request)
980-
981-
def repair_optimization(
982-
self,
983-
original_source_code: str,
984-
modified_source_code: str,
985-
test_diffs: list[TestDiff],
986-
trace_id: str,
987-
optimization_id: str,
988-
ai_service_client: AiServiceClient,
989-
executor: concurrent.futures.ThreadPoolExecutor,
990-
) -> concurrent.futures.Future[OptimizedCandidate | None]:
991-
request = AIServiceCodeRepairRequest(
992-
optimization_id=optimization_id,
993-
original_source_code=original_source_code,
994-
modified_source_code=modified_source_code,
995-
test_diffs=test_diffs,
996-
trace_id=trace_id,
997-
)
998-
return executor.submit(ai_service_client.code_repair, request=request)
999-
1000998
def log_successful_optimization(
1001999
self, explanation: Explanation, generated_tests: GeneratedTestsList, exp_type: str
10021000
) -> None:

0 commit comments

Comments
 (0)