123 changes: 72 additions & 51 deletions ax/benchmark/benchmark.py
@@ -54,9 +54,13 @@
from ax.generation_strategy.generation_strategy import GenerationStrategy
from ax.orchestration.orchestrator import Orchestrator
from ax.service.utils.best_point import (
_aggregate_and_cumulate_trace,
_compute_trace_values,
_pivot_data_with_feasibility,
_prepare_data_for_trace,
derelativize_opt_config,
get_trace,
is_row_feasible,
)
from ax.service.utils.best_point_mixin import BestPointMixin
from ax.service.utils.orchestrator_options import OrchestratorOptions, TrialType
@@ -791,64 +795,80 @@ def get_opt_trace_by_steps(experiment: Experiment) -> npt.NDArray:
that is in terms of steps, with one element added each time a step
completes.

Supports single-objective, multi-objective, and constrained problems.
For multi-objective problems, the trace is in terms of hypervolume.

Args:
experiment: An experiment produced by `benchmark_replication`; it must
have `BenchmarkTrialMetadata` (as produced by `BenchmarkRunner`) for
each trial, and its data must have a "step" column.
"""
optimization_config = none_throws(experiment.optimization_config)
full_df = experiment.lookup_data().full_df

if optimization_config.is_moo_problem:
raise NotImplementedError(
"Cumulative epochs only supported for single objective problems."
)
if len(optimization_config.outcome_constraints) > 0:
raise NotImplementedError(
"Cumulative epochs not supported for problems with outcome constraints."
)
full_df["row_feasible"] = is_row_feasible(
df=full_df,
optimization_config=optimization_config,
# For the sake of this function, we only care about feasible trials. The
# distinction between infeasible and undetermined is not important.
undetermined_value=False,
)

objective_name = optimization_config.objective.metric.name
data = experiment.lookup_data()
full_df = data.full_df
# Pivot to wide format with feasibility
df_wide = _pivot_data_with_feasibility(
df=full_df,
index=["trial_index", "arm_name", MAP_KEY],
optimization_config=optimization_config,
)

# Has timestamps; needs to be merged with full_df because the metadata dfs
# contain data on epochs that didn't actually run due to early stopping, and
# we need to know which ones actually ran
def _get_df(trial: Trial) -> pd.DataFrame:
def _get_timestamps(experiment: Experiment) -> pd.Series:
"""
Get the (virtual) time each epoch finished at.
Get the (virtual) time at which each training progression finished.
"""
metadata = trial.run_metadata["benchmark_metadata"]
backend_simulator = none_throws(metadata.backend_simulator)
# Data for the first metric, which is the only metric
df = next(iter(metadata.dfs.values()))
start_time = backend_simulator.get_sim_trial_by_index(
trial.index
).sim_start_time
df["time"] = df["virtual runtime"] + start_time
return df

with_timestamps = pd.concat(
(
_get_df(trial=assert_is_instance(trial, Trial))
for trial in experiment.trials.values()
),
axis=0,
ignore_index=True,
)[["trial_index", MAP_KEY, "time"]]

df = (
full_df.loc[
full_df["metric_name"] == objective_name,
["trial_index", "arm_name", "mean", MAP_KEY],
]
.merge(with_timestamps, how="left")
.sort_values("time", ignore_index=True)
frames = []
for trial in experiment.trials.values():
trial = assert_is_instance(trial, Trial)
metadata = trial.run_metadata["benchmark_metadata"]
backend_simulator = none_throws(metadata.backend_simulator)
sim_trial = backend_simulator.get_sim_trial_by_index(
trial_index=trial.index
)
start_time = sim_trial.sim_start_time
# timestamps are identical across all metrics, so just use the first one
frame = next(iter(metadata.dfs.values())).copy()
frame["time"] = frame["virtual runtime"] + start_time
frames.append(frame)
df = pd.concat(frames, axis=0, ignore_index=True).set_index(
["trial_index", "arm_name", MAP_KEY]
)
return df["time"]

# Compute timestamps and join with df_wide *before* cumulative computations.
# This is critical because cumulative HV/objective calculations depend on
# the temporal ordering of observations.
timestamps = _get_timestamps(experiment=experiment)

# Merge timestamps and sort by time before cumulative computations
df_wide = df_wide.join(
timestamps, on=["trial_index", "arm_name", MAP_KEY], how="left"
).sort_values(by="time", ascending=True, ignore_index=True)

# Compute per-evaluation (trial_index, MAP_KEY) cumulative values in the
# timestamp order established above
df_wide["value"], maximize = _compute_trace_values(
df_wide=df_wide,
optimization_config=optimization_config,
use_cumulative_best=True,
)
return (
df["mean"].cummin()
if optimization_config.objective.minimize
else df["mean"].cummax()
# Get a value for each (trial_index, arm_name, MAP_KEY) tuple
value_by_arm_pull = df_wide[["trial_index", "arm_name", MAP_KEY, "value"]]

# Aggregate by trial and step, then compute the cumulative best;
# keep_order=True preserves the ordering by timestamp
return _aggregate_and_cumulate_trace(
df=value_by_arm_pull,
by=["trial_index", MAP_KEY],
maximize=maximize,
keep_order=True,
).to_numpy()
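
Reviewer note: the step-based trace boils down to ordering per-step observations by their virtual completion time and taking a running best. The sketch below is a simplified, single-objective, unconstrained analogue of the logic above; it stands in for `_pivot_data_with_feasibility`, `_compute_trace_values`, and `_aggregate_and_cumulate_trace`, which handle feasibility, hypervolume, and aggregation in the real implementation. The column names and values are assumptions for illustration only.

import numpy as np
import pandas as pd

def step_trace_sketch(df: pd.DataFrame, minimize: bool = True) -> np.ndarray:
    # `df` is assumed to have one row per (trial_index, step) with a "time"
    # column (virtual completion time of the step) and a "mean" column
    # (objective value observed at that step).
    ordered = df.sort_values("time", ignore_index=True)
    # Running best-so-far over the time-ordered objective values.
    values = ordered["mean"]
    return (values.cummin() if minimize else values.cummax()).to_numpy()

# Example: three steps from two trials, ordered by virtual completion time.
example = pd.DataFrame(
    {"trial_index": [0, 1, 0], "time": [1.0, 2.0, 3.0], "mean": [5.0, 3.0, 4.0]}
)
print(step_trace_sketch(example))  # [5. 3. 3.]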


@@ -867,14 +887,15 @@ def get_benchmark_result_with_cumulative_steps(
opt_trace = get_opt_trace_by_steps(experiment=experiment)
return replace(
result,
optimization_trace=opt_trace,
cost_trace=np.arange(1, len(opt_trace) + 1, dtype=int),
optimization_trace=opt_trace.tolist(),
cost_trace=np.arange(1, len(opt_trace) + 1, dtype=int).tolist(),
# Empty
oracle_trace=np.full(len(opt_trace), np.nan),
inference_trace=np.full(len(opt_trace), np.nan),
oracle_trace=np.full_like(opt_trace, np.nan).tolist(),
inference_trace=np.full_like(opt_trace, np.nan).tolist(),
is_feasible_trace=None,
score_trace=compute_score_trace(
optimization_trace=opt_trace,
baseline_value=baseline_value,
optimal_value=optimal_value,
),
).tolist(),
)
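
A quick illustration of the bookkeeping above (the array contents are made up): the cost trace simply counts completed steps, and the oracle/inference traces are filled with NaNs of matching length since they are not meaningful for step-level traces.

import numpy as np

opt_trace = np.array([5.0, 3.0, 3.0, 2.0])  # hypothetical step-based trace
cost_trace = np.arange(1, len(opt_trace) + 1, dtype=int)  # one unit of cost per step
placeholder = np.full_like(opt_trace, np.nan)  # unused oracle/inference traces
print(cost_trace.tolist())   # [1, 2, 3, 4]
print(placeholder.tolist())  # [nan, nan, nan, nan]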
52 changes: 45 additions & 7 deletions ax/benchmark/testing/benchmark_stubs.py
@@ -312,15 +312,52 @@ def get_async_benchmark_problem(
n_steps: int = 1,
lower_is_better: bool = False,
report_inference_value_as_trace: bool = False,
num_objectives: int = 1,
num_constraints: int = 0,
) -> BenchmarkProblem:
"""
Create an async benchmark problem, optionally with map (per-step) data for early stopping.

Args:
map_data: Whether to use map metrics (required for early stopping).
step_runtime_fn: Optional function mapping arm parameters to per-step runtime.
n_steps: Number of steps per trial.
lower_is_better: Whether lower values are better (for SOO).
report_inference_value_as_trace: Whether to report the inference value as the trace.
num_objectives: Number of objectives (1 for SOO, >1 for MOO).
num_constraints: Number of outcome constraints to add.

Returns:
A BenchmarkProblem suitable for early-stopping evaluation.
"""
search_space = get_discrete_search_space()
test_function = IdentityTestFunction(n_steps=n_steps)
optimization_config = get_soo_opt_config(
outcome_names=["objective"],
use_map_metric=map_data,
observe_noise_sd=True,
lower_is_better=lower_is_better,
)

# Create outcome names for objectives and constraints
objective_names = [f"objective_{i}" for i in range(num_objectives)]
constraint_names = [f"constraint_{i}" for i in range(num_constraints)]
outcome_names = [*objective_names, *constraint_names]

test_function = IdentityTestFunction(n_steps=n_steps, outcome_names=outcome_names)

if num_objectives == 1:
# Single-objective: first outcome is objective, rest are constraints
optimization_config = get_soo_opt_config(
outcome_names=outcome_names,
lower_is_better=lower_is_better,
observe_noise_sd=True,
use_map_metric=map_data,
)
else:
# Multi-objective: pass all outcomes (objectives + constraints)
# get_moo_opt_config will use the last num_constraints as constraints
optimization_config = get_moo_opt_config(
outcome_names=outcome_names,
ref_point=[1.0] * num_objectives,
num_constraints=num_constraints,
lower_is_better=lower_is_better,
observe_noise_sd=True,
use_map_metric=map_data,
)

return BenchmarkProblem(
name="test",
@@ -330,6 +367,7 @@ def get_async_benchmark_problem(
num_trials=4,
baseline_value=19 if lower_is_better else 0,
optimal_value=0 if lower_is_better else 19,
worst_feasible_value=5.0 if num_constraints > 0 else None,
step_runtime_function=step_runtime_fn,
report_inference_value_as_trace=report_inference_value_as_trace,
)
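
Usage sketch mirroring the new tests: one call produces a two-objective problem with per-step map data, the other a single-objective problem with one outcome constraint (argument values copied from the tests below).

moo_problem = get_async_benchmark_problem(
    map_data=True, n_steps=5, num_objectives=2
)
constrained_problem = get_async_benchmark_problem(
    map_data=True, n_steps=5, num_constraints=1
)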
85 changes: 71 additions & 14 deletions ax/benchmark/tests/test_benchmark.py
@@ -1195,28 +1195,85 @@ def test_get_opt_trace_by_cumulative_epochs(self) -> None:
new_opt_trace = get_opt_trace_by_steps(experiment=experiment)
self.assertEqual(list(new_opt_trace), [0.0, 0.0, 1.0, 1.0, 2.0, 3.0])

method = get_sobol_benchmark_method()
with self.subTest("MOO"):
problem = get_multi_objective_benchmark_problem()

with self.subTest("Multi-objective"):
# Multi-objective problem with step data
problem = get_async_benchmark_problem(
map_data=True,
n_steps=5,
num_objectives=2,
# Ensure we don't have two finishing at the same time, for
# determinism
step_runtime_fn=lambda params: params["x0"] * (1 - 0.01 * params["x0"]),
)
experiment = self.run_optimization_with_orchestrator(
problem=problem, method=method, seed=0
)
with self.assertRaisesRegex(
NotImplementedError, "only supported for single objective"
):
get_opt_trace_by_steps(experiment=experiment)
new_opt_trace = get_opt_trace_by_steps(experiment=experiment)
self.assertListEqual(
new_opt_trace.tolist(),
[
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
4.0,
4.0,
4.0,
4.0,
4.0,
4.0,
4.0,
],
)

with self.subTest("Constrained"):
problem = get_benchmark_problem("constrained_gramacy_observed_noise")
# Constrained problem with step data.
problem = get_async_benchmark_problem(
map_data=True,
n_steps=5,
num_constraints=1,
# Ensure we don't have two finishing at the same time, for
# determinism
step_runtime_fn=lambda params: params["x0"] * (1 - 0.01 * params["x0"]),
)
experiment = self.run_optimization_with_orchestrator(
problem=problem, method=method, seed=0
)
with self.assertRaisesRegex(
NotImplementedError,
"not supported for problems with outcome constraints",
):
get_opt_trace_by_steps(experiment=experiment)
new_opt_trace = get_opt_trace_by_steps(experiment=experiment)
self.assertListEqual(
new_opt_trace.tolist(),
[
0.0,
0.0,
0.0,
0.0,
0.0,
1.0,
1.0,
2.0,
2.0,
2.0,
2.0,
2.0,
2.0,
3.0,
3.0,
3.0,
3.0,
3.0,
3.0,
3.0,
],
)

def test_get_benchmark_result_with_cumulative_steps(self) -> None:
"""See test_get_opt_trace_by_cumulative_epochs for more info."""