diff --git a/src/guidellm/benchmark/outputs/html.py b/src/guidellm/benchmark/outputs/html.py
index 34cf71073..318d9d4de 100644
--- a/src/guidellm/benchmark/outputs/html.py
+++ b/src/guidellm/benchmark/outputs/html.py
@@ -29,7 +29,7 @@
     GenerativeBenchmark,
     GenerativeBenchmarksReport,
 )
-from guidellm.schemas import DistributionSummary
+from guidellm.schemas import DistributionSummary, Percentiles
 from guidellm.settings import settings
 from guidellm.utils import camelize_str, recursive_key_update
 from guidellm.utils.text import load_text
@@ -190,6 +190,24 @@ def percentile_rows(self) -> list[dict[str, str | float]]:
             filter(lambda row: row["percentile"] in ["p50", "p90", "p95", "p99"], rows)
         )
 
+    def model_dump(self, **kwargs) -> dict:
+        """
+        Override model_dump to filter duplicate consecutive percentile values.
+
+        This prevents visualization errors when distributions have limited data
+        points causing multiple percentiles to collapse to the same value.
+
+        :param kwargs: Arguments to pass to parent model_dump
+        :return: Dictionary with filtered percentiles
+        """
+        data = super().model_dump(**kwargs)
+
+        if "percentiles" in data and data["percentiles"]:
+            filtered_percentiles = _filter_duplicate_percentiles(data["percentiles"])
+            data["percentiles"] = filtered_percentiles
+
+        return data
+
     @classmethod
     def from_distribution_summary(
         cls, distribution: DistributionSummary
@@ -222,6 +240,39 @@ def _create_html_report(js_data: dict[str, str], output_path: Path) -> Path:
     return output_path
 
 
+def _filter_duplicate_percentiles(percentiles: dict[str, float]) -> dict[str, float]:
+    """
+    Filter out consecutive duplicate percentile values.
+
+    When distributions have very few data points, multiple percentiles can
+    collapse to the same value, which causes visualization libraries to fail.
+    This function keeps only the largest percentile in each run of consecutive
+    duplicates, since a shared value is most accurately labelled by the highest
+    percentile that reaches it.
+
+    :param percentiles: Dictionary of percentile names to values
+    :return: Filtered percentiles dictionary with no consecutive duplicates
+    """
+    if not percentiles:
+        return percentiles
+
+    percentile_order = list(Percentiles.model_fields.keys())
+
+    # Iterate in reverse to keep the largest percentile for each value
+    filtered = {}
+    previous_value = None
+
+    for key in reversed(percentile_order):
+        if key in percentiles:
+            current_value = percentiles[key]
+            if previous_value is None or current_value != previous_value:
+                filtered[key] = current_value
+                previous_value = current_value
+
+    # Restore original order
+    return {key: filtered[key] for key in percentile_order if key in filtered}
+
+
 def _inject_data(js_data: dict[str, str], html: str) -> str:
     """
     Inject JavaScript data into HTML head section.
diff --git a/tests/unit/benchmark/test_html_output.py b/tests/unit/benchmark/test_html_output.py
new file mode 100644
index 000000000..39c46a763
--- /dev/null
+++ b/tests/unit/benchmark/test_html_output.py
@@ -0,0 +1,164 @@
+## WRITTEN BY AI ##
+from guidellm.benchmark.outputs.html import _filter_duplicate_percentiles
+from guidellm.schemas import Percentiles
+
+
+def test_filter_all_same_values():
+    """Test filtering when all percentiles have the same value."""
+    percentiles = {
+        "p001": 15.288091352804853,
+        "p01": 15.288091352804853,
+        "p05": 15.288091352804853,
+        "p10": 15.288091352804853,
+        "p25": 15.288091352804853,
+        "p50": 15.288091352804853,
+        "p75": 15.288091352804853,
+        "p90": 15.288091352804853,
+        "p95": 15.288091352804853,
+        "p99": 15.288091352804853,
+        "p999": 15.288091352804853,
+    }
+
+    filtered = _filter_duplicate_percentiles(percentiles)
+
+    # Should only keep the largest (p999) for mathematical accuracy
+    assert filtered == {"p999": 15.288091352804853}
+
+
+def test_filter_consecutive_duplicates():
+    """Test filtering when some consecutive percentiles have the same value."""
+    percentiles = {
+        "p001": 15.288091352804853,
+        "p01": 15.288091352804853,
+        "p05": 15.288091352804853,
+        "p10": 15.288091352804853,
+        "p25": 15.288091352804853,
+        "p50": 16.41327511776994,  # Different value
+        "p75": 16.41327511776994,
+        "p90": 17.03541629998259,  # Different value
+        "p95": 17.03541629998259,
+        "p99": 17.03541629998259,
+        "p999": 17.03541629998259,
+    }
+
+    filtered = _filter_duplicate_percentiles(percentiles)
+
+    # Should keep largest of each group for mathematical accuracy
+    assert filtered == {
+        "p25": 15.288091352804853,
+        "p75": 16.41327511776994,
+        "p999": 17.03541629998259,
+    }
+
+
+def test_mixed_unique_and_duplicate_values():
+    """Test that unique values are preserved while each duplicate run
+    keeps only its largest percentile."""
+    percentiles = {
+        "p001": 13.181080445834912,
+        "p01": 13.181080445834912,  # Same as p001
+        "p05": 13.530595573836457,  # Different
+        "p10": 13.843972502554365,
+        "p25": 14.086376978251748,
+        "p50": 14.403258051191058,
+        "p75": 14.738608817056042,
+        "p90": 15.18136631856698,
+        "p95": 15.7213110894772,
+        "p99": 15.7213110894772,  # Same as p95
+        "p999": 15.7213110894772,  # Same as p99
+    }
+
+    filtered = _filter_duplicate_percentiles(percentiles)
+
+    # Should keep largest of each duplicate group (e.g. p999 instead of p95)
+    assert filtered == {
+        "p01": 13.181080445834912,
+        "p05": 13.530595573836457,
+        "p10": 13.843972502554365,
+        "p25": 14.086376978251748,
+        "p50": 14.403258051191058,
+        "p75": 14.738608817056042,
+        "p90": 15.18136631856698,
+        "p999": 15.7213110894772,
+    }
+
+
+def test_empty_percentiles():
+    """Test with empty percentiles dictionary."""
+    filtered = _filter_duplicate_percentiles({})
+    assert filtered == {}
+
+
+def test_single_percentile():
+    """Test with only one percentile."""
+    percentiles = {"p50": 14.403258051191058}
+    filtered = _filter_duplicate_percentiles(percentiles)
+    assert filtered == {"p50": 14.403258051191058}
+
+
+def test_two_different_values():
+    """Test with two different values."""
+    percentiles = {
+        "p25": 14.086376978251748,
+        "p50": 14.403258051191058,
+    }
+    filtered = _filter_duplicate_percentiles(percentiles)
+    assert filtered == percentiles
+
+
+def test_partial_percentiles():
+    """Test that order is maintained even with partial percentiles."""
+    percentiles = {
+        "p50": 16.41327511776994,
+        "p10": 15.288091352804853,
+        "p90": 17.03541629998259,
+    }
+
+    filtered = _filter_duplicate_percentiles(percentiles)
+
+    # Should maintain order from percentile_order list
+    assert list(filtered.keys()) == ["p10", "p50", "p90"]
+
+
+def test_model_dump_filters_duplicates():
+    """Test that model_dump applies percentile filtering."""
+    from guidellm.benchmark.outputs.html import _TabularDistributionSummary
+
+    # Create a distribution with duplicate percentiles (typical of small datasets)
+    dist = _TabularDistributionSummary(
+        mean=15.5,
+        median=15.288091352804853,
+        mode=15.288091352804853,
+        variance=0.1,
+        std_dev=0.316,
+        min=15.288091352804853,
+        max=17.03541629998259,
+        count=3,
+        total_sum=46.5,
+        percentiles=Percentiles(
+            p001=15.288091352804853,
+            p01=15.288091352804853,
+            p05=15.288091352804853,
+            p10=15.288091352804853,
+            p25=15.288091352804853,
+            p50=16.41327511776994,
+            p75=16.41327511776994,
+            p90=17.03541629998259,
+            p95=17.03541629998259,
+            p99=17.03541629998259,
+            p999=17.03541629998259,
+        ),
+    )
+
+    data = dist.model_dump()
+
+    # Check that percentiles were filtered, keeping largest of each group
+    assert data["percentiles"] == {
+        "p25": 15.288091352804853,
+        "p75": 16.41327511776994,
+        "p999": 17.03541629998259,
+    }
+
+    # Ensure other fields remain unchanged
+    assert data["mean"] == 15.5
+    assert data["median"] == 15.288091352804853
+    assert data["count"] == 3