Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 30 additions & 2 deletions chromadb/test/distributed/test_statistics_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import time
from typing import Any

import pytest

from chromadb.api.client import Client as ClientCreator
from chromadb.base_types import SparseVector
from chromadb.config import System
Expand Down Expand Up @@ -234,7 +236,7 @@ def test_statistics_wrapper_key_filter(basic_http_client: System) -> None:

# Get statistics filtered by "category" key only
category_stats = get_statistics(
collection, "key_filter_test_statistics", key="category"
collection, "key_filter_test_statistics", keys=["category"]
)
assert "category" in category_stats["statistics"]
assert "score" not in category_stats["statistics"]
Expand All @@ -246,7 +248,9 @@ def test_statistics_wrapper_key_filter(basic_http_client: System) -> None:
assert category_stats["summary"]["total_count"] == 3

# Get statistics filtered by "score" key only
score_stats = get_statistics(collection, "key_filter_test_statistics", key="score")
score_stats = get_statistics(
collection, "key_filter_test_statistics", keys=["score"]
)
assert "score" in score_stats["statistics"]
assert "category" not in score_stats["statistics"]
assert "active" not in score_stats["statistics"]
Expand All @@ -260,6 +264,30 @@ def test_statistics_wrapper_key_filter(basic_http_client: System) -> None:
detach_statistics_function(collection, delete_stats_collection=True)


def test_statistics_wrapper_key_filter_too_many_keys(basic_http_client: System) -> None:
"""Test that get_statistics raises ValueError when more than 30 keys are provided"""
client = ClientCreator.from_system(basic_http_client)
client.reset()

collection = client.create_collection(name="too_many_keys_test")

# Enable statistics
attach_statistics_function(collection, "too_many_keys_test_statistics")

# Generate more than 30 keys
too_many_keys = [f"key_{i}" for i in range(31)]

# Should raise ValueError when more than 30 keys are provided
with pytest.raises(ValueError) as exc_info:
get_statistics(collection, "too_many_keys_test_statistics", keys=too_many_keys)

assert "Too many keys provided: 31" in str(exc_info.value)
assert "Maximum allowed is 30" in str(exc_info.value)

# Cleanup
detach_statistics_function(collection, delete_stats_collection=True)


# commenting out for now as waiting for query cache invalidateion slows down the test suite
def test_statistics_wrapper_incremental_updates(basic_http_client: System) -> None:
"""Test that statistics are updated incrementally"""
Expand Down
32 changes: 25 additions & 7 deletions chromadb/utils/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from typing import TYPE_CHECKING, Optional, Dict, Any, cast
from collections import defaultdict

from chromadb.api.types import Where
from chromadb.api.types import OneOrMany, Where, maybe_cast_one_to_many

if TYPE_CHECKING:
from chromadb.api.models.Collection import Collection
Expand Down Expand Up @@ -121,7 +121,9 @@ def detach_statistics_function(


def get_statistics(
collection: "Collection", stats_collection_name: str, key: Optional[str] = None
collection: "Collection",
stats_collection_name: str,
keys: Optional[OneOrMany[str]] = None,
) -> Dict[str, Any]:
Comment on lines 123 to 127
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Important

[Maintainability] Renaming the key parameter to keys is a breaking API change for any callers using keyword arguments. While the new name is more descriptive, this could be made backward-compatible to avoid breaking existing user code and provide a smoother transition.

If this breaking change is intentional, it should be clearly documented in the project's release notes. If not, you could consider handling the old key parameter with a deprecation warning.

Here's an example of how you could support both for a transition period by modifying the function signature and adding logic to the function body:

Signature:

def get_statistics(
    collection: "Collection",
    stats_collection_name: str,
    keys: Optional[OneOrMany[str]] = None,
    key: Optional[str] = None,  # Deprecated
) -> Dict[str, Any]:

Function Body Logic:

    if key is not None:
        import warnings
        warnings.warn(
            "The 'key' parameter is deprecated and will be removed in a future version. Please use 'keys' instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        if keys is not None:
            raise ValueError("Cannot provide both 'key' and 'keys' arguments.")
        keys = key
Context for Agents
Renaming the `key` parameter to `keys` is a breaking API change for any callers using keyword arguments. While the new name is more descriptive, this could be made backward-compatible to avoid breaking existing user code and provide a smoother transition.

If this breaking change is intentional, it should be clearly documented in the project's release notes. If not, you could consider handling the old `key` parameter with a deprecation warning.

Here's an example of how you could support both for a transition period by modifying the function signature and adding logic to the function body:

**Signature:**
```python
def get_statistics(
    collection: "Collection",
    stats_collection_name: str,
    keys: Optional[OneOrMany[str]] = None,
    key: Optional[str] = None,  # Deprecated
) -> Dict[str, Any]:
```

**Function Body Logic:**
```python
    if key is not None:
        import warnings
        warnings.warn(
            "The 'key' parameter is deprecated and will be removed in a future version. Please use 'keys' instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        if keys is not None:
            raise ValueError("Cannot provide both 'key' and 'keys' arguments.")
        keys = key
```

File: chromadb/utils/statistics.py
Line: 127

"""Get the current statistics for a collection.

Expand All @@ -131,8 +133,9 @@ def get_statistics(
Args:
collection: The collection to get statistics for
stats_collection_name: Name of the statistics collection to read from.
key: Optional metadata key to filter statistics for. If provided,
only returns statistics for that specific key.
keys: Optional metadata key(s) to filter statistics for. Can be a single key
string or a list of keys. If provided, only returns statistics for
those specific keys.

Returns:
Dict[str, Any]: A dictionary with the structure:
Expand Down Expand Up @@ -174,7 +177,22 @@ def get_statistics(
"total_count": 2
}
}

Raises:
ValueError: If more than 30 keys are provided in the keys filter.
"""
# Normalize keys to list
keys_list = maybe_cast_one_to_many(keys)

# Validate keys count to avoid issues with large $in queries
MAX_KEYS = 30
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Recommended

[Maintainability] Consider defining MAX_KEYS as a module-level constant (e.g., at the top of the file) instead of inside the function. This improves readability, makes it clear that this is a fixed configuration value, and avoids re-declaration on every function call.

Context for Agents
Consider defining `MAX_KEYS` as a module-level constant (e.g., at the top of the file) instead of inside the function. This improves readability, makes it clear that this is a fixed configuration value, and avoids re-declaration on every function call.

File: chromadb/utils/statistics.py
Line: 188

if keys_list is not None and len(keys_list) > MAX_KEYS:
raise ValueError(
f"Too many keys provided: {len(keys_list)}. "
f"Maximum allowed is {MAX_KEYS} keys per request. "
"Consider calling get_statistics multiple times with smaller key batches."
)

# Import here to avoid circular dependency
from chromadb.api.models.Collection import Collection

Expand All @@ -198,10 +216,10 @@ def get_statistics(
summary: Dict[str, Any] = {}

offset = 0
# When filtering by key, also include "summary" entries to get total_count
# When filtering by keys, also include "summary" entries to get total_count
where_filter: Optional[Where] = (
cast(Where, {"$or": [{"key": key}, {"key": "summary"}]})
if key is not None
cast(Where, {"key": {"$in": keys_list + ["summary"]}})
if keys_list is not None
else None
)

Expand Down