From 2018784ad9dad9f7a875fa8882bfdded3f417528 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Fri, 17 Apr 2026 13:37:33 -0400 Subject: [PATCH 1/2] fix: handle URL length limits in Typesense engine for large courses When indexing a large course with many content blocks, the Typesense search engine could hit two related URL-length failures: 1. search(): Building an exclusion filter like 'id:!=[id1, id2, ...]' for hundreds of item IDs produced a URL query string that exceeded httpx's internal limit, raising: httpx.InvalidURL: URL component 'query' too long An existing fallback for RequestMalformed ('Query string exceeds max allowed length') did not cover this client-side failure. Fix: also catch httpx.InvalidURL (narrowed to 'too long' message) and route both cases to the multi-search POST endpoint, which has no URL length constraint. 2. remove(): Deleting many stale documents built a single 'id: [id1,id2,...]' filter sent as a DELETE query parameter, which is equally subject to URL length limits. Fix: chunk deletions into batches of _MAX_IDS_PER_DELETE_BATCH (100) IDs per request. Also fix a latent bug in get_search_params(): the exclude_dictionary loop iterated unconditionally, raising AttributeError when the caller passed exclude_dictionary=None (the documented default). Tests added for all three fixes in search/tests/test_typesense.py. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- search/tests/test_typesense.py | 231 +++++++++++++++++++++++++++++++++ search/typesense.py | 46 +++++-- 2 files changed, 266 insertions(+), 11 deletions(-) create mode 100644 search/tests/test_typesense.py diff --git a/search/tests/test_typesense.py b/search/tests/test_typesense.py new file mode 100644 index 00000000..a20fe66a --- /dev/null +++ b/search/tests/test_typesense.py @@ -0,0 +1,231 @@ +""" +Tests for the Typesense search engine. +""" +from unittest.mock import MagicMock, patch + +import django.test +import httpx +import pytest +from django.test import override_settings +from typesense.exceptions import RequestMalformed + +from search.typesense import ( + TypesenseEngine, + _MAX_IDS_PER_DELETE_BATCH, + get_search_params, +) + +TYPESENSE_SETTINGS = { + "TYPESENSE_API_KEY": "test-api-key", + "TYPESENSE_URLS": [{"host": "localhost", "port": "8108", "protocol": "http"}], + "TYPESENSE_COLLECTION_PREFIX": "test_", +} + + +# A minimal fake Typesense search response (as returned by the Python client). +def _make_typesense_response(hits=None, found=None): + hits = hits or [] + return { + "hits": hits, + "facet_counts": [], + "found": found if found is not None else len(hits), + "out_of": len(hits), + "page": 1, + "request_params": {"collection_name": "test_collection", "per_page": 10, "q": ""}, + "search_cutoff": False, + "search_time_ms": 1, + } + + +@override_settings(**TYPESENSE_SETTINGS) +class GetSearchParamsTests(django.test.TestCase): + """Unit tests for get_search_params().""" + + def test_none_exclude_dictionary_does_not_raise(self): + """Passing exclude_dictionary=None should not raise AttributeError.""" + params = get_search_params(exclude_dictionary=None) + assert "filter_by" not in params + + def test_empty_exclude_dictionary_produces_no_filter(self): + params = get_search_params(exclude_dictionary={}) + assert "filter_by" not in params + + def test_exclude_dictionary_single_field(self): + params = get_search_params(exclude_dictionary={"status": ["deleted"]}) + assert params["filter_by"] == "status:!=[`deleted`]" + + def test_exclude_dictionary_multiple_ids(self): + params = get_search_params(exclude_dictionary={"id": ["id1", "id2", "id3"]}) + assert params["filter_by"] == "id:!=[`id1`, `id2`, `id3`]" + + def test_field_dictionary_and_exclude_dictionary_combined(self): + params = get_search_params( + field_dictionary={"course": "course-v1:org+x+run"}, + exclude_dictionary={"id": ["stale-id"]}, + ) + assert "course:=`course-v1:org+x+run`" in params["filter_by"] + assert "id:!=[`stale-id`]" in params["filter_by"] + + def test_pagination_params(self): + params = get_search_params(from_=50, size=25) + assert params["offset"] == 50 + assert params["limit"] == 25 + + +@override_settings(**TYPESENSE_SETTINGS) +class TypesenseEngineSearchTests(django.test.TestCase): + """Tests for TypesenseEngine.search() fallback behaviour.""" + + def _make_engine(self, mock_index): + """Return a TypesenseEngine whose typesense_index is the given mock.""" + engine = TypesenseEngine.__new__(TypesenseEngine) + engine.index_name = "courseware_content" + engine._typesense_index = mock_index # pylint: disable=protected-access + return engine + + def test_search_success(self): + """Normal search path returns processed results.""" + mock_index = MagicMock() + mock_index.documents.search.return_value = _make_typesense_response( + hits=[{"document": {"id": "block-1"}, "highlights": [], "text_match": 100}], + found=1, + ) + engine = self._make_engine(mock_index) + result = engine.search(field_dictionary={"course": "course-v1:org+x+run"}) + assert result["total"] == 1 + assert result["results"][0]["data"]["id"] == "block-1" + + def test_search_falls_back_on_httpx_invalid_url(self): + """ + When the Typesense client raises httpx.InvalidURL ('query too long'), + search() must fall back to the multi-search POST endpoint. + """ + mock_index = MagicMock() + mock_index.documents.search.side_effect = httpx.InvalidURL( + "URL component 'query' too long" + ) + + expected_response = _make_typesense_response(found=0) + mock_client = MagicMock() + mock_client.multi_search.perform.return_value = {"results": [expected_response]} + + engine = self._make_engine(mock_index) + engine._typesense_index = mock_index # pylint: disable=protected-access + + with patch("search.typesense.get_typesense_client", return_value=mock_client): + result = engine.search(field_dictionary={"course": "course-v1:org+x+run"}) + + assert mock_client.multi_search.perform.called + assert result["total"] == 0 + + def test_search_falls_back_on_request_malformed_too_long(self): + """ + Existing behaviour: RequestMalformed('Query string exceeds max allowed length') + also routes to multi-search. + """ + mock_index = MagicMock() + mock_index.documents.search.side_effect = RequestMalformed( + "Query string exceeds max allowed length of 4096" + ) + + expected_response = _make_typesense_response(found=0) + mock_client = MagicMock() + mock_client.multi_search.perform.return_value = {"results": [expected_response]} + + engine = self._make_engine(mock_index) + + with patch("search.typesense.get_typesense_client", return_value=mock_client): + result = engine.search(field_dictionary={"course": "course-v1:org+x+run"}) + + assert mock_client.multi_search.perform.called + assert result["total"] == 0 + + def test_search_does_not_swallow_unrelated_httpx_invalid_url(self): + """ + An httpx.InvalidURL that is NOT about length should be re-raised, not + silently swallowed. + """ + mock_index = MagicMock() + mock_index.documents.search.side_effect = httpx.InvalidURL( + "URL component 'host' is invalid" + ) + + engine = self._make_engine(mock_index) + with pytest.raises(httpx.InvalidURL): + engine.search(field_dictionary={"course": "course-v1:org+x+run"}) + + def test_search_does_not_swallow_unrelated_request_malformed(self): + """ + A RequestMalformed that is not a length error should be re-raised. + """ + mock_index = MagicMock() + mock_index.documents.search.side_effect = RequestMalformed( + "Some other malformed request error" + ) + + engine = self._make_engine(mock_index) + with pytest.raises(RequestMalformed): + engine.search(field_dictionary={"course": "course-v1:org+x+run"}) + + +@override_settings(**TYPESENSE_SETTINGS) +class TypesenseEngineRemoveTests(django.test.TestCase): + """Tests for TypesenseEngine.remove() chunking.""" + + def _make_engine(self, mock_index): + engine = TypesenseEngine.__new__(TypesenseEngine) + engine.index_name = "courseware_content" + engine._typesense_index = mock_index # pylint: disable=protected-access + return engine + + def test_remove_empty_list_does_nothing(self): + mock_index = MagicMock() + engine = self._make_engine(mock_index) + engine.remove([]) + mock_index.documents.delete.assert_not_called() + + def test_remove_small_batch_single_request(self): + """IDs within the batch limit are deleted in one request.""" + mock_index = MagicMock() + engine = self._make_engine(mock_index) + ids = [f"block-{i}" for i in range(10)] + engine.remove(ids) + assert mock_index.documents.delete.call_count == 1 + + def test_remove_large_batch_chunks_requests(self): + """ + More IDs than _MAX_IDS_PER_DELETE_BATCH must be split across multiple + delete requests so no single request has a URL query string that is too long. + """ + mock_index = MagicMock() + engine = self._make_engine(mock_index) + total_ids = _MAX_IDS_PER_DELETE_BATCH * 3 + 1 # forces 4 batches + ids = [f"block-{i}" for i in range(total_ids)] + engine.remove(ids) + assert mock_index.documents.delete.call_count == 4 + + def test_remove_exact_batch_size(self): + """Exactly _MAX_IDS_PER_DELETE_BATCH IDs fit in one request.""" + mock_index = MagicMock() + engine = self._make_engine(mock_index) + ids = [f"block-{i}" for i in range(_MAX_IDS_PER_DELETE_BATCH)] + engine.remove(ids) + assert mock_index.documents.delete.call_count == 1 + + def test_remove_each_batch_contains_correct_ids(self): + """Each batch's filter_by contains only its own slice of IDs.""" + mock_index = MagicMock() + engine = self._make_engine(mock_index) + ids = [f"id-{i}" for i in range(_MAX_IDS_PER_DELETE_BATCH + 2)] + engine.remove(ids) + + calls = mock_index.documents.delete.call_args_list + assert len(calls) == 2 + + first_filter = calls[0][0][0]["filter_by"] + second_filter = calls[1][0][0]["filter_by"] + + # First batch should contain exactly _MAX_IDS_PER_DELETE_BATCH entries + assert first_filter.count("`id-") == _MAX_IDS_PER_DELETE_BATCH + # Second batch should contain the remaining 2 + assert second_filter.count("`id-") == 2 diff --git a/search/typesense.py b/search/typesense.py index 7ee27461..b09d1a5f 100644 --- a/search/typesense.py +++ b/search/typesense.py @@ -38,6 +38,7 @@ from django.conf import settings from django.utils import timezone +import httpx import typesense from typesense.sync.collection import Collection # moved in typesense-python 2.0 from typesense.exceptions import ObjectNotFound, RequestMalformed, ServerError @@ -105,6 +106,10 @@ logger = logging.getLogger(__name__) +# Maximum number of document IDs to include in a single filter-based delete request. +# Larger batches produce a URL query string that exceeds httpx's length limit. +_MAX_IDS_PER_DELETE_BATCH = 100 + class TypesenseEngine(SearchEngine): """ @@ -208,9 +213,21 @@ def search( search_args = {"q": query_string, "query_by": query_by, **compiled_params} try: results = self.typesense_index.documents.search(search_args) - except RequestMalformed as err: - if "Query string exceeds max allowed length" in str(err): - # To do large queries (with complex filters), we need to use the multi-search endpoint: + except (RequestMalformed, httpx.InvalidURL) as err: + # Two failure modes indicate the filter/query string is too long to send as a GET request: + # - httpx.InvalidURL is raised client-side (before the request is sent) when the assembled + # URL query string exceeds httpx's internal limit. + # - RequestMalformed with "Query string exceeds max allowed length" is returned by the + # Typesense server when the query string passes the client but is too long server-side. + # In both cases we fall back to the multi-search endpoint, which uses POST and has no + # URL length constraint. + is_url_too_long = ( + isinstance(err, httpx.InvalidURL) and "too long" in str(err).lower() + ) or ( + isinstance(err, RequestMalformed) + and "Query string exceeds max allowed length" in str(err) + ) + if is_url_too_long: client = get_typesense_client() multi_response = client.multi_search.perform({"searches": [{ "collection": self.typesense_index_name, @@ -232,8 +249,11 @@ def search( def remove(self, doc_ids, **kwargs): """ - Removing documents from the index is as simple as deleting the the documents - with the corresponding primary key. + Remove documents from the index by primary key. + + Deletes are sent in batches of at most _MAX_IDS_PER_DELETE_BATCH to keep the + generated filter_by query string within URL length limits, since the Typesense + client sends delete parameters as URL query parameters. """ logger.info( "Remove request: index=%s, doc_ids=%s kwargs=%s", @@ -241,10 +261,13 @@ def remove(self, doc_ids, **kwargs): doc_ids, kwargs, ) - if doc_ids: - escaped_ids = [_escape_str(doc_id) for doc_id in doc_ids] + if not doc_ids: + return + escaped_ids = [_escape_str(doc_id) for doc_id in doc_ids] + for batch_start in range(0, len(escaped_ids), _MAX_IDS_PER_DELETE_BATCH): + batch = escaped_ids[batch_start:batch_start + _MAX_IDS_PER_DELETE_BATCH] self.typesense_index.documents.delete({ - 'filter_by': f'id: [{",".join(escaped_ids)}]' + 'filter_by': f'id: [{",".join(batch)}]' }) @@ -360,9 +383,10 @@ def get_search_params( filters += get_filter_rules(field_dictionary) if filter_dictionary: filters += get_filter_rules(filter_dictionary, optional=True) - for key, values in exclude_dictionary.items(): - # Ignore multiple values for this field, e.g. {"id": ["ignorethis1", "ignorethis2"]} - filters += [f"{key}:!=[{', '.join(_escape_str(value) for value in values)}]"] + if exclude_dictionary: + for key, values in exclude_dictionary.items(): + # Ignore multiple values for this field, e.g. {"id": ["ignorethis1", "ignorethis2"]} + filters += [f"{key}:!=[{', '.join(_escape_str(value) for value in values)}]"] if filters: params["filter_by"] = " && ".join(filters) From e4aa33e0dbe628d4647d91834643b65e9044adac Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Fri, 17 Apr 2026 13:45:33 -0400 Subject: [PATCH 2/2] fix: apply Typesense filtering and data-type best practices Three improvements based on a review of the Typesense filtering and data-types documentation: 1. Use the documented '![]' (is-not-any-of) filter operator instead of '!=[...]'. The operators table in https://typesense.org/docs/guide/tips-for-filtering.html#available-operators documents '![]' as the canonical multi-value exclusion operator; '!=[...]' combines the single-value inequality operator with an array argument and is not an officially documented form. 2. Add range_index: True to all datetime fields (start, end, enrollment_start, enrollment_end, start_date). The schema-parameters reference documents this flag as enabling an optimised index for range-based numerical filtering, which is exactly how date fields are queried (e.g. 'show only active courses'). 3. Strip HTML from content sub-fields before indexing. The Typesense data-types guide explicitly recommends storing plain text rather than raw HTML markup so that tags are not indexed as searchable tokens, wasting in-memory index space and polluting search results. A new _strip_html / _strip_html_from_content helper pair is applied inside process_document() only to the nested 'content' object, since other fields (course keys, org names, etc.) are clean identifiers that should not be modified. Tests added for all three changes in search/tests/test_typesense.py. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- search/tests/test_typesense.py | 90 ++++++++++++++++++++++++++++++++-- search/typesense.py | 59 +++++++++++++++++++--- 2 files changed, 139 insertions(+), 10 deletions(-) diff --git a/search/tests/test_typesense.py b/search/tests/test_typesense.py index a20fe66a..9720e5fb 100644 --- a/search/tests/test_typesense.py +++ b/search/tests/test_typesense.py @@ -12,6 +12,8 @@ from search.typesense import ( TypesenseEngine, _MAX_IDS_PER_DELETE_BATCH, + _strip_html, + _strip_html_from_content, get_search_params, ) @@ -52,11 +54,11 @@ def test_empty_exclude_dictionary_produces_no_filter(self): def test_exclude_dictionary_single_field(self): params = get_search_params(exclude_dictionary={"status": ["deleted"]}) - assert params["filter_by"] == "status:!=[`deleted`]" + assert params["filter_by"] == "status:![`deleted`]" def test_exclude_dictionary_multiple_ids(self): params = get_search_params(exclude_dictionary={"id": ["id1", "id2", "id3"]}) - assert params["filter_by"] == "id:!=[`id1`, `id2`, `id3`]" + assert params["filter_by"] == "id:![`id1`, `id2`, `id3`]" def test_field_dictionary_and_exclude_dictionary_combined(self): params = get_search_params( @@ -64,7 +66,7 @@ def test_field_dictionary_and_exclude_dictionary_combined(self): exclude_dictionary={"id": ["stale-id"]}, ) assert "course:=`course-v1:org+x+run`" in params["filter_by"] - assert "id:!=[`stale-id`]" in params["filter_by"] + assert "id:![`stale-id`]" in params["filter_by"] def test_pagination_params(self): params = get_search_params(from_=50, size=25) @@ -229,3 +231,85 @@ def test_remove_each_batch_contains_correct_ids(self): assert first_filter.count("`id-") == _MAX_IDS_PER_DELETE_BATCH # Second batch should contain the remaining 2 assert second_filter.count("`id-") == 2 + + +class StripHtmlTests(django.test.TestCase): + """Tests for the _strip_html and _strip_html_from_content helpers.""" + + def test_strip_html_removes_tags(self): + assert _strip_html("

Hello world

") == "Hello world" + + def test_strip_html_unescapes_entities(self): + assert _strip_html("<b>bold</b> & more") == "bold & more" + + def test_strip_html_normalises_whitespace(self): + assert _strip_html(" foo
bar ") == "foo bar" + + def test_strip_html_plain_text_unchanged(self): + text = "no html here" + assert _strip_html(text) == text + + def test_strip_html_empty_string(self): + assert _strip_html("") == "" + + def test_strip_html_from_content_dict(self): + content = { + "display_name": "My Problem", + "body": "

Solve for x where x > 0.

", + } + result = _strip_html_from_content(content) + assert result == { + "display_name": "My Problem", + "body": "Solve for x where x > 0.", + } + + def test_strip_html_from_content_nested(self): + content = {"section": {"title": "

Unit 1

", "items": ["
  • a
  • ", "
  • b
  • "]}} + result = _strip_html_from_content(content) + assert result == {"section": {"title": "Unit 1", "items": ["a", "b"]}} + + def test_strip_html_from_content_non_string_unchanged(self): + content = {"count": 42, "active": True} + assert _strip_html_from_content(content) == {"count": 42, "active": True} + + +@override_settings(**TYPESENSE_SETTINGS) +class ProcessDocumentTests(django.test.TestCase): + """Tests for TypesenseEngine.process_document().""" + + def _make_engine(self): + engine = TypesenseEngine.__new__(TypesenseEngine) + engine.index_name = "courseware_content" + return engine + + def test_html_stripped_from_content_field(self): + """HTML in content sub-fields is stripped before the document is indexed.""" + engine = self._make_engine() + doc = { + "id": "block-v1:org+x+run+type@html+block@abc", + "content": { + "display_name": "Intro", + "body": "

    Welcome to this course & enjoy!

    ", + }, + } + result = engine.process_document(doc) + assert result["content"]["display_name"] == "Intro" + assert result["content"]["body"] == "Welcome to this course & enjoy!" + + def test_non_content_fields_not_stripped(self): + """Fields outside 'content' are not modified by HTML stripping.""" + engine = self._make_engine() + doc = { + "id": "block-v1:org+x+run+type@html+block@abc", + "display_name": "should not be stripped", + "content": {"body": "plain"}, + } + result = engine.process_document(doc) + assert result["display_name"] == "should not be stripped" + + def test_missing_content_field_handled(self): + """Documents without a 'content' field are processed without error.""" + engine = self._make_engine() + doc = {"id": "block-v1:org+x+run+type@problem+block@xyz"} + result = engine.process_document(doc) + assert result["id"] == "block-v1:org+x+run+type@problem+block@xyz" diff --git a/search/typesense.py b/search/typesense.py index b09d1a5f..ad5880a7 100644 --- a/search/typesense.py +++ b/search/typesense.py @@ -33,7 +33,9 @@ For more information about the Typesense API in Python, check https://github.com/typesense/typesense-python """ +import html as html_module import logging +import re import typing as t from django.conf import settings @@ -69,10 +71,10 @@ COURSE_INFO_INDEX: { # Most fields will be auto-created but some (datetimes, facet fields) need to be specified up front: "field_overrides": [ - {"name": "start", "type": "datetime", "optional": True}, - {"name": "end", "type": "datetime", "optional": True}, - {"name": "enrollment_start", "type": "datetime", "optional": True}, - {"name": "enrollment_end", "type": "datetime", "optional": True}, + {"name": "start", "type": "datetime", "optional": True, "range_index": True}, + {"name": "end", "type": "datetime", "optional": True, "range_index": True}, + {"name": "enrollment_start", "type": "datetime", "optional": True, "range_index": True}, + {"name": "enrollment_end", "type": "datetime", "optional": True, "range_index": True}, {"name": "org", "type": "string", "facet": True}, {"name": "modes", "type": "string[]", "facet": True}, {"name": "language", "type": "string", "facet": True, "optional": True}, @@ -88,7 +90,7 @@ COURSE_CONTENT_INDEX: { # Most fields will be auto-created but some (datetimes, facet fields) need to be specified up front: "field_overrides": [ - {"name": "start_date", "type": "datetime", "optional": True}, + {"name": "start_date", "type": "datetime", "optional": True, "range_index": True}, # Enable stemming for the "content" field, so that e.g. # searching for "run" will match "running", "runs", "ran". # Unfortunately, this could break indexing if non-string fields get @@ -175,6 +177,15 @@ def process_document(self, doc: dict[str, t.Any]) -> dict[str, t.Any]: for field_config in index_config.get("field_overrides", []): if field_config.get("optional", False): doc_with_nulls.setdefault(field_config["name"]) + # Strip HTML tags from the nested 'content' field. XBlock index_dictionary() + # methods may return raw HTML markup; indexing tags as tokens wastes memory + # and degrades search quality. + # See: https://typesense.org/docs/guide/tips-for-searching-common-types-of-data.html#html-content + if "content" in doc_with_nulls and isinstance(doc_with_nulls["content"], dict): + doc_with_nulls = { + **doc_with_nulls, + "content": _strip_html_from_content(doc_with_nulls["content"]), + } # Convert field types recursively, e.g. datetime->int64, NULL -> separate field: processed = convert_doc_datatypes(doc_with_nulls, record_nulls=True) return processed @@ -354,6 +365,39 @@ def _escape_str(value: str): return f'`{value.replace("`", "")}`' +_HTML_TAG_RE = re.compile(r"<[^>]+>") + + +def _strip_html(text: str) -> str: + """ + Remove HTML tags from *text* and unescape HTML entities. + + Per the Typesense data-types guide, HTML content should be stripped to + plain text before indexing so that tags are not treated as searchable + tokens and do not bloat the in-memory index. + See: https://typesense.org/docs/guide/tips-for-searching-common-types-of-data.html#html-content + """ + text = _HTML_TAG_RE.sub(" ", text) + text = html_module.unescape(text) + return " ".join(text.split()) # normalise whitespace + + +def _strip_html_from_content(value: t.Any) -> t.Any: + """ + Recursively walk *value* and strip HTML from any string leaves. + + Intended for use only on the nested ``content`` field, where XBlock + ``index_dictionary()`` implementations may return raw HTML markup. + """ + if isinstance(value, str): + return _strip_html(value) + if isinstance(value, dict): + return {k: _strip_html_from_content(v) for k, v in value.items()} + if isinstance(value, list): + return [_strip_html_from_content(item) for item in value] + return value + + def get_search_params( field_dictionary=None, filter_dictionary=None, @@ -385,8 +429,9 @@ def get_search_params( filters += get_filter_rules(filter_dictionary, optional=True) if exclude_dictionary: for key, values in exclude_dictionary.items(): - # Ignore multiple values for this field, e.g. {"id": ["ignorethis1", "ignorethis2"]} - filters += [f"{key}:!=[{', '.join(_escape_str(value) for value in values)}]"] + # Use the documented "is not any of" operator: field:![val1, val2, ...] + # (not `!=[...]`, which is the single-value inequality operator with an array arg) + filters += [f"{key}:![{', '.join(_escape_str(value) for value in values)}]"] if filters: params["filter_by"] = " && ".join(filters)