From 2018784ad9dad9f7a875fa8882bfdded3f417528 Mon Sep 17 00:00:00 2001
From: Tobias Macey <tmacey@boundlessnotions.com>
Date: Fri, 17 Apr 2026 13:37:33 -0400
Subject: [PATCH 1/2] fix: handle URL length limits in Typesense engine for
 large courses

When indexing a large course with many content blocks, the Typesense
search engine could hit two related URL-length failures:

1. search(): Building an exclusion filter like
   'id:!=[id1, id2, ...]' for hundreds of item IDs produced a URL
   query string that exceeded httpx's internal limit, raising:
     httpx.InvalidURL: URL component 'query' too long
   An existing fallback for RequestMalformed ('Query string exceeds
   max allowed length') did not cover this client-side failure.
   Fix: also catch httpx.InvalidURL (narrowed to 'too long' message)
   and route both cases to the multi-search POST endpoint, which has
   no URL length constraint.

2. remove(): Deleting many stale documents built a single
   'id: [id1,id2,...]' filter sent as a DELETE query parameter, which
   is equally subject to URL length limits.
   Fix: chunk deletions into batches of _MAX_IDS_PER_DELETE_BATCH
   (100) IDs per request.

Also fix a latent bug in get_search_params(): the exclude_dictionary
loop iterated unconditionally, raising AttributeError when the caller
passed exclude_dictionary=None (the documented default).

Tests added for all three fixes in search/tests/test_typesense.py.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 search/tests/test_typesense.py | 231 +++++++++++++++++++++++++++++++++
 search/typesense.py            |  46 +++++--
 2 files changed, 266 insertions(+), 11 deletions(-)
 create mode 100644 search/tests/test_typesense.py

diff --git a/search/tests/test_typesense.py b/search/tests/test_typesense.py
new file mode 100644
index 00000000..a20fe66a
--- /dev/null
+++ b/search/tests/test_typesense.py
@@ -0,0 +1,231 @@
+"""
+Tests for the Typesense search engine.
+"""
+from unittest.mock import MagicMock, patch
+
+import django.test
+import httpx
+import pytest
+from django.test import override_settings
+from typesense.exceptions import RequestMalformed
+
+from search.typesense import (
+    TypesenseEngine,
+    _MAX_IDS_PER_DELETE_BATCH,
+    get_search_params,
+)
+
+TYPESENSE_SETTINGS = {
+    "TYPESENSE_API_KEY": "test-api-key",
+    "TYPESENSE_URLS": [{"host": "localhost", "port": "8108", "protocol": "http"}],
+    "TYPESENSE_COLLECTION_PREFIX": "test_",
+}
+
+
+# A minimal fake Typesense search response (as returned by the Python client).
+def _make_typesense_response(hits=None, found=None):
+    hits = hits or []
+    return {
+        "hits": hits,
+        "facet_counts": [],
+        "found": found if found is not None else len(hits),
+        "out_of": len(hits),
+        "page": 1,
+        "request_params": {"collection_name": "test_collection", "per_page": 10, "q": ""},
+        "search_cutoff": False,
+        "search_time_ms": 1,
+    }
+
+
+@override_settings(**TYPESENSE_SETTINGS)
+class GetSearchParamsTests(django.test.TestCase):
+    """Unit tests for get_search_params()."""
+
+    def test_none_exclude_dictionary_does_not_raise(self):
+        """Passing exclude_dictionary=None should not raise AttributeError."""
+        params = get_search_params(exclude_dictionary=None)
+        assert "filter_by" not in params
+
+    def test_empty_exclude_dictionary_produces_no_filter(self):
+        params = get_search_params(exclude_dictionary={})
+        assert "filter_by" not in params
+
+    def test_exclude_dictionary_single_field(self):
+        params = get_search_params(exclude_dictionary={"status": ["deleted"]})
+        assert params["filter_by"] == "status:!=[`deleted`]"
+
+    def test_exclude_dictionary_multiple_ids(self):
+        params = get_search_params(exclude_dictionary={"id": ["id1", "id2", "id3"]})
+        assert params["filter_by"] == "id:!=[`id1`, `id2`, `id3`]"
+
+    def test_field_dictionary_and_exclude_dictionary_combined(self):
+        params = get_search_params(
+            field_dictionary={"course": "course-v1:org+x+run"},
+            exclude_dictionary={"id": ["stale-id"]},
+        )
+        assert "course:=`course-v1:org+x+run`" in params["filter_by"]
+        assert "id:!=[`stale-id`]" in params["filter_by"]
+
+    def test_pagination_params(self):
+        params = get_search_params(from_=50, size=25)
+        assert params["offset"] == 50
+        assert params["limit"] == 25
+
+
+@override_settings(**TYPESENSE_SETTINGS)
+class TypesenseEngineSearchTests(django.test.TestCase):
+    """Tests for TypesenseEngine.search() fallback behaviour."""
+
+    def _make_engine(self, mock_index):
+        """Return a TypesenseEngine whose typesense_index is the given mock."""
+        engine = TypesenseEngine.__new__(TypesenseEngine)
+        engine.index_name = "courseware_content"
+        engine._typesense_index = mock_index  # pylint: disable=protected-access
+        return engine
+
+    def test_search_success(self):
+        """Normal search path returns processed results."""
+        mock_index = MagicMock()
+        mock_index.documents.search.return_value = _make_typesense_response(
+            hits=[{"document": {"id": "block-1"}, "highlights": [], "text_match": 100}],
+            found=1,
+        )
+        engine = self._make_engine(mock_index)
+        result = engine.search(field_dictionary={"course": "course-v1:org+x+run"})
+        assert result["total"] == 1
+        assert result["results"][0]["data"]["id"] == "block-1"
+
+    def test_search_falls_back_on_httpx_invalid_url(self):
+        """
+        When the Typesense client raises httpx.InvalidURL ('query too long'),
+        search() must fall back to the multi-search POST endpoint.
+        """
+        mock_index = MagicMock()
+        mock_index.documents.search.side_effect = httpx.InvalidURL(
+            "URL component 'query' too long"
+        )
+
+        expected_response = _make_typesense_response(found=0)
+        mock_client = MagicMock()
+        mock_client.multi_search.perform.return_value = {"results": [expected_response]}
+
+        engine = self._make_engine(mock_index)
+        engine._typesense_index = mock_index  # pylint: disable=protected-access
+
+        with patch("search.typesense.get_typesense_client", return_value=mock_client):
+            result = engine.search(field_dictionary={"course": "course-v1:org+x+run"})
+
+        assert mock_client.multi_search.perform.called
+        assert result["total"] == 0
+
+    def test_search_falls_back_on_request_malformed_too_long(self):
+        """
+        Existing behaviour: RequestMalformed('Query string exceeds max allowed length')
+        also routes to multi-search.
+        """
+        mock_index = MagicMock()
+        mock_index.documents.search.side_effect = RequestMalformed(
+            "Query string exceeds max allowed length of 4096"
+        )
+
+        expected_response = _make_typesense_response(found=0)
+        mock_client = MagicMock()
+        mock_client.multi_search.perform.return_value = {"results": [expected_response]}
+
+        engine = self._make_engine(mock_index)
+
+        with patch("search.typesense.get_typesense_client", return_value=mock_client):
+            result = engine.search(field_dictionary={"course": "course-v1:org+x+run"})
+
+        assert mock_client.multi_search.perform.called
+        assert result["total"] == 0
+
+    def test_search_does_not_swallow_unrelated_httpx_invalid_url(self):
+        """
+        An httpx.InvalidURL that is NOT about length should be re-raised, not
+        silently swallowed.
+        """
+        mock_index = MagicMock()
+        mock_index.documents.search.side_effect = httpx.InvalidURL(
+            "URL component 'host' is invalid"
+        )
+
+        engine = self._make_engine(mock_index)
+        with pytest.raises(httpx.InvalidURL):
+            engine.search(field_dictionary={"course": "course-v1:org+x+run"})
+
+    def test_search_does_not_swallow_unrelated_request_malformed(self):
+        """
+        A RequestMalformed that is not a length error should be re-raised.
+        """
+        mock_index = MagicMock()
+        mock_index.documents.search.side_effect = RequestMalformed(
+            "Some other malformed request error"
+        )
+
+        engine = self._make_engine(mock_index)
+        with pytest.raises(RequestMalformed):
+            engine.search(field_dictionary={"course": "course-v1:org+x+run"})
+
+
+@override_settings(**TYPESENSE_SETTINGS)
+class TypesenseEngineRemoveTests(django.test.TestCase):
+    """Tests for TypesenseEngine.remove() chunking."""
+
+    def _make_engine(self, mock_index):
+        engine = TypesenseEngine.__new__(TypesenseEngine)
+        engine.index_name = "courseware_content"
+        engine._typesense_index = mock_index  # pylint: disable=protected-access
+        return engine
+
+    def test_remove_empty_list_does_nothing(self):
+        mock_index = MagicMock()
+        engine = self._make_engine(mock_index)
+        engine.remove([])
+        mock_index.documents.delete.assert_not_called()
+
+    def test_remove_small_batch_single_request(self):
+        """IDs within the batch limit are deleted in one request."""
+        mock_index = MagicMock()
+        engine = self._make_engine(mock_index)
+        ids = [f"block-{i}" for i in range(10)]
+        engine.remove(ids)
+        assert mock_index.documents.delete.call_count == 1
+
+    def test_remove_large_batch_chunks_requests(self):
+        """
+        More IDs than _MAX_IDS_PER_DELETE_BATCH must be split across multiple
+        delete requests so no single request has a URL query string that is too long.
+        """
+        mock_index = MagicMock()
+        engine = self._make_engine(mock_index)
+        total_ids = _MAX_IDS_PER_DELETE_BATCH * 3 + 1  # forces 4 batches
+        ids = [f"block-{i}" for i in range(total_ids)]
+        engine.remove(ids)
+        assert mock_index.documents.delete.call_count == 4
+
+    def test_remove_exact_batch_size(self):
+        """Exactly _MAX_IDS_PER_DELETE_BATCH IDs fit in one request."""
+        mock_index = MagicMock()
+        engine = self._make_engine(mock_index)
+        ids = [f"block-{i}" for i in range(_MAX_IDS_PER_DELETE_BATCH)]
+        engine.remove(ids)
+        assert mock_index.documents.delete.call_count == 1
+
+    def test_remove_each_batch_contains_correct_ids(self):
+        """Each batch's filter_by contains only its own slice of IDs."""
+        mock_index = MagicMock()
+        engine = self._make_engine(mock_index)
+        ids = [f"id-{i}" for i in range(_MAX_IDS_PER_DELETE_BATCH + 2)]
+        engine.remove(ids)
+
+        calls = mock_index.documents.delete.call_args_list
+        assert len(calls) == 2
+
+        first_filter = calls[0][0][0]["filter_by"]
+        second_filter = calls[1][0][0]["filter_by"]
+
+        # First batch should contain exactly _MAX_IDS_PER_DELETE_BATCH entries
+        assert first_filter.count("`id-") == _MAX_IDS_PER_DELETE_BATCH
+        # Second batch should contain the remaining 2
+        assert second_filter.count("`id-") == 2
diff --git a/search/typesense.py b/search/typesense.py
index 7ee27461..b09d1a5f 100644
--- a/search/typesense.py
+++ b/search/typesense.py
@@ -38,6 +38,7 @@
 
 from django.conf import settings
 from django.utils import timezone
+import httpx
 import typesense
 from typesense.sync.collection import Collection  # moved in typesense-python 2.0
 from typesense.exceptions import ObjectNotFound, RequestMalformed, ServerError
@@ -105,6 +106,10 @@
 
 logger = logging.getLogger(__name__)
 
+# Maximum number of document IDs to include in a single filter-based delete request.
+# Larger batches produce a URL query string that exceeds httpx's length limit.
+_MAX_IDS_PER_DELETE_BATCH = 100
+
 
 class TypesenseEngine(SearchEngine):
     """
@@ -208,9 +213,21 @@ def search(
         search_args = {"q": query_string, "query_by": query_by, **compiled_params}
         try:
             results = self.typesense_index.documents.search(search_args)
-        except RequestMalformed as err:
-            if "Query string exceeds max allowed length" in str(err):
-                # To do large queries (with complex filters), we need to use the multi-search endpoint:
+        except (RequestMalformed, httpx.InvalidURL) as err:
+            # Two failure modes indicate the filter/query string is too long to send as a GET request:
+            # - httpx.InvalidURL is raised client-side (before the request is sent) when the assembled
+            #   URL query string exceeds httpx's internal limit.
+            # - RequestMalformed with "Query string exceeds max allowed length" is returned by the
+            #   Typesense server when the query string passes the client but is too long server-side.
+            # In both cases we fall back to the multi-search endpoint, which uses POST and has no
+            # URL length constraint.
+            is_url_too_long = (
+                isinstance(err, httpx.InvalidURL) and "too long" in str(err).lower()
+            ) or (
+                isinstance(err, RequestMalformed)
+                and "Query string exceeds max allowed length" in str(err)
+            )
+            if is_url_too_long:
                 client = get_typesense_client()
                 multi_response = client.multi_search.perform({"searches": [{
                     "collection": self.typesense_index_name,
@@ -232,8 +249,11 @@ def search(
 
     def remove(self, doc_ids, **kwargs):
         """
-        Removing documents from the index is as simple as deleting the the documents
-        with the corresponding primary key.
+        Remove documents from the index by primary key.
+
+        Deletes are sent in batches of at most _MAX_IDS_PER_DELETE_BATCH to keep the
+        generated filter_by query string within URL length limits, since the Typesense
+        client sends delete parameters as URL query parameters.
         """
         logger.info(
             "Remove request: index=%s, doc_ids=%s kwargs=%s",
@@ -241,10 +261,13 @@ def remove(self, doc_ids, **kwargs):
             doc_ids,
             kwargs,
         )
-        if doc_ids:
-            escaped_ids = [_escape_str(doc_id) for doc_id in doc_ids]
+        if not doc_ids:
+            return
+        escaped_ids = [_escape_str(doc_id) for doc_id in doc_ids]
+        for batch_start in range(0, len(escaped_ids), _MAX_IDS_PER_DELETE_BATCH):
+            batch = escaped_ids[batch_start:batch_start + _MAX_IDS_PER_DELETE_BATCH]
             self.typesense_index.documents.delete({
-                'filter_by': f'id: [{",".join(escaped_ids)}]'
+                'filter_by': f'id: [{",".join(batch)}]'
             })
 
 
@@ -360,9 +383,10 @@ def get_search_params(
         filters += get_filter_rules(field_dictionary)
     if filter_dictionary:
         filters += get_filter_rules(filter_dictionary, optional=True)
-    for key, values in exclude_dictionary.items():
-        # Ignore multiple values for this field, e.g. {"id": ["ignorethis1", "ignorethis2"]}
-        filters += [f"{key}:!=[{', '.join(_escape_str(value) for value in values)}]"]
+    if exclude_dictionary:
+        for key, values in exclude_dictionary.items():
+            # Ignore multiple values for this field, e.g. {"id": ["ignorethis1", "ignorethis2"]}
+            filters += [f"{key}:!=[{', '.join(_escape_str(value) for value in values)}]"]
     if filters:
         params["filter_by"] = " && ".join(filters)
 

From e4aa33e0dbe628d4647d91834643b65e9044adac Mon Sep 17 00:00:00 2001
From: Tobias Macey <tmacey@boundlessnotions.com>
Date: Fri, 17 Apr 2026 13:45:33 -0400
Subject: [PATCH 2/2] fix: apply Typesense filtering and data-type best
 practices

Three improvements based on a review of the Typesense filtering and
data-types documentation:

1. Use the documented '![]' (is-not-any-of) filter operator instead of
   '!=[...]'. The operators table in
   https://typesense.org/docs/guide/tips-for-filtering.html#available-operators
   documents '![]' as the canonical multi-value exclusion operator;
   '!=[...]' combines the single-value inequality operator with an array
   argument and is not an officially documented form.

2. Add range_index: True to all datetime fields (start, end,
   enrollment_start, enrollment_end, start_date). The schema-parameters
   reference documents this flag as enabling an optimised index for
   range-based numerical filtering, which is exactly how date fields are
   queried (e.g. 'show only active courses').

3. Strip HTML from content sub-fields before indexing. The Typesense
   data-types guide explicitly recommends storing plain text rather than
   raw HTML markup so that tags are not indexed as searchable tokens,
   wasting in-memory index space and polluting search results.
   A new _strip_html / _strip_html_from_content helper pair is applied
   inside process_document() only to the nested 'content' object, since
   other fields (course keys, org names, etc.) are clean identifiers
   that should not be modified.

Tests added for all three changes in search/tests/test_typesense.py.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 search/tests/test_typesense.py | 90 ++++++++++++++++++++++++++++++++--
 search/typesense.py            | 59 +++++++++++++++++++---
 2 files changed, 139 insertions(+), 10 deletions(-)

diff --git a/search/tests/test_typesense.py b/search/tests/test_typesense.py
index a20fe66a..9720e5fb 100644
--- a/search/tests/test_typesense.py
+++ b/search/tests/test_typesense.py
@@ -12,6 +12,8 @@
 from search.typesense import (
     TypesenseEngine,
     _MAX_IDS_PER_DELETE_BATCH,
+    _strip_html,
+    _strip_html_from_content,
     get_search_params,
 )
 
@@ -52,11 +54,11 @@ def test_empty_exclude_dictionary_produces_no_filter(self):
 
     def test_exclude_dictionary_single_field(self):
         params = get_search_params(exclude_dictionary={"status": ["deleted"]})
-        assert params["filter_by"] == "status:!=[`deleted`]"
+        assert params["filter_by"] == "status:![`deleted`]"
 
     def test_exclude_dictionary_multiple_ids(self):
         params = get_search_params(exclude_dictionary={"id": ["id1", "id2", "id3"]})
-        assert params["filter_by"] == "id:!=[`id1`, `id2`, `id3`]"
+        assert params["filter_by"] == "id:![`id1`, `id2`, `id3`]"
 
     def test_field_dictionary_and_exclude_dictionary_combined(self):
         params = get_search_params(
@@ -64,7 +66,7 @@ def test_field_dictionary_and_exclude_dictionary_combined(self):
             exclude_dictionary={"id": ["stale-id"]},
         )
         assert "course:=`course-v1:org+x+run`" in params["filter_by"]
-        assert "id:!=[`stale-id`]" in params["filter_by"]
+        assert "id:![`stale-id`]" in params["filter_by"]
 
     def test_pagination_params(self):
         params = get_search_params(from_=50, size=25)
@@ -229,3 +231,85 @@ def test_remove_each_batch_contains_correct_ids(self):
         assert first_filter.count("`id-") == _MAX_IDS_PER_DELETE_BATCH
         # Second batch should contain the remaining 2
         assert second_filter.count("`id-") == 2
+
+
+class StripHtmlTests(django.test.TestCase):
+    """Tests for the _strip_html and _strip_html_from_content helpers."""
+
+    def test_strip_html_removes_tags(self):
+        assert _strip_html("<p>Hello <strong>world</strong></p>") == "Hello world"
+
+    def test_strip_html_unescapes_entities(self):
+        assert _strip_html("&lt;b&gt;bold&lt;/b&gt; &amp; more") == "<b>bold</b> & more"
+
+    def test_strip_html_normalises_whitespace(self):
+        assert _strip_html("  foo   <br/>  bar  ") == "foo bar"
+
+    def test_strip_html_plain_text_unchanged(self):
+        text = "no html here"
+        assert _strip_html(text) == text
+
+    def test_strip_html_empty_string(self):
+        assert _strip_html("") == ""
+
+    def test_strip_html_from_content_dict(self):
+        content = {
+            "display_name": "<b>My Problem</b>",
+            "body": "<p>Solve for <em>x</em> where x &gt; 0.</p>",
+        }
+        result = _strip_html_from_content(content)
+        assert result == {
+            "display_name": "My Problem",
+            "body": "Solve for x where x > 0.",
+        }
+
+    def test_strip_html_from_content_nested(self):
+        content = {"section": {"title": "<h2>Unit 1</h2>", "items": ["<li>a</li>", "<li>b</li>"]}}
+        result = _strip_html_from_content(content)
+        assert result == {"section": {"title": "Unit 1", "items": ["a", "b"]}}
+
+    def test_strip_html_from_content_non_string_unchanged(self):
+        content = {"count": 42, "active": True}
+        assert _strip_html_from_content(content) == {"count": 42, "active": True}
+
+
+@override_settings(**TYPESENSE_SETTINGS)
+class ProcessDocumentTests(django.test.TestCase):
+    """Tests for TypesenseEngine.process_document()."""
+
+    def _make_engine(self):
+        engine = TypesenseEngine.__new__(TypesenseEngine)
+        engine.index_name = "courseware_content"
+        return engine
+
+    def test_html_stripped_from_content_field(self):
+        """HTML in content sub-fields is stripped before the document is indexed."""
+        engine = self._make_engine()
+        doc = {
+            "id": "block-v1:org+x+run+type@html+block@abc",
+            "content": {
+                "display_name": "<b>Intro</b>",
+                "body": "<p>Welcome to <em>this</em> course &amp; enjoy!</p>",
+            },
+        }
+        result = engine.process_document(doc)
+        assert result["content"]["display_name"] == "Intro"
+        assert result["content"]["body"] == "Welcome to this course & enjoy!"
+
+    def test_non_content_fields_not_stripped(self):
+        """Fields outside 'content' are not modified by HTML stripping."""
+        engine = self._make_engine()
+        doc = {
+            "id": "block-v1:org+x+run+type@html+block@abc",
+            "display_name": "<b>should not be stripped</b>",
+            "content": {"body": "plain"},
+        }
+        result = engine.process_document(doc)
+        assert result["display_name"] == "<b>should not be stripped</b>"
+
+    def test_missing_content_field_handled(self):
+        """Documents without a 'content' field are processed without error."""
+        engine = self._make_engine()
+        doc = {"id": "block-v1:org+x+run+type@problem+block@xyz"}
+        result = engine.process_document(doc)
+        assert result["id"] == "block-v1:org+x+run+type@problem+block@xyz"
diff --git a/search/typesense.py b/search/typesense.py
index b09d1a5f..ad5880a7 100644
--- a/search/typesense.py
+++ b/search/typesense.py
@@ -33,7 +33,9 @@
 For more information about the Typesense API in Python, check
 https://github.com/typesense/typesense-python
 """
+import html as html_module
 import logging
+import re
 import typing as t
 
 from django.conf import settings
@@ -69,10 +71,10 @@
     COURSE_INFO_INDEX: {
         # Most fields will be auto-created but some (datetimes, facet fields) need to be specified up front:
         "field_overrides": [
-            {"name": "start", "type": "datetime", "optional": True},
-            {"name": "end", "type": "datetime", "optional": True},
-            {"name": "enrollment_start", "type": "datetime", "optional": True},
-            {"name": "enrollment_end", "type": "datetime", "optional": True},
+            {"name": "start", "type": "datetime", "optional": True, "range_index": True},
+            {"name": "end", "type": "datetime", "optional": True, "range_index": True},
+            {"name": "enrollment_start", "type": "datetime", "optional": True, "range_index": True},
+            {"name": "enrollment_end", "type": "datetime", "optional": True, "range_index": True},
             {"name": "org", "type": "string", "facet": True},
             {"name": "modes", "type": "string[]", "facet": True},
             {"name": "language", "type": "string", "facet": True, "optional": True},
@@ -88,7 +90,7 @@
     COURSE_CONTENT_INDEX: {
         # Most fields will be auto-created but some (datetimes, facet fields) need to be specified up front:
         "field_overrides": [
-            {"name": "start_date", "type": "datetime", "optional": True},
+            {"name": "start_date", "type": "datetime", "optional": True, "range_index": True},
             # Enable stemming for the "content" field, so that e.g.
             # searching for "run" will match "running", "runs", "ran".
             # Unfortunately, this could break indexing if non-string fields get
@@ -175,6 +177,15 @@ def process_document(self, doc: dict[str, t.Any]) -> dict[str, t.Any]:
         for field_config in index_config.get("field_overrides", []):
             if field_config.get("optional", False):
                 doc_with_nulls.setdefault(field_config["name"])
+        # Strip HTML tags from the nested 'content' field. XBlock index_dictionary()
+        # methods may return raw HTML markup; indexing tags as tokens wastes memory
+        # and degrades search quality.
+        # See: https://typesense.org/docs/guide/tips-for-searching-common-types-of-data.html#html-content
+        if "content" in doc_with_nulls and isinstance(doc_with_nulls["content"], dict):
+            doc_with_nulls = {
+                **doc_with_nulls,
+                "content": _strip_html_from_content(doc_with_nulls["content"]),
+            }
         # Convert field types recursively, e.g. datetime->int64, NULL -> separate field:
         processed = convert_doc_datatypes(doc_with_nulls, record_nulls=True)
         return processed
@@ -354,6 +365,39 @@ def _escape_str(value: str):
     return f'`{value.replace("`", "")}`'
 
 
+_HTML_TAG_RE = re.compile(r"<[^>]+>")
+
+
+def _strip_html(text: str) -> str:
+    """
+    Remove HTML tags from *text* and unescape HTML entities.
+
+    Per the Typesense data-types guide, HTML content should be stripped to
+    plain text before indexing so that tags are not treated as searchable
+    tokens and do not bloat the in-memory index.
+    See: https://typesense.org/docs/guide/tips-for-searching-common-types-of-data.html#html-content
+    """
+    text = _HTML_TAG_RE.sub(" ", text)
+    text = html_module.unescape(text)
+    return " ".join(text.split())  # normalise whitespace
+
+
+def _strip_html_from_content(value: t.Any) -> t.Any:
+    """
+    Recursively walk *value* and strip HTML from any string leaves.
+
+    Intended for use only on the nested ``content`` field, where XBlock
+    ``index_dictionary()`` implementations may return raw HTML markup.
+    """
+    if isinstance(value, str):
+        return _strip_html(value)
+    if isinstance(value, dict):
+        return {k: _strip_html_from_content(v) for k, v in value.items()}
+    if isinstance(value, list):
+        return [_strip_html_from_content(item) for item in value]
+    return value
+
+
 def get_search_params(
     field_dictionary=None,
     filter_dictionary=None,
@@ -385,8 +429,9 @@ def get_search_params(
         filters += get_filter_rules(filter_dictionary, optional=True)
     if exclude_dictionary:
         for key, values in exclude_dictionary.items():
-            # Ignore multiple values for this field, e.g. {"id": ["ignorethis1", "ignorethis2"]}
-            filters += [f"{key}:!=[{', '.join(_escape_str(value) for value in values)}]"]
+            # Use the documented "is not any of" operator: field:![val1, val2, ...]
+            # (not `!=[...]`, which is the single-value inequality operator with an array arg)
+            filters += [f"{key}:![{', '.join(_escape_str(value) for value in values)}]"]
     if filters:
         params["filter_by"] = " && ".join(filters)