diff --git a/search/tests/test_typesense.py b/search/tests/test_typesense.py new file mode 100644 index 00000000..9720e5fb --- /dev/null +++ b/search/tests/test_typesense.py @@ -0,0 +1,315 @@ +""" +Tests for the Typesense search engine. +""" +from unittest.mock import MagicMock, patch + +import django.test +import httpx +import pytest +from django.test import override_settings +from typesense.exceptions import RequestMalformed + +from search.typesense import ( + TypesenseEngine, + _MAX_IDS_PER_DELETE_BATCH, + _strip_html, + _strip_html_from_content, + get_search_params, +) + +TYPESENSE_SETTINGS = { + "TYPESENSE_API_KEY": "test-api-key", + "TYPESENSE_URLS": [{"host": "localhost", "port": "8108", "protocol": "http"}], + "TYPESENSE_COLLECTION_PREFIX": "test_", +} + + +# A minimal fake Typesense search response (as returned by the Python client). +def _make_typesense_response(hits=None, found=None): + hits = hits or [] + return { + "hits": hits, + "facet_counts": [], + "found": found if found is not None else len(hits), + "out_of": len(hits), + "page": 1, + "request_params": {"collection_name": "test_collection", "per_page": 10, "q": ""}, + "search_cutoff": False, + "search_time_ms": 1, + } + + +@override_settings(**TYPESENSE_SETTINGS) +class GetSearchParamsTests(django.test.TestCase): + """Unit tests for get_search_params().""" + + def test_none_exclude_dictionary_does_not_raise(self): + """Passing exclude_dictionary=None should not raise AttributeError.""" + params = get_search_params(exclude_dictionary=None) + assert "filter_by" not in params + + def test_empty_exclude_dictionary_produces_no_filter(self): + params = get_search_params(exclude_dictionary={}) + assert "filter_by" not in params + + def test_exclude_dictionary_single_field(self): + params = get_search_params(exclude_dictionary={"status": ["deleted"]}) + assert params["filter_by"] == "status:![`deleted`]" + + def test_exclude_dictionary_multiple_ids(self): + params = get_search_params(exclude_dictionary={"id": ["id1", "id2", "id3"]}) + assert params["filter_by"] == "id:![`id1`, `id2`, `id3`]" + + def test_field_dictionary_and_exclude_dictionary_combined(self): + params = get_search_params( + field_dictionary={"course": "course-v1:org+x+run"}, + exclude_dictionary={"id": ["stale-id"]}, + ) + assert "course:=`course-v1:org+x+run`" in params["filter_by"] + assert "id:![`stale-id`]" in params["filter_by"] + + def test_pagination_params(self): + params = get_search_params(from_=50, size=25) + assert params["offset"] == 50 + assert params["limit"] == 25 + + +@override_settings(**TYPESENSE_SETTINGS) +class TypesenseEngineSearchTests(django.test.TestCase): + """Tests for TypesenseEngine.search() fallback behaviour.""" + + def _make_engine(self, mock_index): + """Return a TypesenseEngine whose typesense_index is the given mock.""" + engine = TypesenseEngine.__new__(TypesenseEngine) + engine.index_name = "courseware_content" + engine._typesense_index = mock_index # pylint: disable=protected-access + return engine + + def test_search_success(self): + """Normal search path returns processed results.""" + mock_index = MagicMock() + mock_index.documents.search.return_value = _make_typesense_response( + hits=[{"document": {"id": "block-1"}, "highlights": [], "text_match": 100}], + found=1, + ) + engine = self._make_engine(mock_index) + result = engine.search(field_dictionary={"course": "course-v1:org+x+run"}) + assert result["total"] == 1 + assert result["results"][0]["data"]["id"] == "block-1" + + def test_search_falls_back_on_httpx_invalid_url(self): + """ + When the Typesense client raises httpx.InvalidURL ('query too long'), + search() must fall back to the multi-search POST endpoint. + """ + mock_index = MagicMock() + mock_index.documents.search.side_effect = httpx.InvalidURL( + "URL component 'query' too long" + ) + + expected_response = _make_typesense_response(found=0) + mock_client = MagicMock() + mock_client.multi_search.perform.return_value = {"results": [expected_response]} + + engine = self._make_engine(mock_index) + engine._typesense_index = mock_index # pylint: disable=protected-access + + with patch("search.typesense.get_typesense_client", return_value=mock_client): + result = engine.search(field_dictionary={"course": "course-v1:org+x+run"}) + + assert mock_client.multi_search.perform.called + assert result["total"] == 0 + + def test_search_falls_back_on_request_malformed_too_long(self): + """ + Existing behaviour: RequestMalformed('Query string exceeds max allowed length') + also routes to multi-search. + """ + mock_index = MagicMock() + mock_index.documents.search.side_effect = RequestMalformed( + "Query string exceeds max allowed length of 4096" + ) + + expected_response = _make_typesense_response(found=0) + mock_client = MagicMock() + mock_client.multi_search.perform.return_value = {"results": [expected_response]} + + engine = self._make_engine(mock_index) + + with patch("search.typesense.get_typesense_client", return_value=mock_client): + result = engine.search(field_dictionary={"course": "course-v1:org+x+run"}) + + assert mock_client.multi_search.perform.called + assert result["total"] == 0 + + def test_search_does_not_swallow_unrelated_httpx_invalid_url(self): + """ + An httpx.InvalidURL that is NOT about length should be re-raised, not + silently swallowed. + """ + mock_index = MagicMock() + mock_index.documents.search.side_effect = httpx.InvalidURL( + "URL component 'host' is invalid" + ) + + engine = self._make_engine(mock_index) + with pytest.raises(httpx.InvalidURL): + engine.search(field_dictionary={"course": "course-v1:org+x+run"}) + + def test_search_does_not_swallow_unrelated_request_malformed(self): + """ + A RequestMalformed that is not a length error should be re-raised. + """ + mock_index = MagicMock() + mock_index.documents.search.side_effect = RequestMalformed( + "Some other malformed request error" + ) + + engine = self._make_engine(mock_index) + with pytest.raises(RequestMalformed): + engine.search(field_dictionary={"course": "course-v1:org+x+run"}) + + +@override_settings(**TYPESENSE_SETTINGS) +class TypesenseEngineRemoveTests(django.test.TestCase): + """Tests for TypesenseEngine.remove() chunking.""" + + def _make_engine(self, mock_index): + engine = TypesenseEngine.__new__(TypesenseEngine) + engine.index_name = "courseware_content" + engine._typesense_index = mock_index # pylint: disable=protected-access + return engine + + def test_remove_empty_list_does_nothing(self): + mock_index = MagicMock() + engine = self._make_engine(mock_index) + engine.remove([]) + mock_index.documents.delete.assert_not_called() + + def test_remove_small_batch_single_request(self): + """IDs within the batch limit are deleted in one request.""" + mock_index = MagicMock() + engine = self._make_engine(mock_index) + ids = [f"block-{i}" for i in range(10)] + engine.remove(ids) + assert mock_index.documents.delete.call_count == 1 + + def test_remove_large_batch_chunks_requests(self): + """ + More IDs than _MAX_IDS_PER_DELETE_BATCH must be split across multiple + delete requests so no single request has a URL query string that is too long. + """ + mock_index = MagicMock() + engine = self._make_engine(mock_index) + total_ids = _MAX_IDS_PER_DELETE_BATCH * 3 + 1 # forces 4 batches + ids = [f"block-{i}" for i in range(total_ids)] + engine.remove(ids) + assert mock_index.documents.delete.call_count == 4 + + def test_remove_exact_batch_size(self): + """Exactly _MAX_IDS_PER_DELETE_BATCH IDs fit in one request.""" + mock_index = MagicMock() + engine = self._make_engine(mock_index) + ids = [f"block-{i}" for i in range(_MAX_IDS_PER_DELETE_BATCH)] + engine.remove(ids) + assert mock_index.documents.delete.call_count == 1 + + def test_remove_each_batch_contains_correct_ids(self): + """Each batch's filter_by contains only its own slice of IDs.""" + mock_index = MagicMock() + engine = self._make_engine(mock_index) + ids = [f"id-{i}" for i in range(_MAX_IDS_PER_DELETE_BATCH + 2)] + engine.remove(ids) + + calls = mock_index.documents.delete.call_args_list + assert len(calls) == 2 + + first_filter = calls[0][0][0]["filter_by"] + second_filter = calls[1][0][0]["filter_by"] + + # First batch should contain exactly _MAX_IDS_PER_DELETE_BATCH entries + assert first_filter.count("`id-") == _MAX_IDS_PER_DELETE_BATCH + # Second batch should contain the remaining 2 + assert second_filter.count("`id-") == 2 + + +class StripHtmlTests(django.test.TestCase): + """Tests for the _strip_html and _strip_html_from_content helpers.""" + + def test_strip_html_removes_tags(self): + assert _strip_html("

Hello world

") == "Hello world" + + def test_strip_html_unescapes_entities(self): + assert _strip_html("<b>bold</b> & more") == "bold & more" + + def test_strip_html_normalises_whitespace(self): + assert _strip_html(" foo
bar ") == "foo bar" + + def test_strip_html_plain_text_unchanged(self): + text = "no html here" + assert _strip_html(text) == text + + def test_strip_html_empty_string(self): + assert _strip_html("") == "" + + def test_strip_html_from_content_dict(self): + content = { + "display_name": "My Problem", + "body": "

Solve for x where x > 0.

", + } + result = _strip_html_from_content(content) + assert result == { + "display_name": "My Problem", + "body": "Solve for x where x > 0.", + } + + def test_strip_html_from_content_nested(self): + content = {"section": {"title": "

Unit 1

", "items": ["
  • a
  • ", "
  • b
  • "]}} + result = _strip_html_from_content(content) + assert result == {"section": {"title": "Unit 1", "items": ["a", "b"]}} + + def test_strip_html_from_content_non_string_unchanged(self): + content = {"count": 42, "active": True} + assert _strip_html_from_content(content) == {"count": 42, "active": True} + + +@override_settings(**TYPESENSE_SETTINGS) +class ProcessDocumentTests(django.test.TestCase): + """Tests for TypesenseEngine.process_document().""" + + def _make_engine(self): + engine = TypesenseEngine.__new__(TypesenseEngine) + engine.index_name = "courseware_content" + return engine + + def test_html_stripped_from_content_field(self): + """HTML in content sub-fields is stripped before the document is indexed.""" + engine = self._make_engine() + doc = { + "id": "block-v1:org+x+run+type@html+block@abc", + "content": { + "display_name": "Intro", + "body": "

    Welcome to this course & enjoy!

    ", + }, + } + result = engine.process_document(doc) + assert result["content"]["display_name"] == "Intro" + assert result["content"]["body"] == "Welcome to this course & enjoy!" + + def test_non_content_fields_not_stripped(self): + """Fields outside 'content' are not modified by HTML stripping.""" + engine = self._make_engine() + doc = { + "id": "block-v1:org+x+run+type@html+block@abc", + "display_name": "should not be stripped", + "content": {"body": "plain"}, + } + result = engine.process_document(doc) + assert result["display_name"] == "should not be stripped" + + def test_missing_content_field_handled(self): + """Documents without a 'content' field are processed without error.""" + engine = self._make_engine() + doc = {"id": "block-v1:org+x+run+type@problem+block@xyz"} + result = engine.process_document(doc) + assert result["id"] == "block-v1:org+x+run+type@problem+block@xyz" diff --git a/search/typesense.py b/search/typesense.py index 7ee27461..ad5880a7 100644 --- a/search/typesense.py +++ b/search/typesense.py @@ -33,11 +33,14 @@ For more information about the Typesense API in Python, check https://github.com/typesense/typesense-python """ +import html as html_module import logging +import re import typing as t from django.conf import settings from django.utils import timezone +import httpx import typesense from typesense.sync.collection import Collection # moved in typesense-python 2.0 from typesense.exceptions import ObjectNotFound, RequestMalformed, ServerError @@ -68,10 +71,10 @@ COURSE_INFO_INDEX: { # Most fields will be auto-created but some (datetimes, facet fields) need to be specified up front: "field_overrides": [ - {"name": "start", "type": "datetime", "optional": True}, - {"name": "end", "type": "datetime", "optional": True}, - {"name": "enrollment_start", "type": "datetime", "optional": True}, - {"name": "enrollment_end", "type": "datetime", "optional": True}, + {"name": "start", "type": "datetime", "optional": True, "range_index": True}, + {"name": "end", "type": "datetime", "optional": True, "range_index": True}, + {"name": "enrollment_start", "type": "datetime", "optional": True, "range_index": True}, + {"name": "enrollment_end", "type": "datetime", "optional": True, "range_index": True}, {"name": "org", "type": "string", "facet": True}, {"name": "modes", "type": "string[]", "facet": True}, {"name": "language", "type": "string", "facet": True, "optional": True}, @@ -87,7 +90,7 @@ COURSE_CONTENT_INDEX: { # Most fields will be auto-created but some (datetimes, facet fields) need to be specified up front: "field_overrides": [ - {"name": "start_date", "type": "datetime", "optional": True}, + {"name": "start_date", "type": "datetime", "optional": True, "range_index": True}, # Enable stemming for the "content" field, so that e.g. # searching for "run" will match "running", "runs", "ran". # Unfortunately, this could break indexing if non-string fields get @@ -105,6 +108,10 @@ logger = logging.getLogger(__name__) +# Maximum number of document IDs to include in a single filter-based delete request. +# Larger batches produce a URL query string that exceeds httpx's length limit. +_MAX_IDS_PER_DELETE_BATCH = 100 + class TypesenseEngine(SearchEngine): """ @@ -170,6 +177,15 @@ def process_document(self, doc: dict[str, t.Any]) -> dict[str, t.Any]: for field_config in index_config.get("field_overrides", []): if field_config.get("optional", False): doc_with_nulls.setdefault(field_config["name"]) + # Strip HTML tags from the nested 'content' field. XBlock index_dictionary() + # methods may return raw HTML markup; indexing tags as tokens wastes memory + # and degrades search quality. + # See: https://typesense.org/docs/guide/tips-for-searching-common-types-of-data.html#html-content + if "content" in doc_with_nulls and isinstance(doc_with_nulls["content"], dict): + doc_with_nulls = { + **doc_with_nulls, + "content": _strip_html_from_content(doc_with_nulls["content"]), + } # Convert field types recursively, e.g. datetime->int64, NULL -> separate field: processed = convert_doc_datatypes(doc_with_nulls, record_nulls=True) return processed @@ -208,9 +224,21 @@ def search( search_args = {"q": query_string, "query_by": query_by, **compiled_params} try: results = self.typesense_index.documents.search(search_args) - except RequestMalformed as err: - if "Query string exceeds max allowed length" in str(err): - # To do large queries (with complex filters), we need to use the multi-search endpoint: + except (RequestMalformed, httpx.InvalidURL) as err: + # Two failure modes indicate the filter/query string is too long to send as a GET request: + # - httpx.InvalidURL is raised client-side (before the request is sent) when the assembled + # URL query string exceeds httpx's internal limit. + # - RequestMalformed with "Query string exceeds max allowed length" is returned by the + # Typesense server when the query string passes the client but is too long server-side. + # In both cases we fall back to the multi-search endpoint, which uses POST and has no + # URL length constraint. + is_url_too_long = ( + isinstance(err, httpx.InvalidURL) and "too long" in str(err).lower() + ) or ( + isinstance(err, RequestMalformed) + and "Query string exceeds max allowed length" in str(err) + ) + if is_url_too_long: client = get_typesense_client() multi_response = client.multi_search.perform({"searches": [{ "collection": self.typesense_index_name, @@ -232,8 +260,11 @@ def search( def remove(self, doc_ids, **kwargs): """ - Removing documents from the index is as simple as deleting the the documents - with the corresponding primary key. + Remove documents from the index by primary key. + + Deletes are sent in batches of at most _MAX_IDS_PER_DELETE_BATCH to keep the + generated filter_by query string within URL length limits, since the Typesense + client sends delete parameters as URL query parameters. """ logger.info( "Remove request: index=%s, doc_ids=%s kwargs=%s", @@ -241,10 +272,13 @@ def remove(self, doc_ids, **kwargs): doc_ids, kwargs, ) - if doc_ids: - escaped_ids = [_escape_str(doc_id) for doc_id in doc_ids] + if not doc_ids: + return + escaped_ids = [_escape_str(doc_id) for doc_id in doc_ids] + for batch_start in range(0, len(escaped_ids), _MAX_IDS_PER_DELETE_BATCH): + batch = escaped_ids[batch_start:batch_start + _MAX_IDS_PER_DELETE_BATCH] self.typesense_index.documents.delete({ - 'filter_by': f'id: [{",".join(escaped_ids)}]' + 'filter_by': f'id: [{",".join(batch)}]' }) @@ -331,6 +365,39 @@ def _escape_str(value: str): return f'`{value.replace("`", "")}`' +_HTML_TAG_RE = re.compile(r"<[^>]+>") + + +def _strip_html(text: str) -> str: + """ + Remove HTML tags from *text* and unescape HTML entities. + + Per the Typesense data-types guide, HTML content should be stripped to + plain text before indexing so that tags are not treated as searchable + tokens and do not bloat the in-memory index. + See: https://typesense.org/docs/guide/tips-for-searching-common-types-of-data.html#html-content + """ + text = _HTML_TAG_RE.sub(" ", text) + text = html_module.unescape(text) + return " ".join(text.split()) # normalise whitespace + + +def _strip_html_from_content(value: t.Any) -> t.Any: + """ + Recursively walk *value* and strip HTML from any string leaves. + + Intended for use only on the nested ``content`` field, where XBlock + ``index_dictionary()`` implementations may return raw HTML markup. + """ + if isinstance(value, str): + return _strip_html(value) + if isinstance(value, dict): + return {k: _strip_html_from_content(v) for k, v in value.items()} + if isinstance(value, list): + return [_strip_html_from_content(item) for item in value] + return value + + def get_search_params( field_dictionary=None, filter_dictionary=None, @@ -360,9 +427,11 @@ def get_search_params( filters += get_filter_rules(field_dictionary) if filter_dictionary: filters += get_filter_rules(filter_dictionary, optional=True) - for key, values in exclude_dictionary.items(): - # Ignore multiple values for this field, e.g. {"id": ["ignorethis1", "ignorethis2"]} - filters += [f"{key}:!=[{', '.join(_escape_str(value) for value in values)}]"] + if exclude_dictionary: + for key, values in exclude_dictionary.items(): + # Use the documented "is not any of" operator: field:![val1, val2, ...] + # (not `!=[...]`, which is the single-value inequality operator with an array arg) + filters += [f"{key}:![{', '.join(_escape_str(value) for value in values)}]"] if filters: params["filter_by"] = " && ".join(filters)