Skip to content

Commit 50a5d8a

Browse files
committed
fix: case insensitive doi, publisher_name, keywords
1 parent a2cff18 commit 50a5d8a

File tree

2 files changed

+30
-2
lines changed

2 files changed

+30
-2
lines changed

components/renku_data_services/solr/entity_schema.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
FieldName,
1212
FieldType,
1313
Filters,
14+
ReplaceCommand,
1415
SchemaCommand,
1516
Tokenizers,
1617
TypeName,
@@ -79,6 +80,11 @@ class Analyzers:
7980
],
8081
)
8182

83+
keyword_case_insensitive: Final[Analyzer] = Analyzer(
84+
tokenizer=Tokenizers.keyword,
85+
filters=[Filters.lowercase],
86+
)
87+
8288

8389
class FieldTypes:
8490
"""A collection of field types."""
@@ -99,6 +105,12 @@ class FieldTypes:
99105
)
100106
date_time: Final[FieldType] = FieldType.date_time_point(TypeName("SearchDateTime"))
101107

108+
keyword: Final[FieldType] = (
109+
FieldType.text(TypeName("Keyword")).make_stored().with_analyzer(Analyzers.keyword_case_insensitive)
110+
)
111+
"""keyword is a field type that is not changed at all by the tokenizer, and is stored unchanged
112+
but is searched in case-insensitive manner. Note, analyzers cannot be added to StrField, so we use TextField."""
113+
102114

103115
initial_entity_schema: Final[list[SchemaCommand]] = [
104116
AddCommand(FieldTypes.id),
@@ -162,11 +174,19 @@ class FieldTypes:
162174
SchemaMigration(
163175
version=13,
164176
commands=[
165-
AddCommand(Field.of(Fields.doi, FieldTypes.string)),
177+
AddCommand(FieldTypes.keyword),
178+
AddCommand(Field.of(Fields.doi, FieldTypes.keyword)),
166179
AddCommand(CopyFieldRule(source=Fields.doi, dest=Fields.content_all)),
167-
AddCommand(Field.of(Fields.publisher_name, FieldTypes.string)),
180+
AddCommand(Field.of(Fields.publisher_name, FieldTypes.keyword)),
168181
AddCommand(CopyFieldRule(source=Fields.publisher_name, dest=Fields.content_all)),
169182
],
170183
requires_reindex=False,
171184
),
185+
SchemaMigration(
186+
version=14,
187+
commands=[
188+
ReplaceCommand(Field.of(Fields.keywords, FieldTypes.keyword).make_multi_valued()),
189+
],
190+
requires_reindex=True,
191+
),
172192
]

components/renku_data_services/solr/solr_schema.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ class Tokenizers:
4646
icu: Tokenizer = Tokenizer(name="icu")
4747
openNlp: Tokenizer = Tokenizer(name="openNlp")
4848

49+
# The keyword tokenizer treats the entire field as a single token
50+
# See https://solr.apache.org/guide/solr/latest/indexing-guide/tokenizers.html#keyword-tokenizer
51+
keyword: Tokenizer = Tokenizer(name="keyword")
52+
4953

5054
@final
5155
class Filter(BaseModel):
@@ -156,6 +160,10 @@ def with_index_analyzer(self, a: Analyzer) -> Self:
156160
"""Return a copy with index analyzers set to the given one."""
157161
return self.model_copy(update={"indexAnalyzer": a})
158162

163+
def make_stored(self) -> Self:
164+
"""Make the field "stored" so that original value of the field is stored and can be retrieved."""
165+
return self.model_copy(update={"stored": True})
166+
159167
@classmethod
160168
def id(cls, name: TypeName) -> FieldType:
161169
"""Create a field that can be used as a document id."""

0 commit comments

Comments
 (0)