Skip to content

Commit 50a5d8a

Browse files
committed
fix: case insensitive doi, publisher_name, keywords
1 parent a2cff18 commit 50a5d8a

File tree

2 files changed

+30
-2
lines changed

2 files changed

+30
-2
lines changed

components/renku_data_services/solr/entity_schema.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
FieldName,
1212
FieldType,
1313
Filters,
14+
ReplaceCommand,
1415
SchemaCommand,
1516
Tokenizers,
1617
TypeName,
@@ -79,6 +80,11 @@ class Analyzers:
7980
],
8081
)
8182

83+
keyword_case_insensitive: Final[Analyzer] = Analyzer(
84+
tokenizer=Tokenizers.keyword,
85+
filters=[Filters.lowercase],
86+
)
87+
8288

8389
class FieldTypes:
8490
"""A collection of field types."""
@@ -99,6 +105,12 @@ class FieldTypes:
99105
)
100106
date_time: Final[FieldType] = FieldType.date_time_point(TypeName("SearchDateTime"))
101107

108+
keyword: Final[FieldType] = (
109+
FieldType.text(TypeName("Keyword")).make_stored().with_analyzer(Analyzers.keyword_case_insensitive)
110+
)
111+
"""keyword is a field type that is not changed at all by the tokenizer, and is stored unchanged
112+
but is searched in case-insensitive manner. Note, analyzers cannot be added to StrField, so we use TextField."""
113+
102114

103115
initial_entity_schema: Final[list[SchemaCommand]] = [
104116
AddCommand(FieldTypes.id),
@@ -162,11 +174,19 @@ class FieldTypes:
162174
SchemaMigration(
163175
version=13,
164176
commands=[
165-
AddCommand(Field.of(Fields.doi, FieldTypes.string)),
177+
AddCommand(FieldTypes.keyword),
178+
AddCommand(Field.of(Fields.doi, FieldTypes.keyword)),
166179
AddCommand(CopyFieldRule(source=Fields.doi, dest=Fields.content_all)),
167-
AddCommand(Field.of(Fields.publisher_name, FieldTypes.string)),
180+
AddCommand(Field.of(Fields.publisher_name, FieldTypes.keyword)),
168181
AddCommand(CopyFieldRule(source=Fields.publisher_name, dest=Fields.content_all)),
169182
],
170183
requires_reindex=False,
171184
),
185+
SchemaMigration(
186+
version=14,
187+
commands=[
188+
ReplaceCommand(Field.of(Fields.keywords, FieldTypes.keyword).make_multi_valued()),
189+
],
190+
requires_reindex=True,
191+
),
172192
]

components/renku_data_services/solr/solr_schema.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ class Tokenizers:
4646
icu: Tokenizer = Tokenizer(name="icu")
4747
openNlp: Tokenizer = Tokenizer(name="openNlp")
4848

49+
# The keyword tokenizer treats the entire field as a single token
50+
# See https://solr.apache.org/guide/solr/latest/indexing-guide/tokenizers.html#keyword-tokenizer
51+
keyword: Tokenizer = Tokenizer(name="keyword")
52+
4953

5054
@final
5155
class Filter(BaseModel):
@@ -156,6 +160,10 @@ def with_index_analyzer(self, a: Analyzer) -> Self:
156160
"""Return a copy with index analyzers set to the given one."""
157161
return self.model_copy(update={"indexAnalyzer": a})
158162

163+
def make_stored(self) -> Self:
164+
"""Make the field "stored" so that original value of the field is stored and can be retrieved."""
165+
return self.model_copy(update={"stored": True})
166+
159167
@classmethod
160168
def id(cls, name: TypeName) -> FieldType:
161169
"""Create a field that can be used as a document id."""

0 commit comments

Comments
 (0)