From 52b13aed6c7a634a066909da046a8a600922c186 Mon Sep 17 00:00:00 2001 From: junjiequan Date: Thu, 16 Apr 2026 10:53:40 +0200 Subject: [PATCH 01/29] refactor(metadatakeys): merge duplicate metadata keys across datasets into a single metadata key with combined userGroups --- src/datasets/datasets.service.ts | 17 +-- src/instruments/instruments.service.ts | 18 ++- .../dto/create-metadata-key.dto.ts | 8 -- .../dto/output-metadata-key.dto.ts | 10 +- src/metadata-keys/metadatakeys.service.ts | 129 +++++++++++------- .../schemas/metadatakey.schema.ts | 22 ++- src/proposals/proposals.service.ts | 23 ++-- src/samples/samples.service.ts | 24 ++-- 8 files changed, 148 insertions(+), 103 deletions(-) diff --git a/src/datasets/datasets.service.ts b/src/datasets/datasets.service.ts index 7ca344075..696315bd1 100644 --- a/src/datasets/datasets.service.ts +++ b/src/datasets/datasets.service.ts @@ -102,10 +102,10 @@ export class DatasetsService { doc: UpdateQuery, ): MetadataSourceDoc { const source: MetadataSourceDoc = { - sourceType: "dataset", - sourceId: doc.pid, - ownerGroup: doc.owner, - accessGroups: doc.accessGroups || [], + sourceType: this.datasetModel.collection.name, + userGroups: Array.from( + new Set([doc.ownerGroup, ...(doc.accessGroups ?? [])].filter(Boolean)), + ), isPublished: doc.isPublished || false, metadata: doc.scientificMetadata ?? {}, }; @@ -501,6 +501,7 @@ export class DatasetsService { } await this.metadataKeysService.replaceManyFromSource( + this.createMetadataKeysInstance(existingDataset), this.createMetadataKeysInstance(updatedDataset), ); // we were able to find the dataset and update it @@ -551,6 +552,7 @@ export class DatasetsService { } await this.metadataKeysService.replaceManyFromSource( + this.createMetadataKeysInstance(existingDataset), this.createMetadataKeysInstance(patchedDataset), ); // we were able to find the dataset and update it @@ -580,10 +582,9 @@ export class DatasetsService { } // delete metadata keys associated with this dataset - await this.metadataKeysService.deleteMany({ - sourceId: id, - sourceType: "dataset", - }); + await this.metadataKeysService.deleteMany( + this.createMetadataKeysInstance(deletedDataset), + ); return deletedDataset; } diff --git a/src/instruments/instruments.service.ts b/src/instruments/instruments.service.ts index 921927dcc..6c0b430e6 100644 --- a/src/instruments/instruments.service.ts +++ b/src/instruments/instruments.service.ts @@ -32,10 +32,10 @@ export class InstrumentsService { doc: UpdateQuery, ): MetadataSourceDoc { const source: MetadataSourceDoc = { - sourceType: "instrument", - sourceId: doc.pid, - ownerGroup: doc.ownerGroup, - accessGroups: doc.accessGroups || [], + sourceType: this.instrumentModel.collection.name, + userGroups: Array.from( + new Set([doc.ownerGroup, ...(doc.accessGroups ?? [])].filter(Boolean)), + ), isPublished: doc.isPublished || false, metadata: doc.customMetadata ?? {}, }; @@ -96,6 +96,15 @@ export class InstrumentsService { updateInstrumentDto: PartialUpdateInstrumentDto, ): Promise { const username = (this.request.user as JWTUser).username; + const existingInstrument = await this.instrumentModel + .findOne(filter) + .exec(); + + if (!existingInstrument) { + throw new NotFoundException( + `Instrument not found with filter: ${JSON.stringify(filter)}`, + ); + } const updatedInstrument = this.instrumentModel .findOneAndUpdate( @@ -117,6 +126,7 @@ export class InstrumentsService { } await this.metadataKeysService.replaceManyFromSource( + this.createMetadataKeysInstance(existingInstrument), this.createMetadataKeysInstance(updatedInstrument), ); diff --git a/src/metadata-keys/dto/create-metadata-key.dto.ts b/src/metadata-keys/dto/create-metadata-key.dto.ts index 4e5ca5b2f..cd71b7365 100644 --- a/src/metadata-keys/dto/create-metadata-key.dto.ts +++ b/src/metadata-keys/dto/create-metadata-key.dto.ts @@ -11,12 +11,4 @@ export class CreateMetadataKeyDto extends UpdateMetadataKeyDto { }) @IsString() sourceType: string; - - @ApiProperty({ - type: String, - required: true, - description: "Unique identifier of the source item this key is linked to.", - }) - @IsString() - sourceId: string; } diff --git a/src/metadata-keys/dto/output-metadata-key.dto.ts b/src/metadata-keys/dto/output-metadata-key.dto.ts index ad561ddc3..bf242f8c6 100644 --- a/src/metadata-keys/dto/output-metadata-key.dto.ts +++ b/src/metadata-keys/dto/output-metadata-key.dto.ts @@ -3,6 +3,7 @@ import { ArrayNotEmpty, IsArray, IsBoolean, + IsNumber, IsOptional, IsString, } from "class-validator"; @@ -56,12 +57,13 @@ export class OutputMetadataKeyDto extends QueryableClass { sourceType: string; @ApiProperty({ - type: String, + type: Number, required: true, - description: "Unique identifier of the source item this key is linked to.", + description: + "Tracks how many sources are using this metadata key. Managed internally.", }) - @IsString() - sourceId: string; + @IsNumber() + usageCount: number; @ApiProperty({ type: Boolean, diff --git a/src/metadata-keys/metadatakeys.service.ts b/src/metadata-keys/metadatakeys.service.ts index fc1b06076..054c0fe7d 100644 --- a/src/metadata-keys/metadatakeys.service.ts +++ b/src/metadata-keys/metadatakeys.service.ts @@ -4,14 +4,7 @@ import { MetadataKeyClass, MetadataKeyDocument, } from "./schemas/metadatakey.schema"; -import { - DeleteResult, - FilterQuery, - HydratedDocument, - Model, - PipelineStage, - QueryOptions, -} from "mongoose"; +import { FilterQuery, Model, PipelineStage, QueryOptions } from "mongoose"; import { isEmpty } from "lodash"; import { addCreatedByFields, @@ -24,10 +17,8 @@ type ScientificMetadataEntry = { }; export type MetadataSourceDoc = { - sourceId: string; sourceType: string; - ownerGroup: string; - accessGroups: string[]; + userGroups: string[]; isPublished: boolean; metadata: Record; }; @@ -58,6 +49,7 @@ export class MetadataKeysService { }, }, ]; + if (!isEmpty(fieldsProjection)) { const projection = parsePipelineProjection(fieldsProjection); pipeline.push({ $project: projection }); @@ -79,59 +71,96 @@ export class MetadataKeysService { return data; } - async deleteMany( - filter: FilterQuery, - ): Promise { - const result = await this.metadataKeyModel.deleteMany(filter).exec(); + async deleteMany(doc: MetadataSourceDoc): Promise { + if (isEmpty(doc.metadata)) { + return; + } + const metadataKeys = Object.keys(doc.metadata); + const ids = metadataKeys.map((key) => `${doc.sourceType}_${key}`); + const filter = { id: { $in: ids } }; + + const decrementUsageOp = { + updateMany: { filter, update: { $inc: { usageCount: -1 } } }, + }; + const deleteUnusedOp = { + deleteMany: { filter: { ...filter, usageCount: { $lte: 0 } } }, + }; + + await this.metadataKeyModel.bulkWrite([decrementUsageOp, deleteUnusedOp]); Logger.log( - `MetadataKeys deleted: ${result.deletedCount ?? 0} With filter:`, - { filter }, + `Removed or decremented MetadataKeys usageCount from source ${doc.sourceType} for key(s): ${metadataKeys.join(", ")}`, ); - - return result; } - async insertManyFromSource( - doc: MetadataSourceDoc, - ): Promise[] | void> { + async insertManyFromSource(doc: MetadataSourceDoc): Promise { if (isEmpty(doc.metadata)) { return; } - const userGroups = Array.from( - new Set([doc.ownerGroup, ...(doc.accessGroups ?? [])].filter(Boolean)), - ) as string[]; - - const isPublished = doc.isPublished; - - const metadata = doc.metadata ?? {}; - - const docs = Object.entries(metadata).map(([key, entry]) => { - const createMetadataKeyDto = { - _id: `${doc.sourceType}_${doc.sourceId}_${key}`, - id: `${doc.sourceType}_${doc.sourceId}_${key}`, - sourceType: doc.sourceType, - sourceId: doc.sourceId, - key, - userGroups, - isPublished, - humanReadableName: (entry as ScientificMetadataEntry).human_name ?? "", + + const metadata = doc.metadata; + const userGroups = doc.userGroups; + + const upsertOps = Object.entries(metadata).map(([key, entry]) => { + const id = `${doc.sourceType}_${key}`; + const humanReadableName = + (entry as ScientificMetadataEntry).human_name ?? ""; + return { + updateOne: { + filter: { id }, + update: { + $setOnInsert: addCreatedByFields( + { + _id: id as MetadataKeyDocument["_id"], + id, + key, + sourceType: doc.sourceType, + humanReadableName, + }, + "system", + ), + ...(doc.isPublished && { $set: { isPublished: true } }), + $addToSet: { userGroups: { $each: userGroups } }, + $inc: { usageCount: 1 }, + }, + upsert: true, + }, }; - return addCreatedByFields(createMetadataKeyDto, "system"); }); + await this.metadataKeyModel.bulkWrite(upsertOps); + Logger.log( - `Created ${docs.length} MetadataKeys from source ${doc.sourceType} with ID ${doc.sourceId}`, + `Created or incremented MetadataKeys usageCount from source ${doc.sourceType} for key(s): ${Object.keys(metadata).join(", ")}`, ); - - return await this.metadataKeyModel.insertMany(docs); } - async replaceManyFromSource(doc: MetadataSourceDoc): Promise { - await this.deleteMany({ - sourceId: doc.sourceId, - sourceType: doc.sourceType, - }); - await this.insertManyFromSource(doc); + async replaceManyFromSource( + oldDoc: MetadataSourceDoc, + newDoc: MetadataSourceDoc, + ): Promise { + const oldKeys = Object.keys(oldDoc.metadata ?? {}); + const newKeys = Object.keys(newDoc.metadata ?? {}); + + const addedKeys = newKeys.filter((k) => !oldKeys.includes(k)); + const removedKeys = oldKeys.filter((k) => !newKeys.includes(k)); + + if (removedKeys.length > 0) { + await this.deleteMany({ + ...oldDoc, + metadata: Object.fromEntries( + removedKeys.map((k) => [k, oldDoc.metadata[k]]), + ), + }); + } + + if (addedKeys.length > 0) { + await this.insertManyFromSource({ + ...newDoc, + metadata: Object.fromEntries( + addedKeys.map((k) => [k, newDoc.metadata[k]]), + ), + }); + } } } diff --git a/src/metadata-keys/schemas/metadatakey.schema.ts b/src/metadata-keys/schemas/metadatakey.schema.ts index b4c6b6d39..d49b74ee8 100644 --- a/src/metadata-keys/schemas/metadatakey.schema.ts +++ b/src/metadata-keys/schemas/metadatakey.schema.ts @@ -75,25 +75,21 @@ export class MetadataKeyClass extends QueryableClass { }) sourceType: string; - @ApiProperty({ - type: String, - required: true, - description: "Unique identifier of the source item this key is linked to.", - }) - @Prop({ - type: String, - required: true, - index: true, - }) - sourceId: string; - @ApiProperty({ type: Boolean, required: true, description: "Flag is true when data are made publicly available.", }) - @Prop({ type: Boolean, required: true }) + @Prop({ type: Boolean, required: true, default: false }) isPublished: boolean; + + @ApiProperty({ + type: Number, + description: + "Tracks how many sources are using this metadata key. Managed internally.", + }) + @Prop({ type: Number, default: 0 }) + usageCount: number; } export const MetadataKeySchema = SchemaFactory.createForClass(MetadataKeyClass); diff --git a/src/proposals/proposals.service.ts b/src/proposals/proposals.service.ts index 76c80323f..01ae66b07 100644 --- a/src/proposals/proposals.service.ts +++ b/src/proposals/proposals.service.ts @@ -179,10 +179,10 @@ export class ProposalsService { doc: UpdateQuery, ): MetadataSourceDoc { const source: MetadataSourceDoc = { - sourceType: "proposal", - sourceId: doc.proposalId, - ownerGroup: doc.ownerGroup, - accessGroups: doc.accessGroups || [], + sourceType: this.proposalModel.collection.name, + userGroups: Array.from( + new Set([doc.ownerGroup, ...(doc.accessGroups ?? [])].filter(Boolean)), + ), isPublished: doc.isPublished || false, metadata: doc.metadata ?? {}, }; @@ -286,6 +286,13 @@ export class ProposalsService { updateProposalDto: PartialUpdateProposalDto, ): Promise { const username = (this.request.user as JWTUser).username; + const existingProposal = await this.proposalModel.findOne(filter).exec(); + + if (!existingProposal) { + throw new NotFoundException( + `Proposal not found with filter: ${JSON.stringify(filter)}`, + ); + } const updatedProposal = await this.proposalModel .findOneAndUpdate( @@ -309,6 +316,7 @@ export class ProposalsService { } await this.metadataKeysService.replaceManyFromSource( + this.createMetadataKeysInstance(existingProposal), this.createMetadataKeysInstance(updatedProposal), ); @@ -326,10 +334,9 @@ export class ProposalsService { ); } - this.metadataKeysService.deleteMany({ - sourceType: "proposal", - sourceId: deletedProposal.proposalId, - }); + await this.metadataKeysService.deleteMany( + this.createMetadataKeysInstance(deletedProposal), + ); return deletedProposal; } diff --git a/src/samples/samples.service.ts b/src/samples/samples.service.ts index 36cc88548..91e25cdf9 100644 --- a/src/samples/samples.service.ts +++ b/src/samples/samples.service.ts @@ -38,10 +38,10 @@ export class SamplesService { doc: UpdateQuery, ): MetadataSourceDoc { const source: MetadataSourceDoc = { - sourceType: "sample", - sourceId: doc.sampleId, - ownerGroup: doc.ownerGroup, - accessGroups: doc.accessGroups || [], + sourceType: this.sampleModel.collection.name, + userGroups: Array.from( + new Set([doc.ownerGroup, ...(doc.accessGroups ?? [])].filter(Boolean)), + ), isPublished: doc.isPublished || false, metadata: doc.sampleCharacteristics ?? {}, }; @@ -177,6 +177,14 @@ export class SamplesService { updateSampleDto: PartialUpdateSampleDto, ): Promise { const username = (this.request.user as JWTUser).username; + const existingSample = await this.sampleModel.findOne(filter).exec(); + + if (!existingSample) { + throw new NotFoundException( + `Sample not found with filter: ${JSON.stringify(filter)}`, + ); + } + const updateData = addUpdatedByField(updateSampleDto, username); const updateDataMongoose = { @@ -200,6 +208,7 @@ export class SamplesService { } await this.metadataKeysService.replaceManyFromSource( + this.createMetadataKeysInstance(existingSample), this.createMetadataKeysInstance(updatedSample), ); @@ -217,10 +226,9 @@ export class SamplesService { ); } - this.metadataKeysService.deleteMany({ - sourceType: "sample", - sourceId: deletedSample.sampleId, - }); + this.metadataKeysService.deleteMany( + this.createMetadataKeysInstance(deletedSample), + ); return deletedSample; } } From 90cc22f6345d75f499ca057ce112fddebd0eae9e Mon Sep 17 00:00:00 2001 From: junjiequan Date: Mon, 20 Apr 2026 20:40:49 +0200 Subject: [PATCH 02/29] refactor(metadatakeys): merge duplicate keys across datasets and track group references --- docs/developer-guide/metadatakeys-module.md | 165 +++- ...aset-scientificMetadata-to-metadatakeys.md | 311 ++++++++ ...aset-scientificMetadata-to-metadatakeys.js | 313 ++++++++ .../metadatakeys.service.spec.ts | 534 ++++++++++--- src/metadata-keys/metadatakeys.service.ts | 279 +++++-- .../schemas/metadatakey.schema.ts | 18 +- .../types/metadatakeys-filter-content.ts | 2 +- test/MetadataKeys.js | 706 ++++++++++++++---- test/TestData.js | 7 +- 9 files changed, 2018 insertions(+), 317 deletions(-) create mode 100644 docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md create mode 100644 migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js diff --git a/docs/developer-guide/metadatakeys-module.md b/docs/developer-guide/metadatakeys-module.md index f41dbe293..9440a1c08 100644 --- a/docs/developer-guide/metadatakeys-module.md +++ b/docs/developer-guide/metadatakeys-module.md @@ -17,74 +17,179 @@ The previous implementation in the Datasets service lacked a permission-based fi - Stability: Crashes occurred when retrieval limits were missing or improperly configured. - Risks: Users could see metadata keys they did not have permissions to access. +--- + ## Module Architecture This module consists of a dedicated Controller and Service layer that implements a robust permission-aware logic. ### MetadataKeysController -Provides the API interface for searching keys. Allowed filters can be found in `src/metadata-keys/metadatakeys.service.ts` and exmaple can be find in `src/metadata-keys/types/metadatakeys-filter-content.ts` +Provides the API interface for searching metadata keys. -- `Endpoint`: GET /metadatakeys (replaces /datasets/metadataKeys) -- `Method`: findAll -- `Endpoint Access`: Endpoint can be Accessed by any users +- **Endpoint**: `GET /metadatakeys` (replaces `GET /datasets/metadataKeys`) +- **Method**: `findAll` +- **Access**: Any authenticated user (permission filtering is applied server-side) +- Allowed filter fields: see `src/metadata-keys/types/metadatakeys-lookup.ts` +- Filter examples: see `src/metadata-keys/types/metadatakeys-filter-content.ts` + +--- ### MetadataKeysService -This handles the business logic and talks to the database. It is divided into user-facing search logic and internal data synchronization. +Handles business logic and database access. Split into two concerns: + +#### 1. User-facing search — `findAll` + +Applies CASL permission filters before querying: + +| User type | Visible keys | +| -------------------- | -------------------------------------------------------- | +| Admin | All keys in the system | +| Authenticated user | Keys where they belong to `ownerGroup` or `accessGroups` | +| Unauthenticated user | Keys marked `isPublished: true` | + +Results default to 100 per page if no limit is provided. + +#### 2. Internal synchronization + +These methods are called internally when source documents are created, updated, or deleted. They are never called directly from the controller. + +##### `insertManyFromSource(doc)` + +Called when a dataset is **created** or **gains new metadata keys**. + +For each key in `scientificMetadata`: + +- Upserts a `MetadataKey` document identified by `${sourceType}_${key}_${humanReadableName}` +- Increments `usageCount` (total datasets referencing this key) +- Increments per-group reference counts in `userGroupCounts` +- Adds new groups to the `userGroups` query array via `$addToSet` +- Sets `isPublished: true` if the source dataset is published (never unsets inline — the cronjob handles the `true → false` transition) + +##### `deleteMany(doc)` -#### Permission Layer (Applies to findAll only): +Called when a dataset is **deleted** or **loses metadata keys**. -When a user searches for keys, the service uses accessibleBy to automatically append access filters based on CASL permissions: +Runs three sequential steps: -- `Admins`: Can search and get all metadata keys in the system. -- `Authenticated Users`: Can only get keys where they are part of the ownerGroup or accessGroups. -- `Unauthenticated Users`: Can only get keys that are marked as isPublished. +1. Decrements `usageCount` and per-group counts in `userGroupCounts` +2. Recomputes the `userGroups` array from the updated counts — drops any group whose count reached zero +3. Deletes `MetadataKey` documents where `usageCount <= 0` + `usageCount` is the authoritative deletion signal. A dataset with no `userGroups` and `isPublished: false` would be invisible to both `userGroupCounts` and `isPublished` checks, so neither alone can substitute for it. -#### Service Methods: +##### `replaceManyFromSource(oldDoc, newDoc)` -- `findAll`: The only public-facing method. It applies the permission layer and then uses a database aggregation pipeline to find and return the specific keys requested by the user. Every search is limited to 100 results by default, if limit is not provided. -- `insertManyFromSource`: An internal method that takes an original document (like a Dataset), extracts fields from **scientificMetadata**, **metadata**, and **customMetadata**, and creates new records in the Metadata Keys collection. -- `deleteMany`: Removes metadata key entries associated with a source document when that document is deleted from the system. -- `replaceManyFromSource`: Triggered when a source document (e.g., a Dataset or Proposal) is updated. It calls `deleteMany` and `insertManyFromSource` sequentially. +Called when a dataset is **updated**. Diffs the old and new `scientificMetadata` to produce three disjoint key sets: -## Usage Example +| Set | Keys | Action | +| ------- | ---------------- | -------------------------------------------------------------------- | +| Added | Only in `newDoc` | `insertManyFromSource` | +| Removed | Only in `oldDoc` | `deleteMany` | +| Shared | In both | `updateSharedKeys` (group / isPublished / humanReadableName changes) | -To list all metadata keys associated with a dataset, the user must provide the sourceType and sourceId. If the fields array is provided, only those specific fields will be returned: +The three sets are disjoint by `_id` so they run in parallel via `Promise.all`. + +For shared keys, three things are handled independently: + +- **userGroups changed** — added groups are incremented, removed groups are decremented, then `userGroups` array is recomputed from the updated counts +- **isPublished flipped true** — sets `isPublished: true` inline; `false` is left to the cronjob +- **humanReadableName changed** — since `humanReadableName` is part of `_id`, this is treated as a delete of the old document + insert of a new one + +--- + +## Schema + +Each `MetadataKey` document has the following key fields: + +| Field | Type | Description | +| ------------------- | --------------------- | ---------------------------------------------------------------------------------------- | +| `_id` | `string` | Composite key: `${sourceType}_${key}_${humanReadableName}` | +| `key` | `string` | The raw metadata key name | +| `humanReadableName` | `string` | Human-readable label from `human_name`, empty string if absent | +| `sourceType` | `string` | Source collection: `Dataset`, `Proposal`, `Sample`, etc. | +| `userGroups` | `string[]` | Groups that can see this key — kept in sync with `userGroupCounts` for query performance | +| `userGroupCounts` | `Map` | Per-group reference counts — source of truth for safe group removal | +| `usageCount` | `number` | Total datasets referencing this key — authoritative deletion signal | +| `isPublished` | `boolean` | True if any contributing dataset is published | + +`userGroups` and `userGroupCounts` are intentionally redundant. `userGroupCounts` owns the truth and enables safe atomic decrements. `userGroups` is a denormalized array kept for query performance — MongoDB's multikey index on `userGroups` makes `{ userGroups: { $in: [...] } }` efficient in a way that querying Map keys directly is not. + +--- + +## Filter Examples + +List metadata keys visible to the current user for a given source type: ```json { "where": { - "sourceType": "dataset", - "sourceId": "datasetId" + "sourceType": "Dataset" }, - "fields": ["humanreadableName", "key"], + "fields": ["key", "humanReadableName"], "limits": { "limit": 10, "skip": 0, "sort": { - "createdAt": "asc | desc" + "createdAt": "desc" } } } ``` -To retrieve a specific metadata key, use the following filter: +Find a specific key by name: + +```json +{ + "where": { + "sourceType": "Dataset", + "key": "temperature" + }, + "limits": { + "limit": 1, + "skip": 0 + } +} +``` + +Partial search on `key`: ```json { "where": { - "sourceType": "dataset", - "sourceId": "datasetId", - "key": "metadata_key_name" + "sourceType": "Dataset", + "key": { "$regex": "temp", "$options": "i" } }, - "fields": ["key"], "limits": { "limit": 10, - "skip": 0, - "sort": { - "createdAt": "asc | desc" - } + "skip": 0 } } ``` + +Partial search on `humanReadableName`: + +```json +{ + "where": { + "sourceType": "Dataset", + "humanReadableName": { "$regex": "temp", "$options": "i" } + }, + "limits": { + "limit": 10, + "skip": 0 + } +} +``` + +--- + +## Initial Migration + +The `MetadataKeys` collection is populated by a migration script that must be run manually before the service is deployed for the first time. + +See: `migrations/20260417145401-sync-dataset-scientificMetadata-to-metadatakeys.js` + +Documentation: `migrations/20260417145401-sync-dataset-scientificMetadata-to-metadatakeys.md` + +> ⚠️ The application will start normally without the migration, but the MetadataKeys service will return empty results until it is run. diff --git a/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md b/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md new file mode 100644 index 000000000..b966d6388 --- /dev/null +++ b/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md @@ -0,0 +1,311 @@ +# 20260417145401 — Sync Dataset scientificMetadata to MetadataKeys + +## What this migration does + +Rebuilds the `MetadataKeys` collection from scratch by scanning all `Dataset` documents and extracting every key found in `scientificMetadata`. + +Each unique `(sourceType, key, humanReadableName)` combination becomes one `MetadataKey` document. If the same key appears across multiple datasets, their `userGroups` and counts are merged into a single entry. + +--- + +## Why it exists + +The `MetadataKeys` collection powers metadata key search and access control. This is the initial population of the collection — it must be run once before the service can operate. + +Each `MetadataKey` document tracks: + +- `userGroupCounts: Map` — how many datasets per group reference this key, enabling safe atomic group removal when a dataset is updated or deleted +- `usageCount: number` — total datasets referencing this key regardless of groups, used as the authoritative deletion signal + +--- + +## Source data shape + +```js +// Dataset document +{ + ..., + _id: "uuid-A", + ownerGroup: "group-1", // mandatory + accessGroups: ["group-2"], // optional + isPublished: true, + scientificMetadata: { + temperature: { value: 100, unit: "C", human_name: "Temperature" }, + pressure: { value: 1, unit: "bar" }, // no human_name + } +} +``` + +--- + +## Output shape + +```js +// MetadataKey document +{ + _id: "Dataset_temperature_Temperature", // ${sourceType}_${key}_${humanReadableName} + key: "temperature", + humanReadableName: "Temperature", + sourceType: "Dataset", + isPublished: true, + usageCount: 2, + userGroups: ["group-1", "group-2"], + userGroupCounts: { "group-1": 2, "group-2": 1 }, + createdBy: "migration", + createdAt: ISODate("...") +} +``` + +--- + +## Pipeline walkthrough + +The pipeline uses two datasets as a running example throughout: + +| | ownerGroup | accessGroups | isPublished | scientificMetadata keys | +| --------- | ---------- | ------------ | ----------- | ----------------------- | +| Dataset A | group-1 | [group-2] | true | temperature, pressure | +| Dataset B | group-1 | [] | false | temperature | + +--- + +### Stage 1 — Filter + +Keep only documents where `scientificMetadata` exists and is an object. + +``` +Input: all Dataset documents +Output: documents where scientificMetadata exists +``` + +--- + +### Stage 2 — Flatten scientificMetadata + +Expose `_id` as `datasetId` and convert `scientificMetadata` from an object to an array of `{k, v}` pairs using `$objectToArray`. + +```js +// Input +{ + _id: "uuid-A", + ownerGroup: "group-1", + accessGroups: ["group-2"], + isPublished: true, + scientificMetadata: { + temperature: { value: 100, unit: "C", human_name: "Temperature" }, + pressure: { value: 1, unit: "bar" }, + } +} + +// Output +{ + datasetId: "uuid-A", + ownerGroup: "group-1", + accessGroups: ["group-2"], + isPublished: true, + metaArr: [ + { k: "temperature", v: { value: 100, unit: "C", human_name: "Temperature" } }, + { k: "pressure", v: { value: 1, unit: "bar" } }, + ] +} +``` + +--- + +### Stage 3 — Unwind metaArr + +One document per `(dataset, metadata key)`. + +``` +Input: 1 document (Dataset A) with metaArr of length 2 +Output: 2 documents + +{ datasetId: "uuid-A", ..., metaArr: { k: "temperature", v: { human_name: "Temperature", ... } } } +{ datasetId: "uuid-A", ..., metaArr: { k: "pressure", v: { ... } } } +``` + +--- + +### Stage 4 — Shape each (dataset, key) document + +Extract `key`, `humanReadableName`, and compute `userGroups` as the union of `ownerGroup` and `accessGroups`. + +```js +// Input (one of the two documents from stage 3) +{ datasetId: "uuid-A", ownerGroup: "group-1", accessGroups: ["group-2"], isPublished: true, metaArr: { k: "temperature", v: { human_name: "Temperature" } } } + +// Output (all documents after both datasets are processed) +{ datasetId: "uuid-A", key: "temperature", isPublished: true, humanReadableName: "Temperature", userGroups: ["group-1", "group-2"] } +{ datasetId: "uuid-A", key: "pressure", isPublished: true, humanReadableName: "", userGroups: ["group-1", "group-2"] } +{ datasetId: "uuid-B", key: "temperature", isPublished: false, humanReadableName: "Temperature", userGroups: ["group-1"] } +``` + +> `ownerGroup` is mandatory field so no null fallback is needed. `accessGroups` is optional so it falls back to `[]`. + +--- + +### Stage 5 — Unwind userGroups + +One document per `(dataset, key, group)`. This is the pivot that makes per-group counting possible. + +`preserveNullAndEmptyArrays: true` keeps datasets with no groups so `usageCount` stays accurate. + +``` +Input +{ datasetId: "uuid-A", key: "temperature", humanReadableName: "Temperature", isPublished: true, userGroups: ["group-1", "group-2"] } + +Output +{ datasetId: "uuid-A", key: "temperature", humanReadableName: "Temperature", isPublished: true, userGroups: "group-1" } +{ datasetId: "uuid-A", key: "temperature", humanReadableName: "Temperature", isPublished: true, userGroups: "group-2" } +{ datasetId: "uuid-A", key: "pressure", humanReadableName: "", isPublished: true, userGroups: "group-1" } +{ datasetId: "uuid-A", key: "pressure", humanReadableName: "", isPublished: true, userGroups: "group-2" } +{ datasetId: "uuid-B", key: "temperature", humanReadableName: "Temperature", isPublished: false, userGroups: "group-1" } +``` + +--- + +### Stage 6 — Filter null userGroups + +Drop documents where `userGroups` is `null`. This only occurs when `preserveNullAndEmptyArrays` retains a document from a dataset that had no groups at all. + +``` +Input: stream from stage 5, some may have userGroups: null +Output: only documents where userGroups is a real group name +``` + +--- + +### Stage 7 — Group by (metaKeyId, group) + +Each bucket is one unique `(key, humanReadableName, group)` combination. +`groupCount` = how many datasets with this group reference this key. + +``` +Input (5 documents from stage 6) +{ datasetId: "uuid-A", key: "temperature", humanReadableName: "Temperature", isPublished: true, userGroups: "group-1" } +{ datasetId: "uuid-A", key: "temperature", humanReadableName: "Temperature", isPublished: true, userGroups: "group-2" } +{ datasetId: "uuid-A", key: "pressure", humanReadableName: "", isPublished: true, userGroups: "group-1" } +{ datasetId: "uuid-A", key: "pressure", humanReadableName: "", isPublished: true, userGroups: "group-2" } +{ datasetId: "uuid-B", key: "temperature", humanReadableName: "Temperature", isPublished: false, userGroups: "group-1" } + +Output (4 buckets — one per unique metaKeyId + group) +{ _id: { metaKeyId: "Dataset_temperature_Temperature", group: "group-1" }, key: "temperature", humanReadableName: "Temperature", isPublished: true, groupCount: 2 } +{ _id: { metaKeyId: "Dataset_temperature_Temperature", group: "group-2" }, key: "temperature", humanReadableName: "Temperature", isPublished: true, groupCount: 1 } +{ _id: { metaKeyId: "Dataset_pressure_", group: "group-1" }, key: "pressure", humanReadableName: "", isPublished: true, groupCount: 1 } +{ _id: { metaKeyId: "Dataset_pressure_", group: "group-2" }, key: "pressure", humanReadableName: "", isPublished: true, groupCount: 1 } +``` + +> `isPublished: $max` means the field is `true` if **any** contributing dataset is published. + +--- + +### Stage 8 — Group by metaKeyId + +Reassemble one document per metadata key by collecting the per-group buckets from stage 7. + +``` +Input (4 buckets from stage 7) +{ _id: { metaKeyId: "Dataset_temperature_Temperature", group: "group-1" }, groupCount: 2, ... } +{ _id: { metaKeyId: "Dataset_temperature_Temperature", group: "group-2" }, groupCount: 1, ... } +{ _id: { metaKeyId: "Dataset_pressure_", group: "group-1" }, groupCount: 1, ... } +{ _id: { metaKeyId: "Dataset_pressure_", group: "group-2" }, groupCount: 1, ... } + +Output (2 documents — one per unique key) +{ + _id: "Dataset_temperature_Temperature", + key: "temperature", humanReadableName: "Temperature", isPublished: true, + userGroups: ["group-1", "group-2"], + userGroupCountsArr: [{ k: "group-1", v: 2 }, { k: "group-2", v: 1 }], + datasetIdSets: [["uuid-A", "uuid-B"], ["uuid-A"]] +} +{ + _id: "Dataset_pressure_", + key: "pressure", humanReadableName: "", isPublished: true, + userGroups: ["group-1", "group-2"], + userGroupCountsArr: [{ k: "group-1", v: 1 }, { k: "group-2", v: 1 }], + datasetIdSets: [["uuid-A"], ["uuid-A"]] +} +``` + +--- + +### Stage 9 — Final projection + +- `userGroupCounts`: converts `[{k, v}]` array to a plain object (Mongoose stores this as a `Map`) +- `usageCount`: unions all per-group `datasetId` sets and counts distinct IDs + +``` +// usageCount for temperature: +// datasetIdSets = [["uuid-A", "uuid-B"], ["uuid-A"]] +// union = ["uuid-A", "uuid-B"] +// count = 2 +``` + +```js +// Output (final document written to MetadataKeys) +{ + _id: "Dataset_temperature_Temperature", + key: "temperature", + sourceType: "Dataset", + humanReadableName: "Temperature", + isPublished: true, + userGroups: ["group-1", "group-2"], + userGroupCounts: { "group-1": 2, "group-2": 1 }, + usageCount: 2, + createdBy: "migration", + createdAt: ISODate("...") +} +``` + +--- + +### Stage 10 — Merge into MetadataKeys + +- `whenNotMatched: insert` — new documents are inserted as-is +- `whenMatched` — additively merges counts when two `SOURCE_COLLECTIONS` produce the same `_id` + +> This is not triggered today since `sourceType` is part of `_id`, making collisions between collections impossible. It is kept correct for when `Proposal`, `Sample`, or other collections are added as sources. + +```js +// whenMatched userGroupCounts example +// existing: { "group-1": 3, "group-2": 1 } +// incoming: { "group-1": 2, "group-3": 5 } +// result: { "group-1": 5, "group-2": 1, "group-3": 5 } +``` + +--- + +## Running the migration + +```bash +# Run manually in production — ideally during low-traffic hours. +# The migration is slow but non-blocking: the app continues to serve +# requests while it runs. However, MetadataKeys will be unavailable +# for the duration since the collection is wiped at the start. +npm run migrate:db:up + +# verify +db.MetadataKeys.countDocuments({ userGroupCounts: { $exists: true } }) +# should equal +db.MetadataKeys.countDocuments() +``` + +> ⚠️ **Do not interrupt.** The migration wipes `MetadataKeys` at the start with `deleteMany`. If interrupted, re-run `migrate:db:up` — the wipe ensures a clean slate on retry. + +--- + +## Rollback + +```bash +npm run migrate:db:down +``` + +Wipes the entire `MetadataKeys` collection. The collection will be repopulated on the next `migrate:db:up`. + +Verify the rollback succeeded by checking the migration status: + +```bash +npm run migrate:db:status +``` + +If the migration shows as `pending` it means the rollback was successful and the migration has not been run yet. diff --git a/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js b/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js new file mode 100644 index 000000000..904b48886 --- /dev/null +++ b/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js @@ -0,0 +1,313 @@ +const SOURCE_COLLECTIONS = ["Dataset"]; + +function buildPipeline(sourceType) { + return [ + // ------------------------------------------------------------------------- + // Stage 1: Only process documents that have scientificMetadata + // ------------------------------------------------------------------------- + { + $match: { + scientificMetadata: { $exists: true, $type: "object" }, + }, + }, + + // ------------------------------------------------------------------------- + // Stage 2: Flatten scientificMetadata into an array of key/value pairs. + // Preserve _id as datasetId so we can count unique datasets later. + // ------------------------------------------------------------------------- + { + $project: { + datasetId: "$_id", + ownerGroup: 1, + accessGroups: 1, + isPublished: 1, + metaArr: { $objectToArray: "$scientificMetadata" }, + }, + }, + + // ------------------------------------------------------------------------- + // Stage 3: One document per (dataset, metadata key) + // ------------------------------------------------------------------------- + { $unwind: "$metaArr" }, + + // ------------------------------------------------------------------------- + // Stage 4: Shape each (dataset, key) document. + // userGroups is the union of ownerGroup + accessGroups for this dataset. + // ------------------------------------------------------------------------- + { + $project: { + datasetId: 1, + key: "$metaArr.k", + isPublished: 1, + humanReadableName: { $ifNull: ["$metaArr.v.human_name", ""] }, + userGroups: { + $setUnion: [["$ownerGroup"], { $ifNull: ["$accessGroups", []] }], + }, + }, + }, + + // ------------------------------------------------------------------------- + // Stage 5: One document per (dataset, key, group). + // This is the pivot that makes per-group counting possible. + // preserveNullAndEmptyArrays keeps datasets with no groups so + // usageCount stays accurate even for group-less datasets. + // ------------------------------------------------------------------------- + { + $unwind: { + path: "$userGroups", + preserveNullAndEmptyArrays: true, + }, + }, + + // ------------------------------------------------------------------------- + // Stage 6: Filter out the empty-string sentinel that comes from ownerGroup + // being null/missing. Real group names are never empty strings. + // ------------------------------------------------------------------------- + { + $match: { + userGroups: { $nin: [null, ""] }, + }, + }, + + // ------------------------------------------------------------------------- + // Stage 7: Group by (metaKeyId, group). + // groupCount = how many datasets with this group use this key. + // datasetIds = set of distinct dataset IDs (for accurate usageCount). + // ------------------------------------------------------------------------- + { + $group: { + _id: { + metaKeyId: { + $concat: [`${sourceType}_`, "$key", "_", "$humanReadableName"], + }, + group: "$userGroups", + }, + key: { $first: "$key" }, + humanReadableName: { $first: "$humanReadableName" }, + isPublished: { $max: "$isPublished" }, + groupCount: { $sum: 1 }, + datasetIds: { $addToSet: "$datasetId" }, + }, + }, + + // ------------------------------------------------------------------------- + // Stage 8: Group by metaKeyId to reassemble one document per metadata key. + // userGroupCountsArr will become the userGroupCounts Map. + // datasetIdSets is a list of per-group dataset ID sets — merged in the + // next stage to compute total unique dataset count (usageCount). + // ------------------------------------------------------------------------- + { + $group: { + _id: "$_id.metaKeyId", + key: { $first: "$key" }, + humanReadableName: { $first: "$humanReadableName" }, + isPublished: { $max: "$isPublished" }, + userGroups: { $push: "$_id.group" }, + userGroupCountsArr: { + $push: { k: "$_id.group", v: "$groupCount" }, + }, + datasetIdSets: { $push: "$datasetIds" }, + }, + }, + + // ------------------------------------------------------------------------- + // Stage 9: Final projection. + // userGroupCounts: [{k,v}] array → plain object (stored as Map in Mongoose). + // usageCount: union all per-group datasetId sets, count distinct IDs. + // ------------------------------------------------------------------------- + { + $project: { + _id: 1, + key: 1, + sourceType: { $literal: sourceType }, + humanReadableName: 1, + isPublished: 1, + userGroups: 1, + userGroupCounts: { $arrayToObject: "$userGroupCountsArr" }, + usageCount: { + $size: { + $reduce: { + input: "$datasetIdSets", + initialValue: [], + in: { $setUnion: ["$$value", "$$this"] }, + }, + }, + }, + createdBy: { $literal: "migration" }, + createdAt: { $toDate: "$$NOW" }, + }, + }, + + // ------------------------------------------------------------------------- + // Stage 10: Merge into MetadataKeys. + // whenMatched handles the (future) case where multiple SOURCE_COLLECTIONS + // produce the same _id. Not possible today since sourceType is part of + // the _id, but kept correct for when more collections are added. + // ------------------------------------------------------------------------- + { + $merge: { + into: "MetadataKeys", + on: "_id", + whenMatched: [ + { + $replaceWith: { + $mergeObjects: [ + "$$new", + { + // Preserve original audit fields + createdAt: { $ifNull: ["$createdAt", "$$new.createdAt"] }, + createdBy: { $ifNull: ["$createdBy", "$$new.createdBy"] }, + updatedBy: { $literal: "migration" }, + updatedAt: { $toDate: "$$NOW" }, + + // Merge group arrays + userGroups: { + $setUnion: ["$userGroups", "$$new.userGroups"], + }, + + // Additively merge userGroupCounts — sum counts per group + // across both the existing and incoming documents. + userGroupCounts: { + $arrayToObject: { + $map: { + input: { + $setUnion: [ + { + $map: { + input: { $objectToArray: "$userGroupCounts" }, + as: "e", + in: "$$e.k", + }, + }, + { + $map: { + input: { + $objectToArray: "$$new.userGroupCounts", + }, + as: "e", + in: "$$e.k", + }, + }, + ], + }, + as: "group", + in: { + k: "$$group", + v: { + $add: [ + { + $ifNull: [ + { + $getField: { + field: "$$group", + input: "$userGroupCounts", + }, + }, + 0, + ], + }, + { + $ifNull: [ + { + $getField: { + field: "$$group", + input: "$$new.userGroupCounts", + }, + }, + 0, + ], + }, + ], + }, + }, + }, + }, + }, + + // Any source being published makes the key published + isPublished: { $or: ["$isPublished", "$$new.isPublished"] }, + + // Add incoming dataset count to existing + usageCount: { $add: ["$usageCount", "$$new.usageCount"] }, + }, + ], + }, + }, + ], + whenNotMatched: "insert", + }, + }, + ]; +} + +module.exports = { + async up(db) { + const start = Date.now(); + const elapsed = () => `${((Date.now() - start) / 1000).toFixed(1)}s`; + + // Wipe MetadataKeys collection first to ensure a clean state + const deleted = await db.collection("MetadataKeys").deleteMany({}); + console.log( + `[${elapsed()}] Cleared ${deleted.deletedCount} existing MetadataKeys`, + ); + + for (const collection of SOURCE_COLLECTIONS) { + const total = await db.collection(collection).countDocuments({ + scientificMetadata: { $exists: true, $type: "object" }, + }); + + if (total === 0) { + console.log( + `[${elapsed()}] No documents with scientificMetadata in ${collection}, skipping...`, + ); + continue; + } + + console.log( + `[${elapsed()}] Processing ${total.toLocaleString()} documents from ${collection}...`, + ); + + const timer = setInterval( + () => + console.log( + `[${elapsed()}] ${collection} migration still running...`, + ), + 5000, + ); + + try { + await db + .collection(collection) + .aggregate(buildPipeline(collection), { + allowDiskUse: true, + maxTimeMS: 0, + }) + .toArray(); + } finally { + clearInterval(timer); + } + + console.log(`[${elapsed()}] ✅ ${collection} done`); + } + + const result = await db.collection("MetadataKeys").countDocuments(); + console.log( + `[${elapsed()}] Migration completed — Total MetadataKeys: ${result.toLocaleString()}`, + ); + }, + + async down(db) { + const start = Date.now(); + const elapsed = () => `${((Date.now() - start) / 1000).toFixed(1)}s`; + + const total = await db.collection("MetadataKeys").countDocuments(); + console.log( + `[${elapsed()}] Deleting ${total.toLocaleString()} MetadataKeys...`, + ); + + const deleted = await db.collection("MetadataKeys").deleteMany({}); + console.log( + `[${elapsed()}] Rollback completed — Deleted ${deleted.deletedCount} MetadataKeys`, + ); + }, +}; diff --git a/src/metadata-keys/metadatakeys.service.spec.ts b/src/metadata-keys/metadatakeys.service.spec.ts index 152b96727..7f66ca359 100644 --- a/src/metadata-keys/metadatakeys.service.spec.ts +++ b/src/metadata-keys/metadatakeys.service.spec.ts @@ -1,141 +1,505 @@ -import { Logger } from "@nestjs/common"; import { getModelToken } from "@nestjs/mongoose"; import { Test, TestingModule } from "@nestjs/testing"; import { MetadataKeysService, MetadataSourceDoc } from "./metadatakeys.service"; import { MetadataKeyClass } from "./schemas/metadatakey.schema"; -class MetadataKeyModelMock { - aggregate = jest.fn().mockReturnValue({ - exec: jest.fn().mockResolvedValue([{ key: "k1" }]), - }); - deleteMany = jest.fn().mockReturnValue({ - exec: jest.fn().mockResolvedValue({ deletedCount: 2 }), - }); - insertMany = jest.fn().mockReturnValue([{ _id: "id1" }]); -} +const modelMock = { + aggregate: jest + .fn() + .mockReturnValue({ exec: jest.fn().mockResolvedValue([]) }), + bulkWrite: jest.fn().mockResolvedValue({}), + updateMany: jest.fn().mockResolvedValue({}), + deleteMany: jest.fn().mockResolvedValue({}), +}; + +const BASE_DOC: MetadataSourceDoc = { + sourceType: "Dataset", + userGroups: ["group-1", "group-2"], + isPublished: false, + metadata: { + temperature: { human_name: "Temperature" }, + pressure: {}, + }, +}; describe("MetadataKeysService", () => { let service: MetadataKeysService; - let model: MetadataKeyModelMock; beforeEach(async () => { + jest.clearAllMocks(); + const module: TestingModule = await Test.createTestingModule({ providers: [ MetadataKeysService, { provide: getModelToken(MetadataKeyClass.name), - useClass: MetadataKeyModelMock, + useValue: modelMock, }, ], }).compile(); service = await module.resolve(MetadataKeysService); - model = module.get( - getModelToken(MetadataKeyClass.name), - ) as unknown as MetadataKeyModelMock; }); it("should be defined", () => { expect(service).toBeDefined(); }); - it("findAll builds aggregation pipeline and executes it", async () => { - const filter = { - where: { sourceType: "dataset" }, - fields: ["key"], - limits: { limit: 10, skip: 0, sort: { createdAt: "asc" } }, - }; + // ------------------------------------------------------------------------- + // findAll + // ------------------------------------------------------------------------- - const accessFilter = { userGroups: { $in: ["ess"] } }; + describe("findAll", () => { + it("builds a pipeline with match, project, sort, skip, limit", async () => { + const accessFilter = { userGroups: { $in: ["group-1"] } }; + const filter = { + where: { sourceType: "Dataset" }, + fields: ["key", "humanReadableName"], + limits: { limit: 10, skip: 5, sort: { createdAt: "asc" } }, + }; - const res = await service.findAll(filter, accessFilter); + await service.findAll(filter, accessFilter); - expect(model.aggregate).toHaveBeenCalledTimes(1); + const [pipeline] = modelMock.aggregate.mock.calls[0]; + expect(pipeline[0]).toEqual({ + $match: { $and: [accessFilter, filter.where] }, + }); + expect(pipeline).toEqual( + expect.arrayContaining([ + expect.objectContaining({ $project: expect.any(Object) }), + expect.objectContaining({ $sort: expect.any(Object) }), + expect.objectContaining({ $skip: 5 }), + expect.objectContaining({ $limit: 10 }), + ]), + ); + }); + + it("applies default limits when none provided", async () => { + await service.findAll({}, {}); - const pipeline = model.aggregate.mock.calls[0][0]; - expect(pipeline[0]).toEqual({ - $match: { $and: [accessFilter, filter.where] }, + const [pipeline] = modelMock.aggregate.mock.calls[0]; + expect(pipeline).toEqual( + expect.arrayContaining([ + expect.objectContaining({ $skip: 0 }), + expect.objectContaining({ $limit: 100 }), + ]), + ); }); - expect(res).toEqual([{ key: "k1" }]); + it("omits $project stage when no fields specified", async () => { + await service.findAll({}, {}); + + const [pipeline] = modelMock.aggregate.mock.calls[0]; + const hasProject = pipeline.some((s: object) => "$project" in s); + expect(hasProject).toBe(false); + }); + + it("omits $sort stage when no sort specified", async () => { + await service.findAll({ limits: { limit: 10, skip: 0 } }, {}); + + const [pipeline] = modelMock.aggregate.mock.calls[0]; + const hasSort = pipeline.some((s: object) => "$sort" in s); + expect(hasSort).toBe(false); + }); + + it("returns aggregation results", async () => { + const mockData = [{ key: "temperature" }, { key: "pressure" }]; + modelMock.aggregate.mockReturnValueOnce({ + exec: jest.fn().mockResolvedValue(mockData), + }); + + const result = await service.findAll({}, {}); + + expect(result).toEqual(mockData); + }); }); - it("deleteMany deletes and logs result", async () => { - const logSpy = jest.spyOn(Logger, "log").mockImplementation(); + // ------------------------------------------------------------------------- + // insertManyFromSource + // ------------------------------------------------------------------------- - const filter = { sourceType: "dataset", sourceId: "sid" }; - const result = await service.deleteMany(filter); + describe("insertManyFromSource", () => { + it("does nothing when metadata is empty", async () => { + await service.insertManyFromSource({ ...BASE_DOC, metadata: {} }); + + expect(modelMock.bulkWrite).not.toHaveBeenCalled(); + }); + + it("calls bulkWrite with one updateOne per metadata key", async () => { + await service.insertManyFromSource(BASE_DOC); + + expect(modelMock.bulkWrite).toHaveBeenCalledTimes(1); + const [ops] = modelMock.bulkWrite.mock.calls[0]; + expect(ops).toHaveLength(2); + expect(ops.every((op: object) => "updateOne" in op)).toBe(true); + }); - expect(model.deleteMany).toHaveBeenCalledWith(filter); - expect(result.deletedCount).toBe(2); - expect(logSpy).toHaveBeenCalled(); + it("builds correct _id from sourceType + key + humanReadableName", async () => { + await service.insertManyFromSource(BASE_DOC); + + const [ops] = modelMock.bulkWrite.mock.calls[0]; + const ids = ops.map( + (op: { updateOne: { filter: { _id: string } } }) => + op.updateOne.filter._id, + ); + + expect(ids).toContain("Dataset_temperature_Temperature"); + expect(ids).toContain("Dataset_pressure_"); // no human_name + }); + + it("sets upsert: true on every operation", async () => { + await service.insertManyFromSource(BASE_DOC); + + const [ops] = modelMock.bulkWrite.mock.calls[0]; + expect( + ops.every( + (op: { updateOne: { upsert: boolean } }) => + op.updateOne.upsert === true, + ), + ).toBe(true); + }); + + it("increments usageCount and per-group counts", async () => { + await service.insertManyFromSource(BASE_DOC); + + const [ops] = modelMock.bulkWrite.mock.calls[0]; + const { $inc } = ops[0].updateOne.update; + + expect($inc.usageCount).toBe(1); + expect($inc["userGroupCounts.group-1"]).toBe(1); + expect($inc["userGroupCounts.group-2"]).toBe(1); + }); + + it("adds userGroups via $addToSet", async () => { + await service.insertManyFromSource(BASE_DOC); + + const [ops] = modelMock.bulkWrite.mock.calls[0]; + const { $addToSet } = ops[0].updateOne.update; + + expect($addToSet.userGroups.$each).toEqual(["group-1", "group-2"]); + }); + + it("sets isPublished when dataset is published", async () => { + await service.insertManyFromSource({ ...BASE_DOC, isPublished: true }); + + const [ops] = modelMock.bulkWrite.mock.calls[0]; + expect(ops[0].updateOne.update.$set.isPublished).toBe(true); + }); + + it("does not set isPublished when dataset is not published", async () => { + await service.insertManyFromSource({ ...BASE_DOC, isPublished: false }); + + const [ops] = modelMock.bulkWrite.mock.calls[0]; + expect(ops[0].updateOne.update.$set.isPublished).toBeUndefined(); + }); + + it("uses $setOnInsert for immutable fields", async () => { + await service.insertManyFromSource(BASE_DOC); + + const [ops] = modelMock.bulkWrite.mock.calls[0]; + const { $setOnInsert } = ops[0].updateOne.update; + + expect($setOnInsert).toMatchObject({ + key: expect.any(String), + sourceType: "Dataset", + humanReadableName: expect.any(String), + }); + }); }); - it("insertManyFromSource does nothing when metadata is empty", async () => { - const doc: MetadataSourceDoc = { - sourceId: "sid", - sourceType: "dataset", - ownerGroup: "ess", - accessGroups: ["ess"], - isPublished: false, - metadata: {}, - }; + // ------------------------------------------------------------------------- + // deleteMany + // ------------------------------------------------------------------------- + + describe("deleteMany", () => { + it("does nothing when metadata is empty", async () => { + await service.deleteMany({ ...BASE_DOC, metadata: {} }); + + expect(modelMock.updateMany).not.toHaveBeenCalled(); + expect(modelMock.deleteMany).not.toHaveBeenCalled(); + }); + + it("runs three operations in order: decrement → recompute → delete", async () => { + const callOrder: string[] = []; + modelMock.updateMany.mockImplementation(() => { + callOrder.push("updateMany"); + return Promise.resolve({}); + }); + modelMock.deleteMany.mockImplementation(() => { + callOrder.push("deleteMany"); + return Promise.resolve({}); + }); + + await service.deleteMany(BASE_DOC); + + expect(callOrder).toEqual(["updateMany", "updateMany", "deleteMany"]); + }); + + it("decrements usageCount and per-group counts", async () => { + await service.deleteMany(BASE_DOC); + + const [, firstUpdate] = modelMock.updateMany.mock.calls[0]; + expect(firstUpdate.$inc.usageCount).toBe(-1); + expect(firstUpdate.$inc["userGroupCounts.group-1"]).toBe(-1); + expect(firstUpdate.$inc["userGroupCounts.group-2"]).toBe(-1); + }); - const res = await service.insertManyFromSource(doc); + it("targets correct _ids based on metadata keys and humanReadableName", async () => { + await service.deleteMany(BASE_DOC); - expect(res).toBeUndefined(); - expect(model.insertMany).not.toHaveBeenCalled(); + const [firstFilter] = modelMock.updateMany.mock.calls[0]; + expect(firstFilter._id.$in).toContain("Dataset_temperature_Temperature"); + expect(firstFilter._id.$in).toContain("Dataset_pressure_"); + }); + + it("deletes documents where usageCount <= 0", async () => { + await service.deleteMany(BASE_DOC); + + const [deleteFilter] = modelMock.deleteMany.mock.calls[0]; + expect(deleteFilter.usageCount).toEqual({ $lte: 0 }); + }); + + it("recompute stage uses $set with $objectToArray on userGroupCounts", async () => { + await service.deleteMany(BASE_DOC); + + // second updateMany call is the RECOMPUTE_USER_GROUPS_STAGE + const [, recomputeStage] = modelMock.updateMany.mock.calls[1]; + expect(recomputeStage[0].$set.userGroups).toBeDefined(); + }); }); - it("insertManyFromSource creates metadata keys", async () => { - const doc: MetadataSourceDoc = { - sourceId: "sid", - sourceType: "dataset", - ownerGroup: "ess", - accessGroups: ["swap", "ess"], - isPublished: false, - metadata: { - key1: { human_name: "Key One" }, - key2: {}, - }, - }; + // ------------------------------------------------------------------------- + // replaceManyFromSource + // ------------------------------------------------------------------------- + + describe("replaceManyFromSource", () => { + it("calls insertManyFromSource for keys only in newDoc", async () => { + const insertSpy = jest + .spyOn(service, "insertManyFromSource") + .mockResolvedValue(); + + const oldDoc = { + ...BASE_DOC, + metadata: { temperature: { human_name: "Temperature" } }, + }; + const newDoc = { + ...BASE_DOC, + metadata: { + temperature: { human_name: "Temperature" }, + wavelength: {}, + }, + }; + + await service.replaceManyFromSource(oldDoc, newDoc); + + expect(insertSpy).toHaveBeenCalledWith( + expect.objectContaining({ metadata: { wavelength: {} } }), + ); + }); - const res = await service.insertManyFromSource(doc); + it("calls deleteMany for keys only in oldDoc", async () => { + const deleteSpy = jest.spyOn(service, "deleteMany").mockResolvedValue(); - expect(model.insertMany).toHaveBeenCalledTimes(1); + const oldDoc = { + ...BASE_DOC, + metadata: { temperature: { human_name: "Temperature" }, pressure: {} }, + }; + const newDoc = { + ...BASE_DOC, + metadata: { temperature: { human_name: "Temperature" } }, + }; - const insertedDocs = model.insertMany.mock.calls[0][0]; - expect(insertedDocs).toHaveLength(2); + await service.replaceManyFromSource(oldDoc, newDoc); - expect(insertedDocs[0]).toMatchObject({ - key: "key1", - humanReadableName: "Key One", - sourceType: "dataset", - sourceId: "sid", + expect(deleteSpy).toHaveBeenCalledWith( + expect.objectContaining({ metadata: { pressure: {} } }), + ); }); - expect(res).toEqual([{ _id: "id1" }]); + it("does not call deleteMany or insertManyFromSource for shared unchanged keys", async () => { + const deleteSpy = jest.spyOn(service, "deleteMany").mockResolvedValue(); + const insertSpy = jest + .spyOn(service, "insertManyFromSource") + .mockResolvedValue(); + + const doc = { + ...BASE_DOC, + metadata: { temperature: { human_name: "Temperature" } }, + }; + + await service.replaceManyFromSource(doc, doc); + + expect(deleteSpy).not.toHaveBeenCalled(); + expect(insertSpy).not.toHaveBeenCalled(); + }); + + it("handles all three buckets simultaneously", async () => { + const deleteSpy = jest.spyOn(service, "deleteMany").mockResolvedValue(); + const insertSpy = jest + .spyOn(service, "insertManyFromSource") + .mockResolvedValue(); + + const oldDoc = { + ...BASE_DOC, + metadata: { temperature: { human_name: "Temperature" }, pressure: {} }, + }; + const newDoc = { + ...BASE_DOC, + metadata: { + temperature: { human_name: "Temperature" }, + wavelength: {}, + }, + }; + + await service.replaceManyFromSource(oldDoc, newDoc); + + // pressure removed → deleteMany + expect(deleteSpy).toHaveBeenCalledWith( + expect.objectContaining({ metadata: { pressure: {} } }), + ); + // wavelength added → insertManyFromSource + expect(insertSpy).toHaveBeenCalledWith( + expect.objectContaining({ metadata: { wavelength: {} } }), + ); + }); + + it("does nothing when both docs have empty metadata", async () => { + const deleteSpy = jest.spyOn(service, "deleteMany").mockResolvedValue(); + const insertSpy = jest + .spyOn(service, "insertManyFromSource") + .mockResolvedValue(); + + const empty = { ...BASE_DOC, metadata: {} }; + await service.replaceManyFromSource(empty, empty); + + expect(deleteSpy).not.toHaveBeenCalled(); + expect(insertSpy).not.toHaveBeenCalled(); + }); }); - it("replaceManyFromSource deletes then inserts", async () => { - const doc: MetadataSourceDoc = { - sourceId: "sid", - sourceType: "dataset", - ownerGroup: "ess", - accessGroups: ["swap"], - isPublished: false, - metadata: { key1: {} }, - }; + // ------------------------------------------------------------------------- + // updateSharedKeys (tested via replaceManyFromSource) + // ------------------------------------------------------------------------- + + describe("updateSharedKeys (via replaceManyFromSource)", () => { + it("increments added groups and decrements removed groups", async () => { + const oldDoc: MetadataSourceDoc = { + ...BASE_DOC, + userGroups: ["group-1"], + metadata: { temperature: { human_name: "Temperature" } }, + }; + const newDoc: MetadataSourceDoc = { + ...BASE_DOC, + userGroups: ["group-2"], + metadata: { temperature: { human_name: "Temperature" } }, + }; + + await service.replaceManyFromSource(oldDoc, newDoc); + + const [, update] = modelMock.updateMany.mock.calls[0]; + expect(update.$inc["userGroupCounts.group-2"]).toBe(1); + expect(update.$inc["userGroupCounts.group-1"]).toBe(-1); + }); + + it("runs RECOMPUTE_USER_GROUPS_STAGE when groups are removed", async () => { + const oldDoc: MetadataSourceDoc = { + ...BASE_DOC, + userGroups: ["group-1", "group-2"], + metadata: { temperature: { human_name: "Temperature" } }, + }; + const newDoc: MetadataSourceDoc = { + ...BASE_DOC, + userGroups: ["group-1"], + metadata: { temperature: { human_name: "Temperature" } }, + }; + + await service.replaceManyFromSource(oldDoc, newDoc); + + // two updateMany calls: one for the $inc, one for RECOMPUTE + expect(modelMock.updateMany).toHaveBeenCalledTimes(2); + }); + + it("does not run RECOMPUTE_USER_GROUPS_STAGE when only groups are added", async () => { + const oldDoc: MetadataSourceDoc = { + ...BASE_DOC, + userGroups: ["group-1"], + metadata: { temperature: { human_name: "Temperature" } }, + }; + const newDoc: MetadataSourceDoc = { + ...BASE_DOC, + userGroups: ["group-1", "group-2"], + metadata: { temperature: { human_name: "Temperature" } }, + }; + + await service.replaceManyFromSource(oldDoc, newDoc); + + // only one updateMany — no RECOMPUTE needed when nothing is removed + expect(modelMock.updateMany).toHaveBeenCalledTimes(1); + }); + + it("sets isPublished when it flips from false to true", async () => { + const oldDoc: MetadataSourceDoc = { + ...BASE_DOC, + isPublished: false, + metadata: { temperature: { human_name: "Temperature" } }, + }; + const newDoc: MetadataSourceDoc = { + ...BASE_DOC, + isPublished: true, + metadata: { temperature: { human_name: "Temperature" } }, + }; + + await service.replaceManyFromSource(oldDoc, newDoc); + + const [, update] = modelMock.updateMany.mock.calls[0]; + expect(update.$set.isPublished).toBe(true); + }); + + it("does not touch isPublished when it flips from true to false", async () => { + const oldDoc: MetadataSourceDoc = { + ...BASE_DOC, + isPublished: true, + metadata: { temperature: { human_name: "Temperature" } }, + }; + const newDoc: MetadataSourceDoc = { + ...BASE_DOC, + isPublished: false, + metadata: { temperature: { human_name: "Temperature" } }, + }; + + await service.replaceManyFromSource(oldDoc, newDoc); + + // no group changes and no publishedFlippedOn → updateMany not called at all + expect(modelMock.updateMany).not.toHaveBeenCalled(); + }); + + it("treats humanReadableName change as delete + insert", async () => { + const deleteSpy = jest.spyOn(service, "deleteMany").mockResolvedValue(); + const insertSpy = jest + .spyOn(service, "insertManyFromSource") + .mockResolvedValue(); - const deleteSpy = jest.spyOn(service, "deleteMany"); - const insertSpy = jest.spyOn(service, "insertManyFromSource"); + const oldDoc: MetadataSourceDoc = { + ...BASE_DOC, + metadata: { temperature: { human_name: "Temperature" } }, + }; + const newDoc: MetadataSourceDoc = { + ...BASE_DOC, + metadata: { temperature: { human_name: "Temp (renamed)" } }, + }; - await service.replaceManyFromSource(doc); + await service.replaceManyFromSource(oldDoc, newDoc); - expect(deleteSpy).toHaveBeenCalledWith({ - sourceId: "sid", - sourceType: "dataset", + expect(deleteSpy).toHaveBeenCalledWith( + expect.objectContaining({ + metadata: { temperature: { human_name: "Temperature" } }, + }), + ); + expect(insertSpy).toHaveBeenCalledWith( + expect.objectContaining({ + metadata: { temperature: { human_name: "Temp (renamed)" } }, + }), + ); }); - expect(insertSpy).toHaveBeenCalledWith(doc); }); }); diff --git a/src/metadata-keys/metadatakeys.service.ts b/src/metadata-keys/metadatakeys.service.ts index 054c0fe7d..0a2b84a63 100644 --- a/src/metadata-keys/metadatakeys.service.ts +++ b/src/metadata-keys/metadatakeys.service.ts @@ -23,6 +23,27 @@ export type MetadataSourceDoc = { metadata: Record; }; +// Recomputes the userGroups string array from userGroupCounts after a decrement. +// Retains only group names whose reference count is still above zero. +const RECOMPUTE_USER_GROUPS_STAGE = [ + { + $set: { + userGroups: { + $map: { + input: { + $filter: { + input: { $objectToArray: "$userGroupCounts" }, + cond: { $gt: ["$$this.v", 0] }, + }, + }, + as: "entry", + in: "$$entry.k", + }, + }, + }, + }, +]; + @Injectable({ scope: Scope.REQUEST }) export class MetadataKeysService { constructor( @@ -71,70 +92,116 @@ export class MetadataKeysService { return data; } - async deleteMany(doc: MetadataSourceDoc): Promise { - if (isEmpty(doc.metadata)) { - return; - } - const metadataKeys = Object.keys(doc.metadata); - const ids = metadataKeys.map((key) => `${doc.sourceType}_${key}`); - const filter = { id: { $in: ids } }; - - const decrementUsageOp = { - updateMany: { filter, update: { $inc: { usageCount: -1 } } }, - }; - const deleteUnusedOp = { - deleteMany: { filter: { ...filter, usageCount: { $lte: 0 } } }, - }; - - await this.metadataKeyModel.bulkWrite([decrementUsageOp, deleteUnusedOp]); - - Logger.log( - `Removed or decremented MetadataKeys usageCount from source ${doc.sourceType} for key(s): ${metadataKeys.join(", ")}`, - ); - } - + /** + * Called when a dataset is created or gains new metadata keys. + * + * For each key: + * - Creates a new MetadataKey entry if none exists (upsert) + * - Increments usageCount and per-group reference counts + * - Adds new groups to the userGroups query array + * - Sets isPublished if the source dataset is published (never unsets inline) + * - Updates humanReadableName to the latest value + */ async insertManyFromSource(doc: MetadataSourceDoc): Promise { - if (isEmpty(doc.metadata)) { - return; - } + if (isEmpty(doc.metadata)) return; - const metadata = doc.metadata; - const userGroups = doc.userGroups; + const { sourceType, userGroups, isPublished, metadata } = doc; - const upsertOps = Object.entries(metadata).map(([key, entry]) => { - const id = `${doc.sourceType}_${key}`; + const ops = Object.entries(metadata).map(([key, entry]) => { const humanReadableName = (entry as ScientificMetadataEntry).human_name ?? ""; + const id = this.buildId(sourceType, key, humanReadableName); + return { updateOne: { - filter: { id }, + filter: { _id: id }, update: { $setOnInsert: addCreatedByFields( { - _id: id as MetadataKeyDocument["_id"], - id, + _id: id as unknown as MetadataKeyDocument["_id"], key, - sourceType: doc.sourceType, + sourceType, humanReadableName, }, "system", ), - ...(doc.isPublished && { $set: { isPublished: true } }), + $set: { + // Only ever set to true inline. The false case is handled + // by the reconciliation cronjob since it transitions rarely. + ...(isPublished && { isPublished: true }), + }, $addToSet: { userGroups: { $each: userGroups } }, - $inc: { usageCount: 1 }, + $inc: { + usageCount: 1, + ...this.groupCountDeltas(userGroups, 1), + }, }, upsert: true, }, }; }); - await this.metadataKeyModel.bulkWrite(upsertOps); + await this.metadataKeyModel.bulkWrite(ops); Logger.log( - `Created or incremented MetadataKeys usageCount from source ${doc.sourceType} for key(s): ${Object.keys(metadata).join(", ")}`, + `Upserted MetadataKeys for ${sourceType}: ${Object.keys(metadata).join(", ")}`, ); } + /** + * Called when a dataset is deleted or loses metadata keys. + * + * Three-step process to ensure groups are safely removed: + * 1. Decrement usageCount and per-group reference counts + * 2. Recompute userGroups array from the updated counts + * (drops any group whose count reached zero) + * 3. Delete entries no longer referenced by any dataset + * + * usageCount is the authoritative deletion signal because a dataset + * with no userGroups and isPublished: false would be invisible to + * both userGroupCounts and isPublished checks. + */ + async deleteMany(doc: MetadataSourceDoc): Promise { + if (isEmpty(doc.metadata)) return; + + const { sourceType, userGroups, metadata } = doc; + const keys = Object.keys(metadata); + const ids = keys.map((key) => { + const humanReadableName = + (metadata[key] as ScientificMetadataEntry)?.human_name ?? ""; + return this.buildId(sourceType, key, humanReadableName); + }); + const filter = { _id: { $in: ids } }; + + await this.metadataKeyModel.updateMany(filter, { + $inc: { + usageCount: -1, + ...this.groupCountDeltas(userGroups, -1), + }, + }); + + await this.metadataKeyModel.updateMany(filter, RECOMPUTE_USER_GROUPS_STAGE); + + await this.metadataKeyModel.deleteMany({ + ...filter, + usageCount: { $lte: 0 }, + }); + + Logger.log( + `Decremented or deleted MetadataKeys for ${sourceType}: ${keys.join(", ")}`, + ); + } + /** + * Called when a dataset is updated. + * + * Diffs the old and new metadata to produce three disjoint key sets: + * - added: only in newDoc → insertManyFromSource + * - removed: only in oldDoc → deleteMany + * - shared: in both → updateSharedKeys (handles group / isPublished + * / humanReadableName changes) + * + * The three sets are disjoint by _id so Promise.all is safe. + */ async replaceManyFromSource( oldDoc: MetadataSourceDoc, newDoc: MetadataSourceDoc, @@ -144,23 +211,135 @@ export class MetadataKeysService { const addedKeys = newKeys.filter((k) => !oldKeys.includes(k)); const removedKeys = oldKeys.filter((k) => !newKeys.includes(k)); + const sharedKeys = newKeys.filter((k) => oldKeys.includes(k)); + + await Promise.all([ + removedKeys.length > 0 && + this.deleteMany({ + ...oldDoc, + metadata: Object.fromEntries( + removedKeys.map((k) => [k, oldDoc.metadata[k]]), + ), + }), + + addedKeys.length > 0 && + this.insertManyFromSource({ + ...newDoc, + metadata: Object.fromEntries( + addedKeys.map((k) => [k, newDoc.metadata[k]]), + ), + }), + + sharedKeys.length > 0 && + this.updateSharedKeys(sharedKeys, oldDoc, newDoc), + ]); + } + + private buildId( + sourceType: string, + key: string, + humanReadableName: string, + ): string { + return `${sourceType}_${key}_${humanReadableName}`; + } - if (removedKeys.length > 0) { - await this.deleteMany({ - ...oldDoc, - metadata: Object.fromEntries( - removedKeys.map((k) => [k, oldDoc.metadata[k]]), - ), + /** + * Builds $inc paths for per-group reference counts. + * + * groupCountDeltas(["groupA", "groupB"], 1) + * => { "userGroupCounts.groupA": 1, "userGroupCounts.groupB": 1 } + * + * MongoDB's dot-notation $inc creates the key if missing and increments + * if it already exists, making this safe for both first-time and + * subsequent upserts. + */ + private groupCountDeltas( + groups: string[], + delta: 1 | -1, + ): Record { + return Object.fromEntries( + groups.map((g) => [`userGroupCounts.${g}`, delta]), + ); + } + + /** + * Handles keys present in both the old and new dataset version. + * + * Checks three things that may have changed independently: + * 1. userGroups — increments added groups, decrements removed groups, + * then recomputes the userGroups array from the updated counts + * 2. isPublished — only sets true inline; false is left to the cronjob + * 3. humanReadableName — updated per-key where it differs + */ + private async updateSharedKeys( + sharedKeys: string[], + oldDoc: MetadataSourceDoc, + newDoc: MetadataSourceDoc, + ): Promise { + const { sourceType } = newDoc; + const ids = sharedKeys.map((k) => { + const humanReadableName = + (newDoc.metadata[k] as ScientificMetadataEntry)?.human_name ?? ""; + return this.buildId(sourceType, k, humanReadableName); + }); + const filter = { _id: { $in: ids } }; + + const addedGroups = newDoc.userGroups.filter( + (g) => !oldDoc.userGroups.includes(g), + ); + const removedGroups = oldDoc.userGroups.filter( + (g) => !newDoc.userGroups.includes(g), + ); + const publishedFlippedOn = !oldDoc.isPublished && newDoc.isPublished; + const hasGroupChanges = addedGroups.length > 0 || removedGroups.length > 0; + + if (hasGroupChanges || publishedFlippedOn) { + await this.metadataKeyModel.updateMany(filter, { + ...(addedGroups.length > 0 && { + $addToSet: { userGroups: { $each: addedGroups } }, + }), + $inc: { + ...(addedGroups.length > 0 && this.groupCountDeltas(addedGroups, 1)), + ...(removedGroups.length > 0 && + this.groupCountDeltas(removedGroups, -1)), + }, + ...(publishedFlippedOn && { $set: { isPublished: true } }), }); + + // Recompute userGroups to drop groups whose count reached zero + if (removedGroups.length > 0) { + await this.metadataKeyModel.updateMany( + filter, + RECOMPUTE_USER_GROUPS_STAGE, + ); + } } - if (addedKeys.length > 0) { - await this.insertManyFromSource({ - ...newDoc, - metadata: Object.fromEntries( - addedKeys.map((k) => [k, newDoc.metadata[k]]), - ), - }); + // humanReadableName is part of _id, so a change means a different document. + // Treat it as delete the old entry + insert the new one. + const humanNameChangedKeys = sharedKeys.filter((k) => { + const oldName = + (oldDoc.metadata[k] as ScientificMetadataEntry)?.human_name ?? ""; + const newName = + (newDoc.metadata[k] as ScientificMetadataEntry)?.human_name ?? ""; + return oldName !== newName; + }); + + if (humanNameChangedKeys.length > 0) { + await Promise.all([ + this.deleteMany({ + ...oldDoc, + metadata: Object.fromEntries( + humanNameChangedKeys.map((k) => [k, oldDoc.metadata[k]]), + ), + }), + this.insertManyFromSource({ + ...newDoc, + metadata: Object.fromEntries( + humanNameChangedKeys.map((k) => [k, newDoc.metadata[k]]), + ), + }), + ]); } } } diff --git a/src/metadata-keys/schemas/metadatakey.schema.ts b/src/metadata-keys/schemas/metadatakey.schema.ts index d49b74ee8..bbf602d32 100644 --- a/src/metadata-keys/schemas/metadatakey.schema.ts +++ b/src/metadata-keys/schemas/metadatakey.schema.ts @@ -90,10 +90,26 @@ export class MetadataKeyClass extends QueryableClass { }) @Prop({ type: Number, default: 0 }) usageCount: number; + + @ApiProperty({ + type: Object, + description: + "Tracks how many datasets per user group reference this metadata key. " + + "Used to safely remove groups from userGroups when the last dataset " + + "contributing that group is deleted or updated. " + + "e.g. { 'groupA': 3, 'groupB': 1 } means 3 datasets with groupA and 1 with groupB use this key.", + }) + @Prop({ type: Map, of: Number, default: {} }) + userGroupCounts: Map; } export const MetadataKeySchema = SchemaFactory.createForClass(MetadataKeyClass); -MetadataKeySchema.index({ sourceType: 1, sourceId: 1 }); +MetadataKeySchema.index({ sourceType: 1, isPublished: 1, key: 1 }); +MetadataKeySchema.index({ + sourceType: 1, + isPublished: 1, + humanReadableName: 1, +}); MetadataKeySchema.index({ sourceType: 1, userGroups: 1, key: 1 }); MetadataKeySchema.index({ sourceType: 1, userGroups: 1, humanReadableName: 1 }); diff --git a/src/metadata-keys/types/metadatakeys-filter-content.ts b/src/metadata-keys/types/metadatakeys-filter-content.ts index b7a38afab..e9804e0b5 100644 --- a/src/metadata-keys/types/metadatakeys-filter-content.ts +++ b/src/metadata-keys/types/metadatakeys-filter-content.ts @@ -9,8 +9,8 @@ const FILTERS: Record<"limits" | "fields" | "where" | "include", object> = { type: "object", example: { sourceType: "dataset", - sourceId: "datasetId", key: "metadata_key_name", + humanReadableName: "Metadata Key Name", }, }, include: {}, diff --git a/test/MetadataKeys.js b/test/MetadataKeys.js index c62e117f0..08b6f1e2a 100644 --- a/test/MetadataKeys.js +++ b/test/MetadataKeys.js @@ -2,173 +2,583 @@ const utils = require("./LoginUtils"); const { TestData } = require("./TestData"); -let accessTokenAdminIngestor = null; +let accessTokenAdmin = null; +let accessTokenArchiveManager = null; let accessTokenUser1 = null; -let datasetIdPrivate = null; -let datasetIdUser1 = null; -describe("MetadataKeys v4 ACL", () => { - before(async () => { - db.collection("MetadataKeys").deleteMany({}); +let pidGroup1 = null; // ownerGroup: group1, not published +let pidGroup2 = null; // ownerGroup: group2, not published +let pidPublished = null; // ownerGroup: group2, isPublished: true + +// --------------------------------------------------------------------------- +// Dataset fixtures — controlled ownerGroup for predictable access control +// --------------------------------------------------------------------------- +const datasetGroup1 = { + ...TestData.DatasetWithScientificMetadataV4, + ownerGroup: "group1", + accessGroups: [], + isPublished: false, +}; + +const datasetGroup2 = { + ...TestData.DatasetWithScientificMetadataV4, + ownerGroup: "group2", + accessGroups: ["group2"], + isPublished: false, + // extra key exclusive to group2 so access control tests are unambiguous + scientificMetadata: { + ...TestData.DatasetWithScientificMetadataV4.scientificMetadata, + group2_only_key: { value: "exclusive to group2" }, + }, +}; + +const datasetPublished = { + ...TestData.DatasetWithScientificMetadataV4, + ownerGroup: "group2", + accessGroups: ["group2"], + isPublished: true, +}; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- +async function getMetadataKeys(filter, token) { + const req = request(appUrl) + .get( + `/api/v4/metadatakeys?filter=${encodeURIComponent(JSON.stringify(filter))}`, + ) + .set("Accept", "application/json"); + + if (token) { + req.set({ Authorization: `Bearer ${token}` }); + } + + return req + .expect(TestData.SuccessfulGetStatusCode) + .expect("Content-Type", /json/); +} + +async function createDataset(dataset) { + const res = await request(appUrl) + .post("/api/v3/Datasets") + .send(dataset) + .set("Accept", "application/json") + .set({ Authorization: `Bearer ${accessTokenAdmin}` }) + .expect(TestData.EntryCreatedStatusCode); + return res.body.pid; +} + +async function patchDataset(pid, payload) { + return request(appUrl) + .patch("/api/v4/datasets/" + encodeURIComponent(pid)) + .send(payload) + .set("Accept", "application/json") + .set({ Authorization: `Bearer ${accessTokenAdmin}` }) + .expect(TestData.SuccessfulGetStatusCode); +} - accessTokenAdminIngestor = await utils.getToken(appUrl, { +async function deleteDataset(pid) { + return request(appUrl) + .delete("/api/v3/Datasets/" + encodeURIComponent(pid)) + .set("Accept", "application/json") + .set({ Authorization: `Bearer ${accessTokenArchiveManager}` }) + .expect(TestData.SuccessfulDeleteStatusCode); +} + +describe("2000: MetadataKeys: Access Control and Search", () => { + before(async () => { + accessTokenAdmin = await utils.getToken(appUrl, { username: "adminIngestor", password: TestData.Accounts["adminIngestor"]["password"], }); + accessTokenArchiveManager = await utils.getToken(appUrl, { + username: "archiveManager", + password: TestData.Accounts["archiveManager"]["password"], + }); + accessTokenUser1 = await utils.getToken(appUrl, { username: "user1", password: TestData.Accounts["user1"]["password"], }); + + pidGroup1 = await createDataset(datasetGroup1); + pidGroup2 = await createDataset(datasetGroup2); + pidPublished = await createDataset(datasetPublished); }); - it("0000: create a private dataset v3 has scientific metadata for admin", async () => { - return request(appUrl) - .post("/api/v3/Datasets") - .send(TestData.RawCorrectRandom) - .set("Accept", "application/json") - .set({ Authorization: `Bearer ${accessTokenAdminIngestor}` }) - .expect("Content-Type", /json/) - .expect(TestData.EntryCreatedStatusCode) - .then((res) => { - res.body.should.have - .property("datasetName") - .and.equal(TestData.RawCorrectRandom.datasetName); - - datasetIdPrivate = encodeURIComponent(res.body["pid"]); - }); - }); - - it("0001: create a public dataset v4 has scientific metadata for unauthenticated user", async () => { - const publicDataset = { ...TestData.RawCorrectV4, isPublished: true }; - return request(appUrl) - .post("/api/v4/Datasets") - .send(publicDataset) - .set("Accept", "application/json") - .set({ Authorization: `Bearer ${accessTokenAdminIngestor}` }) - .expect("Content-Type", /json/) - .expect(TestData.EntryCreatedStatusCode) - .then((res) => { - res.body.should.have - .property("datasetName") - .and.equal(publicDataset.datasetName); - }); - }); - - it("0002: create a private dataset v4 has scientific metadata for user1", async () => { - const user1Dataset = { ...TestData.RawCorrectV4, accessGroups: ["group1"] }; - return request(appUrl) - .post("/api/v4/Datasets") - .send(user1Dataset) - .set("Accept", "application/json") - .set({ Authorization: `Bearer ${accessTokenAdminIngestor}` }) - .expect("Content-Type", /json/) - .expect(TestData.EntryCreatedStatusCode) - .then((res) => { - res.body.should.have - .property("datasetName") - .and.equal(user1Dataset.datasetName); - - datasetIdUser1 = encodeURIComponent(res.body["pid"]); - }); - }); - - it("0010: should allow admin to list all metadata keys", async () => { - const filter = { - limits: { limit: 10, skip: 0, sort: { createdAt: "desc" } }, - }; + after(async () => { + for (const pid of [pidGroup1, pidGroup2, pidPublished]) { + if (pid) await deleteDataset(pid); + } + }); + + // ------------------------------------------------------------------------- + // Basic access + // ------------------------------------------------------------------------- + + it("0100: admin can fetch metadata keys and gets results from all groups", async () => { + const res = await getMetadataKeys( + { where: { sourceType: "Dataset" } }, + accessTokenAdmin, + ); + + res.body.should.be.an("array").and.have.length.greaterThan(0); + + const allGroups = res.body.flatMap((k) => k.userGroups); + allGroups.should.include("group1"); + allGroups.should.include("group2"); + }); + + it("0110: user1 can fetch metadata keys and only sees group1 keys", async () => { + const res = await getMetadataKeys( + { where: { sourceType: "Dataset" } }, + accessTokenUser1, + ); + + res.body.should.be.an("array").and.have.length.greaterThan(0); + res.body.forEach((k) => { + k.userGroups.should.include("group1"); + }); + }); + + it("0120: user1 cannot see keys exclusive to group2", async () => { + const res = await getMetadataKeys( + { where: { sourceType: "Dataset", key: "group2_only_key" } }, + accessTokenUser1, + ); + + res.body.should.be.an("array").and.have.lengthOf(0); + }); + + it("0130: unauthenticated user can only see published metadata keys", async () => { + const res = await getMetadataKeys( + { where: { sourceType: "Dataset" } }, + null, + ); + + res.body.should.be.an("array"); + res.body.forEach((k) => { + k.isPublished.should.equal(true); + }); + }); + + it("0140: unauthenticated user cannot see unpublished keys", async () => { + const res = await getMetadataKeys( + { where: { sourceType: "Dataset", isPublished: false } }, + null, + ); + + res.body.should.be.an("array").and.have.lengthOf(0); + }); + + // ------------------------------------------------------------------------- + // Search — key + // ------------------------------------------------------------------------- + + it("0200: admin can search keys by exact key name", async () => { + const res = await getMetadataKeys( + { where: { sourceType: "Dataset", key: "with_number" } }, + accessTokenAdmin, + ); + + res.body.should.be.an("array"); + res.body.forEach((k) => k.key.should.equal("with_number")); + }); + + it("0210: admin can search keys by partial key name using $regex", async () => { + const res = await getMetadataKeys( + { + where: { + sourceType: "Dataset", + key: { $regex: "with_", $options: "i" }, + }, + }, + accessTokenAdmin, + ); + + res.body.should.be.an("array").and.have.length.greaterThan(0); + res.body.forEach((k) => k.key.should.match(/with_/i)); + }); + + it("0220: regex search returns no results for non-matching pattern", async () => { + const res = await getMetadataKeys( + { + where: { + sourceType: "Dataset", + key: { $regex: "nonexistent_xyz", $options: "i" }, + }, + }, + accessTokenAdmin, + ); + + res.body.should.be.an("array").and.have.lengthOf(0); + }); + + it("0230: user1 regex search only returns keys from accessible groups", async () => { + const res = await getMetadataKeys( + { + where: { + sourceType: "Dataset", + key: { $regex: "with_", $options: "i" }, + }, + }, + accessTokenUser1, + ); + + res.body.should.be.an("array"); + res.body.forEach((k) => k.userGroups.should.include("group1")); + }); + + // ------------------------------------------------------------------------- + // Search — humanReadableName + // ------------------------------------------------------------------------- + + it("0240: admin can search by humanReadableName using $regex", async () => { + const res = await getMetadataKeys( + { + where: { + sourceType: "Dataset", + humanReadableName: { $regex: "pressure", $options: "i" }, + }, + }, + accessTokenAdmin, + ); + + res.body.should.be.an("array").and.have.length.greaterThan(0); + res.body.forEach((k) => k.humanReadableName.should.match(/pressure/i)); + }); + + it("0250: humanReadableName regex search returns empty for no match", async () => { + const res = await getMetadataKeys( + { + where: { + sourceType: "Dataset", + humanReadableName: { $regex: "nonexistent_xyz", $options: "i" }, + }, + }, + accessTokenAdmin, + ); + + res.body.should.be.an("array").and.have.lengthOf(0); + }); + + it("0260: keys without human_name have empty humanReadableName", async () => { + const res = await getMetadataKeys( + { where: { sourceType: "Dataset", key: "with_key_value" } }, + accessTokenAdmin, + ); + + res.body.should.be.an("array").and.have.length.greaterThan(0); + res.body[0].humanReadableName.should.equal(""); + }); + + // ------------------------------------------------------------------------- + // Pagination + // ------------------------------------------------------------------------- + + it("0300: limit is respected", async () => { + const res = await getMetadataKeys( + { where: { sourceType: "Dataset" }, limits: { limit: 2, skip: 0 } }, + accessTokenAdmin, + ); + + res.body.should.be.an("array").and.have.length.at.most(2); + }); + + it("0310: skip is respected", async () => { + const resAll = await getMetadataKeys( + { where: { sourceType: "Dataset" }, limits: { limit: 100, skip: 0 } }, + accessTokenAdmin, + ); + const resSkipped = await getMetadataKeys( + { where: { sourceType: "Dataset" }, limits: { limit: 100, skip: 1 } }, + accessTokenAdmin, + ); + + resSkipped.body.length.should.equal(resAll.body.length - 1); + }); - return request(appUrl) - .get( - `/api/v4/metadatakeys?filter=${encodeURIComponent(JSON.stringify(filter))}`, - ) - .set("Accept", "application/json") - .set({ Authorization: `Bearer ${accessTokenAdminIngestor}` }) - .expect(TestData.SuccessfulGetStatusCode) - .expect("Content-Type", /json/) - .then((res) => { - res.body.should.be.an("array").that.satisfies((arr) => { - const values = arr.map((item) => item.isPublished); - return values.includes(true) && values.includes(false); - }); - }); - }); - - it("0020: should allow unauthenticated user to list only published metadata keys", async () => { - const filter = { - where: { sourceType: "dataset" }, - limits: { limit: 10, skip: 0, sort: { createdAt: "desc" } }, + // ------------------------------------------------------------------------- + // Fields projection + // ------------------------------------------------------------------------- + + it("0400: fields projection returns only requested fields", async () => { + const res = await getMetadataKeys( + { + where: { sourceType: "Dataset" }, + fields: ["key", "sourceType"], + limits: { limit: 5, skip: 0 }, + }, + accessTokenAdmin, + ); + + res.body.should.be.an("array").and.have.length.greaterThan(0); + res.body.forEach((k) => { + k.should.have.property("key"); + k.should.have.property("sourceType"); + k.should.not.have.property("userGroups"); + k.should.not.have.property("userGroupCounts"); + k.should.not.have.property("usageCount"); + }); + }); + + // ------------------------------------------------------------------------- + // Response shape + // ------------------------------------------------------------------------- + + it("0500: response documents have expected shape", async () => { + const res = await getMetadataKeys( + { where: { sourceType: "Dataset" }, limits: { limit: 1, skip: 0 } }, + accessTokenAdmin, + ); + + res.body.should.be.an("array").and.have.length.greaterThan(0); + + const doc = res.body[0]; + doc.should.have.property("key").and.be.a("string"); + doc.should.have.property("sourceType").and.equal("Dataset"); + doc.should.have.property("humanReadableName").and.be.a("string"); + doc.should.have.property("userGroups").and.be.an("array"); + doc.should.have.property("isPublished").and.be.a("boolean"); + doc.should.have.property("usageCount").and.be.a("number"); + }); + + it("0510: usageCount reflects number of datasets using the key", async () => { + // with_number exists in all 3 datasets created in before() + const res = await getMetadataKeys( + { where: { sourceType: "Dataset", key: "with_number" } }, + accessTokenAdmin, + ); + + res.body.should.be.an("array").and.have.length.greaterThan(0); + res.body[0].usageCount.should.equal(3); + }); + + // ------------------------------------------------------------------------- + // Mutation tests + // ------------------------------------------------------------------------- + + it("0600: updating human_name renames the MetadataKey (delete old + insert new)", async () => { + const pid = await createDataset({ + ...TestData.DatasetWithScientificMetadataV4, + ownerGroup: "group1", + accessGroups: [], + isPublished: false, + scientificMetadata: { + rename_test_key: { value: 1, human_name: "Original Name" }, + }, + }); + + // Old key exists + const before = await getMetadataKeys( + { + where: { + sourceType: "Dataset", + key: "rename_test_key", + humanReadableName: "Original Name", + }, + }, + accessTokenAdmin, + ); + before.body.should.have.lengthOf(1); + + await patchDataset(pid, { + scientificMetadata: { + rename_test_key: { value: 1, human_name: "Renamed Name" }, + }, + }); + + // Old key gone + const afterOld = await getMetadataKeys( + { + where: { + sourceType: "Dataset", + key: "rename_test_key", + humanReadableName: "Original Name", + }, + }, + accessTokenAdmin, + ); + afterOld.body.should.have.lengthOf(0); + + // New key exists + const afterNew = await getMetadataKeys( + { + where: { + sourceType: "Dataset", + key: "rename_test_key", + humanReadableName: "Renamed Name", + }, + }, + accessTokenAdmin, + ); + afterNew.body.should.have.lengthOf(1); + afterNew.body[0].usageCount.should.equal(1); + + await deleteDataset(pid); + }); + + it("0610: removing a key from scientificMetadata decrements usageCount", async () => { + const pidA = await createDataset({ + ...TestData.DatasetWithScientificMetadataV4, + ownerGroup: "group1", + accessGroups: [], + isPublished: false, + scientificMetadata: { shared_key: { value: 1 } }, + }); + + const pidB = await createDataset({ + ...TestData.DatasetWithScientificMetadataV4, + ownerGroup: "group1", + accessGroups: [], + isPublished: false, + scientificMetadata: { shared_key: { value: 2 } }, + }); + + const filterKey = { where: { sourceType: "Dataset", key: "shared_key" } }; + + // usageCount starts at 2 + const before = await getMetadataKeys(filterKey, accessTokenAdmin); + before.body.should.have.lengthOf(1); + before.body[0].usageCount.should.equal(2); + + // Remove key from datasetA + await patchDataset(pidA, { scientificMetadata: {} }); + + // usageCount drops to 1, key still exists + const after = await getMetadataKeys(filterKey, accessTokenAdmin); + after.body.should.have.lengthOf(1); + after.body[0].usageCount.should.equal(1); + + await deleteDataset(pidA); + await deleteDataset(pidB); + }); + + it("0620: removing a key from the last dataset deletes the MetadataKey entirely", async () => { + const pid = await createDataset({ + ...TestData.DatasetWithScientificMetadataV4, + ownerGroup: "group1", + accessGroups: [], + isPublished: false, + scientificMetadata: { sole_key: { value: 42 } }, + }); + + const filterKey = { where: { sourceType: "Dataset", key: "sole_key" } }; + + const before = await getMetadataKeys(filterKey, accessTokenAdmin); + before.body.should.have.lengthOf(1); + before.body[0].usageCount.should.equal(1); + + await patchDataset(pid, { scientificMetadata: {} }); + + const after = await getMetadataKeys(filterKey, accessTokenAdmin); + after.body.should.have.lengthOf(0); + + await deleteDataset(pid); + }); + + it("0630: adding a new key to scientificMetadata creates a new MetadataKey", async () => { + const pid = await createDataset({ + ...TestData.DatasetWithScientificMetadataV4, + ownerGroup: "group1", + accessGroups: [], + isPublished: false, + scientificMetadata: { original_key: { value: 1 } }, + }); + + const filterNewKey = { + where: { sourceType: "Dataset", key: "brand_new_key" }, }; - return request(appUrl) - .get( - `/api/v4/metadatakeys?filter=${encodeURIComponent(JSON.stringify(filter))}`, - ) - .set("Accept", "application/json") - .expect(TestData.SuccessfulGetStatusCode) - .expect("Content-Type", /json/) - .then((res) => { - res.body.should.be.an("array").that.satisfies((arr) => { - return arr.every((item) => item.isPublished === true); - }); - }); - }); - - it("0030: should allow authenticated user to list metadata keys they have access", async () => { - const filter = { limits: { limit: 1, skip: 0 } }; - - return request(appUrl) - .get( - `/api/v4/metadatakeys?filter=${encodeURIComponent(JSON.stringify(filter))}`, - ) - .set("Accept", "application/json") - .set({ Authorization: `Bearer ${accessTokenUser1}` }) - .expect(TestData.SuccessfulGetStatusCode) - .expect("Content-Type", /json/) - .then((res) => { - res.body.should.be.an("array").that.satisfies((arr) => { - return arr.every((item) => { - return ( - item.isPublished === true || item.userGroups.includes("group1") - ); - }); - }); - }); - }); - - it("0040: should return empty array when user queries keys they don't have access to", async () => { - const filter = { - where: { sourceType: "dataset", sourceId: datasetIdPrivate }, - limits: { limit: 10, skip: 0 }, + // Key does not exist yet + const before = await getMetadataKeys(filterNewKey, accessTokenAdmin); + before.body.should.have.lengthOf(0); + + await patchDataset(pid, { + scientificMetadata: { + original_key: { value: 1 }, + brand_new_key: { value: 99, human_name: "Brand New Key" }, + }, + }); + + const after = await getMetadataKeys(filterNewKey, accessTokenAdmin); + after.body.should.have.lengthOf(1); + after.body[0].key.should.equal("brand_new_key"); + after.body[0].humanReadableName.should.equal("Brand New Key"); + after.body[0].usageCount.should.equal(1); + + await deleteDataset(pid); + }); + + it("0640: changing ownerGroup updates userGroups — old group removed, new group added", async () => { + const pid = await createDataset({ + ...TestData.DatasetWithScientificMetadataV4, + ownerGroup: "group1", + accessGroups: [], + isPublished: false, + scientificMetadata: { group_change_key: { value: 1 } }, + }); + + const filterKey = { + where: { sourceType: "Dataset", key: "group_change_key" }, }; - return request(appUrl) - .get( - `/api/v4/metadatakeys?filter=${encodeURIComponent(JSON.stringify(filter))}`, - ) - .set("Accept", "application/json") - .set({ Authorization: `Bearer ${accessTokenUser1}` }) - .expect(TestData.SuccessfulGetStatusCode) - .expect("Content-Type", /json/) - .then((res) => { - res.body.should.be.an("array").and.have.length(0); - }); - }); - - it("0040: should return metadatakeys with correct access for user1", async () => { - const filter = { - where: { sourceType: "dataset", sourceId: `${datasetIdUser1}` }, - limits: { limit: 10, skip: 0 }, + const before = await getMetadataKeys(filterKey, accessTokenAdmin); + before.body.should.have.lengthOf(1); + before.body[0].userGroups.should.include("group1"); + before.body[0].userGroups.should.not.include("group2"); + + await patchDataset(pid, { ownerGroup: "group2" }); + + const after = await getMetadataKeys(filterKey, accessTokenAdmin); + after.body.should.have.lengthOf(1); + after.body[0].userGroups.should.include("group2"); + after.body[0].userGroups.should.not.include("group1"); + + // user1 (group1) can no longer see this key + const user1Res = await getMetadataKeys(filterKey, accessTokenUser1); + user1Res.body.should.have.lengthOf(0); + + await deleteDataset(pid); + }); + + it("0650: deleting a dataset decrements usageCount — deletes MetadataKey when it reaches 0", async () => { + const pidA = await createDataset({ + ...TestData.DatasetWithScientificMetadataV4, + ownerGroup: "group1", + accessGroups: [], + isPublished: false, + scientificMetadata: { delete_test_key: { value: 1 } }, + }); + + const pidB = await createDataset({ + ...TestData.DatasetWithScientificMetadataV4, + ownerGroup: "group1", + accessGroups: [], + isPublished: false, + scientificMetadata: { delete_test_key: { value: 2 } }, + }); + + const filterKey = { + where: { sourceType: "Dataset", key: "delete_test_key" }, }; - return request(appUrl) - .get(`/api/v4/metadatakeys?filter=${JSON.stringify(filter)}`) - .set("Accept", "application/json") - .set({ Authorization: `Bearer ${accessTokenUser1}` }) - .expect(TestData.SuccessfulGetStatusCode) - .expect("Content-Type", /json/) - .then((res) => { - res.body.should.be.an("array").and.have.length.of.at.least(1); - }); + const before = await getMetadataKeys(filterKey, accessTokenAdmin); + before.body.should.have.lengthOf(1); + before.body[0].usageCount.should.equal(2); + + // Delete first dataset — usageCount drops to 1, key still exists + await deleteDataset(pidA); + + const afterFirst = await getMetadataKeys(filterKey, accessTokenAdmin); + afterFirst.body.should.have.lengthOf(1); + afterFirst.body[0].usageCount.should.equal(1); + + // Delete second dataset — usageCount hits 0, MetadataKey removed + await deleteDataset(pidB); + + const afterSecond = await getMetadataKeys(filterKey, accessTokenAdmin); + afterSecond.body.should.have.lengthOf(0); }); }); diff --git a/test/TestData.js b/test/TestData.js index a3343722f..a994b82b4 100644 --- a/test/TestData.js +++ b/test/TestData.js @@ -1436,10 +1436,12 @@ const TestData = { unit: "mbar l/s/cm^2", valueSI: 100000, unitSI: "kg / s^3", + human_name: "Pressure SI", }, with_number: { value: 111, unit: "", + human_name: "Sample Number", }, with_string: { value: "222", @@ -1458,17 +1460,18 @@ const TestData = { owner: faker.internet.username(), size: faker.number.int({ min: 0, max: 100000000 }), contactEmail: faker.internet.email(), + principalInvestigator: faker.internet.username(), scientificMetadata: { with_key_value: "some text", with_unit_and_value_si: { value: 100, unit: "mbar l/s/cm^2", - valueSI: 100000, - unitSI: "kg / s^3", + human_name: "Pressure SI", }, with_number: { value: 111, unit: "", + human_name: "Sample Number", }, with_string: { value: "222", From 7daa0c19409e6be61b44a2056ce907dad455f188 Mon Sep 17 00:00:00 2001 From: junjiequan Date: Mon, 20 Apr 2026 21:24:49 +0200 Subject: [PATCH 03/29] trim migration script comment and fix failing tests --- ...aset-scientificMetadata-to-metadatakeys.md | 4 ++-- ...aset-scientificMetadata-to-metadatakeys.js | 23 ------------------- src/datasets/datasets.service.spec.ts | 17 ++++++++------ src/metadata-keys/metadatakeys.service.ts | 1 + test/MetadataKeys.js | 7 ++++-- test/TestData.js | 1 - 6 files changed, 18 insertions(+), 35 deletions(-) diff --git a/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md b/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md index b966d6388..7ebd87bde 100644 --- a/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md +++ b/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md @@ -58,9 +58,9 @@ Each `MetadataKey` document tracks: --- -## Pipeline walkthrough +## Migration Pipeline walkthrough -The pipeline uses two datasets as a running example throughout: +The migration pipeline uses two datasets as a running example throughout: | | ownerGroup | accessGroups | isPublished | scientificMetadata keys | | --------- | ---------- | ------------ | ----------- | ----------------------- | diff --git a/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js b/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js index 904b48886..a63e55002 100644 --- a/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js +++ b/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js @@ -2,19 +2,15 @@ const SOURCE_COLLECTIONS = ["Dataset"]; function buildPipeline(sourceType) { return [ - // ------------------------------------------------------------------------- // Stage 1: Only process documents that have scientificMetadata - // ------------------------------------------------------------------------- { $match: { scientificMetadata: { $exists: true, $type: "object" }, }, }, - // ------------------------------------------------------------------------- // Stage 2: Flatten scientificMetadata into an array of key/value pairs. // Preserve _id as datasetId so we can count unique datasets later. - // ------------------------------------------------------------------------- { $project: { datasetId: "$_id", @@ -25,15 +21,11 @@ function buildPipeline(sourceType) { }, }, - // ------------------------------------------------------------------------- // Stage 3: One document per (dataset, metadata key) - // ------------------------------------------------------------------------- { $unwind: "$metaArr" }, - // ------------------------------------------------------------------------- // Stage 4: Shape each (dataset, key) document. // userGroups is the union of ownerGroup + accessGroups for this dataset. - // ------------------------------------------------------------------------- { $project: { datasetId: 1, @@ -46,12 +38,7 @@ function buildPipeline(sourceType) { }, }, - // ------------------------------------------------------------------------- // Stage 5: One document per (dataset, key, group). - // This is the pivot that makes per-group counting possible. - // preserveNullAndEmptyArrays keeps datasets with no groups so - // usageCount stays accurate even for group-less datasets. - // ------------------------------------------------------------------------- { $unwind: { path: "$userGroups", @@ -59,21 +46,17 @@ function buildPipeline(sourceType) { }, }, - // ------------------------------------------------------------------------- // Stage 6: Filter out the empty-string sentinel that comes from ownerGroup // being null/missing. Real group names are never empty strings. - // ------------------------------------------------------------------------- { $match: { userGroups: { $nin: [null, ""] }, }, }, - // ------------------------------------------------------------------------- // Stage 7: Group by (metaKeyId, group). // groupCount = how many datasets with this group use this key. // datasetIds = set of distinct dataset IDs (for accurate usageCount). - // ------------------------------------------------------------------------- { $group: { _id: { @@ -90,12 +73,10 @@ function buildPipeline(sourceType) { }, }, - // ------------------------------------------------------------------------- // Stage 8: Group by metaKeyId to reassemble one document per metadata key. // userGroupCountsArr will become the userGroupCounts Map. // datasetIdSets is a list of per-group dataset ID sets — merged in the // next stage to compute total unique dataset count (usageCount). - // ------------------------------------------------------------------------- { $group: { _id: "$_id.metaKeyId", @@ -110,11 +91,9 @@ function buildPipeline(sourceType) { }, }, - // ------------------------------------------------------------------------- // Stage 9: Final projection. // userGroupCounts: [{k,v}] array → plain object (stored as Map in Mongoose). // usageCount: union all per-group datasetId sets, count distinct IDs. - // ------------------------------------------------------------------------- { $project: { _id: 1, @@ -138,12 +117,10 @@ function buildPipeline(sourceType) { }, }, - // ------------------------------------------------------------------------- // Stage 10: Merge into MetadataKeys. // whenMatched handles the (future) case where multiple SOURCE_COLLECTIONS // produce the same _id. Not possible today since sourceType is part of // the _id, but kept correct for when more collections are added. - // ------------------------------------------------------------------------- { $merge: { into: "MetadataKeys", diff --git a/src/datasets/datasets.service.spec.ts b/src/datasets/datasets.service.spec.ts index d5cfef7ab..9370f4580 100644 --- a/src/datasets/datasets.service.spec.ts +++ b/src/datasets/datasets.service.spec.ts @@ -96,6 +96,15 @@ const mockDataset: DatasetClass = { dataQualityMetrics: 1, }; +const mockDatasetModel = function (data: DatasetClass) { + return { + ...data, + save: jest.fn().mockResolvedValue(data), + toObject: jest.fn().mockReturnValue(data), + }; +}; +mockDatasetModel.collection = { name: "Dataset" }; + describe("DatasetsService", () => { let service: DatasetsService; // eslint-disable-next-line @typescript-eslint/no-unused-vars @@ -107,13 +116,7 @@ describe("DatasetsService", () => { ConfigService, { provide: getModelToken("DatasetClass"), - useValue: function (data: DatasetClass) { - return { - ...data, - save: jest.fn().mockResolvedValue(data), - toObject: jest.fn().mockReturnValue(data), - }; - }, + useValue: mockDatasetModel, }, DatasetsService, DatasetsAccessService, diff --git a/src/metadata-keys/metadatakeys.service.ts b/src/metadata-keys/metadatakeys.service.ts index 0a2b84a63..f7289a613 100644 --- a/src/metadata-keys/metadatakeys.service.ts +++ b/src/metadata-keys/metadatakeys.service.ts @@ -119,6 +119,7 @@ export class MetadataKeysService { $setOnInsert: addCreatedByFields( { _id: id as unknown as MetadataKeyDocument["_id"], + id, key, sourceType, humanReadableName, diff --git a/test/MetadataKeys.js b/test/MetadataKeys.js index 08b6f1e2a..e8e7ef0f7 100644 --- a/test/MetadataKeys.js +++ b/test/MetadataKeys.js @@ -60,7 +60,7 @@ async function getMetadataKeys(filter, token) { async function createDataset(dataset) { const res = await request(appUrl) - .post("/api/v3/Datasets") + .post("/api/v4/datasets") .send(dataset) .set("Accept", "application/json") .set({ Authorization: `Bearer ${accessTokenAdmin}` }) @@ -79,7 +79,7 @@ async function patchDataset(pid, payload) { async function deleteDataset(pid) { return request(appUrl) - .delete("/api/v3/Datasets/" + encodeURIComponent(pid)) + .delete("/api/v4/datasets/" + encodeURIComponent(pid)) .set("Accept", "application/json") .set({ Authorization: `Bearer ${accessTokenArchiveManager}` }) .expect(TestData.SuccessfulDeleteStatusCode); @@ -87,6 +87,9 @@ async function deleteDataset(pid) { describe("2000: MetadataKeys: Access Control and Search", () => { before(async () => { + await db.collection("Dataset").deleteMany({}); + await db.collection("MetadataKeys").deleteMany({}); + accessTokenAdmin = await utils.getToken(appUrl, { username: "adminIngestor", password: TestData.Accounts["adminIngestor"]["password"], diff --git a/test/TestData.js b/test/TestData.js index a994b82b4..1df4f5404 100644 --- a/test/TestData.js +++ b/test/TestData.js @@ -1460,7 +1460,6 @@ const TestData = { owner: faker.internet.username(), size: faker.number.int({ min: 0, max: 100000000 }), contactEmail: faker.internet.email(), - principalInvestigator: faker.internet.username(), scientificMetadata: { with_key_value: "some text", with_unit_and_value_si: { From af3fef79c628e20408fd64d2e58617f1efea64ec Mon Sep 17 00:00:00 2001 From: junjiequan Date: Mon, 20 Apr 2026 21:37:09 +0200 Subject: [PATCH 04/29] fix typo and api test --- ...5401-sync-dataset-scientificMetadata-to-metadatakeys.md | 2 +- test/DatasetV4.js | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md b/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md index 7ebd87bde..710a24744 100644 --- a/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md +++ b/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md @@ -140,7 +140,7 @@ Extract `key`, `humanReadableName`, and compute `userGroups` as the union of `ow { datasetId: "uuid-B", key: "temperature", isPublished: false, humanReadableName: "Temperature", userGroups: ["group-1"] } ``` -> `ownerGroup` is mandatory field so no null fallback is needed. `accessGroups` is optional so it falls back to `[]`. +> `ownerGroup` is a mandatory field, so no null fallback is needed. `accessGroups` is optional so it falls back to `[]`. --- diff --git a/test/DatasetV4.js b/test/DatasetV4.js index 6755a54cb..de1316b42 100644 --- a/test/DatasetV4.js +++ b/test/DatasetV4.js @@ -1331,6 +1331,7 @@ describe("2500: Datasets v4 tests", () => { unit: "mg", valueSI: 0.0006, unitSI: "kg", + human_name: "Pressure SI", }); res.body.scientificMetadata.with_number.should.deep.eq({ value: 111, @@ -1348,6 +1349,7 @@ describe("2500: Datasets v4 tests", () => { with_unit_and_value_si: { value: -2, unit: "km", + human_name: "new human name", }, }, }; @@ -1369,6 +1371,7 @@ describe("2500: Datasets v4 tests", () => { unit: "km", valueSI: -2000, unitSI: "m", + human_name: "new human name", }); }); }); @@ -1381,6 +1384,7 @@ describe("2500: Datasets v4 tests", () => { unit: "cm", valueSI: null, unitSI: null, + human_name: "new human name", }, with_number: null, }, @@ -1407,6 +1411,7 @@ describe("2500: Datasets v4 tests", () => { unit: "cm", valueSI: -0.02, unitSI: "m", + human_name: "new human name", }); res.body.scientificMetadata.should.not.have.property("with_number"); }); @@ -1590,6 +1595,7 @@ describe("2500: Datasets v4 tests", () => { unit: "cm", valueSI: 555, unitSI: "cmcm", + human_name: "new human name", }, }, }; @@ -1610,6 +1616,7 @@ describe("2500: Datasets v4 tests", () => { unit: "cm", valueSI: 0.22, unitSI: "m", + human_name: "new human name", }); }); }); From 8320494449e9de3c4c1c033474127bb03e3ce5db Mon Sep 17 00:00:00 2001 From: junjiequan Date: Mon, 20 Apr 2026 21:59:04 +0200 Subject: [PATCH 05/29] set ownerGroup as required with nonEmpty value --- src/common/dto/ownable.dto.ts | 3 ++- src/common/schemas/ownable.schema.ts | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/common/dto/ownable.dto.ts b/src/common/dto/ownable.dto.ts index b3a09e95f..c585e6072 100644 --- a/src/common/dto/ownable.dto.ts +++ b/src/common/dto/ownable.dto.ts @@ -1,5 +1,5 @@ import { ApiProperty } from "@nestjs/swagger"; -import { IsOptional, IsString } from "class-validator"; +import { IsNotEmpty, IsOptional, IsString } from "class-validator"; export class OwnableDto { @ApiProperty({ @@ -8,6 +8,7 @@ export class OwnableDto { description: "Name of the group owning this item.", }) @IsString() + @IsNotEmpty() readonly ownerGroup: string; @ApiProperty({ diff --git a/src/common/schemas/ownable.schema.ts b/src/common/schemas/ownable.schema.ts index 137c9b5f0..f12254b28 100644 --- a/src/common/schemas/ownable.schema.ts +++ b/src/common/schemas/ownable.schema.ts @@ -11,6 +11,7 @@ export class OwnableClass extends QueryableClass { @Prop({ type: String, index: true, + required: true, }) ownerGroup: string; From 386ccf0ed11a9458c5c84845716b39023cde19e1 Mon Sep 17 00:00:00 2001 From: junjiequan Date: Mon, 20 Apr 2026 21:59:19 +0200 Subject: [PATCH 06/29] update doc --- ...145401-sync-dataset-scientificMetadata-to-metadatakeys.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js b/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js index a63e55002..c3822810b 100644 --- a/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js +++ b/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js @@ -46,8 +46,9 @@ function buildPipeline(sourceType) { }, }, - // Stage 6: Filter out the empty-string sentinel that comes from ownerGroup - // being null/missing. Real group names are never empty strings. + // Stage 6: Filter out null and empty-string groups. + // Note: datasets with no valid groups are excluded from usageCount. + // This is acceptable since ownerGroup is a required field in practice. { $match: { userGroups: { $nin: [null, ""] }, From 34e0a24df583dd95dfbad0854d42437ce4ec623e Mon Sep 17 00:00:00 2001 From: junjiequan Date: Mon, 20 Apr 2026 21:59:26 +0200 Subject: [PATCH 07/29] fix api test --- test/DatasetV4.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/DatasetV4.js b/test/DatasetV4.js index de1316b42..d373c5e73 100644 --- a/test/DatasetV4.js +++ b/test/DatasetV4.js @@ -1331,7 +1331,7 @@ describe("2500: Datasets v4 tests", () => { unit: "mg", valueSI: 0.0006, unitSI: "kg", - human_name: "Pressure SI", + human_name: "Sample Number", }); res.body.scientificMetadata.with_number.should.deep.eq({ value: 111, From 74851db68970af8b8ce8163f36c19adecaf41bf9 Mon Sep 17 00:00:00 2001 From: junjiequan Date: Mon, 20 Apr 2026 22:27:49 +0200 Subject: [PATCH 08/29] fix api test --- src/common/schemas/ownable.schema.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/src/common/schemas/ownable.schema.ts b/src/common/schemas/ownable.schema.ts index f12254b28..137c9b5f0 100644 --- a/src/common/schemas/ownable.schema.ts +++ b/src/common/schemas/ownable.schema.ts @@ -11,7 +11,6 @@ export class OwnableClass extends QueryableClass { @Prop({ type: String, index: true, - required: true, }) ownerGroup: string; From e66ebacfe433e3b691048bad539bfbfc41dbfd81 Mon Sep 17 00:00:00 2001 From: junjiequan Date: Tue, 21 Apr 2026 09:18:17 +0200 Subject: [PATCH 09/29] fix api test --- test/DatasetV4.js | 1 + 1 file changed, 1 insertion(+) diff --git a/test/DatasetV4.js b/test/DatasetV4.js index d373c5e73..0ed7b694c 100644 --- a/test/DatasetV4.js +++ b/test/DatasetV4.js @@ -1309,6 +1309,7 @@ describe("2500: Datasets v4 tests", () => { with_unit_and_value_si: { value: 600, unit: "mg", + human_name: "Sample Number", }, }, }; From 125c50714e33e7ad1d2b33dbf48541397a57e16f Mon Sep 17 00:00:00 2001 From: junjiequan Date: Tue, 21 Apr 2026 09:49:42 +0200 Subject: [PATCH 10/29] fix api test --- test/DatasetV4.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/DatasetV4.js b/test/DatasetV4.js index 0ed7b694c..bf8f6b57a 100644 --- a/test/DatasetV4.js +++ b/test/DatasetV4.js @@ -1309,7 +1309,6 @@ describe("2500: Datasets v4 tests", () => { with_unit_and_value_si: { value: 600, unit: "mg", - human_name: "Sample Number", }, }, }; @@ -1332,11 +1331,12 @@ describe("2500: Datasets v4 tests", () => { unit: "mg", valueSI: 0.0006, unitSI: "kg", - human_name: "Sample Number", + human_name: "Pressure SI", }); res.body.scientificMetadata.with_number.should.deep.eq({ value: 111, unit: "", + human_name: "Sample Number", }); res.body.datasetlifecycle.should.have .property("storageLocation") From 9a41f76fbd9470d64848a5bbac1717f646cf49f7 Mon Sep 17 00:00:00 2001 From: junjiequan Date: Thu, 23 Apr 2026 16:07:10 +0200 Subject: [PATCH 11/29] address sourcer comment Co-authored-by: Copilot --- src/metadata-keys/metadatakeys.service.ts | 39 +++++++++++++++-------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/src/metadata-keys/metadatakeys.service.ts b/src/metadata-keys/metadatakeys.service.ts index f7289a613..26844b547 100644 --- a/src/metadata-keys/metadatakeys.service.ts +++ b/src/metadata-keys/metadatakeys.service.ts @@ -278,12 +278,21 @@ export class MetadataKeysService { newDoc: MetadataSourceDoc, ): Promise { const { sourceType } = newDoc; - const ids = sharedKeys.map((k) => { - const humanReadableName = + const humanNameChangedKeys: string[] = []; + const humanNameUnchangedKeys: string[] = []; + + for (const k of sharedKeys) { + const oldHumanName = + (oldDoc.metadata[k] as ScientificMetadataEntry)?.human_name ?? ""; + const newHumanName = (newDoc.metadata[k] as ScientificMetadataEntry)?.human_name ?? ""; - return this.buildId(sourceType, k, humanReadableName); - }); - const filter = { _id: { $in: ids } }; + + if (oldHumanName === newHumanName) { + humanNameUnchangedKeys.push(k); + } else { + humanNameChangedKeys.push(k); + } + } const addedGroups = newDoc.userGroups.filter( (g) => !oldDoc.userGroups.includes(g), @@ -294,7 +303,17 @@ export class MetadataKeysService { const publishedFlippedOn = !oldDoc.isPublished && newDoc.isPublished; const hasGroupChanges = addedGroups.length > 0 || removedGroups.length > 0; - if (hasGroupChanges || publishedFlippedOn) { + if ( + humanNameUnchangedKeys.length > 0 && + (hasGroupChanges || publishedFlippedOn) + ) { + const ids = humanNameUnchangedKeys.map((k) => { + const humanReadableName = + (newDoc.metadata[k] as ScientificMetadataEntry)?.human_name ?? ""; + return this.buildId(sourceType, k, humanReadableName); + }); + const filter = { _id: { $in: ids } }; + await this.metadataKeyModel.updateMany(filter, { ...(addedGroups.length > 0 && { $addToSet: { userGroups: { $each: addedGroups } }, @@ -318,14 +337,6 @@ export class MetadataKeysService { // humanReadableName is part of _id, so a change means a different document. // Treat it as delete the old entry + insert the new one. - const humanNameChangedKeys = sharedKeys.filter((k) => { - const oldName = - (oldDoc.metadata[k] as ScientificMetadataEntry)?.human_name ?? ""; - const newName = - (newDoc.metadata[k] as ScientificMetadataEntry)?.human_name ?? ""; - return oldName !== newName; - }); - if (humanNameChangedKeys.length > 0) { await Promise.all([ this.deleteMany({ From 22e8a2f8247152c44467c8b0232fa88897a3070b Mon Sep 17 00:00:00 2001 From: junjiequan Date: Tue, 28 Apr 2026 10:49:31 +0200 Subject: [PATCH 12/29] remove unncessary request import --- test/AttachmentV4.js | 1 - 1 file changed, 1 deletion(-) diff --git a/test/AttachmentV4.js b/test/AttachmentV4.js index 99cc09106..ec9fb73f6 100644 --- a/test/AttachmentV4.js +++ b/test/AttachmentV4.js @@ -2,7 +2,6 @@ const utils = require("./LoginUtils"); const { TestData } = require("./TestData"); const { v4: uuidv4 } = require("uuid"); -const request = require("supertest"); let accessTokenAdminIngestor = null, accessTokenUser1 = null, From 352437df7e54c8aa490ad18ba101075728507621 Mon Sep 17 00:00:00 2001 From: junjiequan Date: Tue, 28 Apr 2026 14:26:39 +0200 Subject: [PATCH 13/29] add batch for scientificMetadata sync migration --- ...aset-scientificMetadata-to-metadatakeys.md | 29 +++----- ...aset-scientificMetadata-to-metadatakeys.js | 72 +++++++++++-------- 2 files changed, 52 insertions(+), 49 deletions(-) diff --git a/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md b/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md index 710a24744..ffe4215fc 100644 --- a/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md +++ b/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md @@ -69,18 +69,7 @@ The migration pipeline uses two datasets as a running example throughout: --- -### Stage 1 — Filter - -Keep only documents where `scientificMetadata` exists and is an object. - -``` -Input: all Dataset documents -Output: documents where scientificMetadata exists -``` - ---- - -### Stage 2 — Flatten scientificMetadata +### Stage 1 — Flatten scientificMetadata Expose `_id` as `datasetId` and convert `scientificMetadata` from an object to an array of `{k, v}` pairs using `$objectToArray`. @@ -112,7 +101,7 @@ Expose `_id` as `datasetId` and convert `scientificMetadata` from an object to a --- -### Stage 3 — Unwind metaArr +### Stage 2 — Unwind metaArr One document per `(dataset, metadata key)`. @@ -126,7 +115,7 @@ Output: 2 documents --- -### Stage 4 — Shape each (dataset, key) document +### Stage 3 — Shape each (dataset, key) document Extract `key`, `humanReadableName`, and compute `userGroups` as the union of `ownerGroup` and `accessGroups`. @@ -144,7 +133,7 @@ Extract `key`, `humanReadableName`, and compute `userGroups` as the union of `ow --- -### Stage 5 — Unwind userGroups +### Stage 4 — Unwind userGroups One document per `(dataset, key, group)`. This is the pivot that makes per-group counting possible. @@ -164,7 +153,7 @@ Output --- -### Stage 6 — Filter null userGroups +### Stage 5 — Filter null userGroups Drop documents where `userGroups` is `null`. This only occurs when `preserveNullAndEmptyArrays` retains a document from a dataset that had no groups at all. @@ -175,7 +164,7 @@ Output: only documents where userGroups is a real group name --- -### Stage 7 — Group by (metaKeyId, group) +### Stage 6 — Group by (metaKeyId, group) Each bucket is one unique `(key, humanReadableName, group)` combination. `groupCount` = how many datasets with this group reference this key. @@ -199,7 +188,7 @@ Output (4 buckets — one per unique metaKeyId + group) --- -### Stage 8 — Group by metaKeyId +### Stage 7 — Group by metaKeyId Reassemble one document per metadata key by collecting the per-group buckets from stage 7. @@ -229,7 +218,7 @@ Output (2 documents — one per unique key) --- -### Stage 9 — Final projection +### Stage 8 — Final projection - `userGroupCounts`: converts `[{k, v}]` array to a plain object (Mongoose stores this as a `Map`) - `usageCount`: unions all per-group `datasetId` sets and counts distinct IDs @@ -259,7 +248,7 @@ Output (2 documents — one per unique key) --- -### Stage 10 — Merge into MetadataKeys +### Stage 9 — Merge into MetadataKeys - `whenNotMatched: insert` — new documents are inserted as-is - `whenMatched` — additively merges counts when two `SOURCE_COLLECTIONS` produce the same `_id` diff --git a/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js b/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js index c3822810b..799c6b031 100644 --- a/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js +++ b/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js @@ -2,14 +2,7 @@ const SOURCE_COLLECTIONS = ["Dataset"]; function buildPipeline(sourceType) { return [ - // Stage 1: Only process documents that have scientificMetadata - { - $match: { - scientificMetadata: { $exists: true, $type: "object" }, - }, - }, - - // Stage 2: Flatten scientificMetadata into an array of key/value pairs. + // Stage 1: Flatten scientificMetadata into an array of key/value pairs. // Preserve _id as datasetId so we can count unique datasets later. { $project: { @@ -21,10 +14,10 @@ function buildPipeline(sourceType) { }, }, - // Stage 3: One document per (dataset, metadata key) + // Stage 2: One document per (dataset, metadata key) { $unwind: "$metaArr" }, - // Stage 4: Shape each (dataset, key) document. + // Stage 3: Shape each (dataset, key) document. // userGroups is the union of ownerGroup + accessGroups for this dataset. { $project: { @@ -38,7 +31,7 @@ function buildPipeline(sourceType) { }, }, - // Stage 5: One document per (dataset, key, group). + // Stage 4: One document per (dataset, key, group). { $unwind: { path: "$userGroups", @@ -46,7 +39,7 @@ function buildPipeline(sourceType) { }, }, - // Stage 6: Filter out null and empty-string groups. + // Stage 5: Filter out null and empty-string groups. // Note: datasets with no valid groups are excluded from usageCount. // This is acceptable since ownerGroup is a required field in practice. { @@ -55,7 +48,7 @@ function buildPipeline(sourceType) { }, }, - // Stage 7: Group by (metaKeyId, group). + // Stage 6: Group by (metaKeyId, group). // groupCount = how many datasets with this group use this key. // datasetIds = set of distinct dataset IDs (for accurate usageCount). { @@ -74,7 +67,7 @@ function buildPipeline(sourceType) { }, }, - // Stage 8: Group by metaKeyId to reassemble one document per metadata key. + // Stage 7: Group by metaKeyId to reassemble one document per metadata key. // userGroupCountsArr will become the userGroupCounts Map. // datasetIdSets is a list of per-group dataset ID sets — merged in the // next stage to compute total unique dataset count (usageCount). @@ -92,7 +85,7 @@ function buildPipeline(sourceType) { }, }, - // Stage 9: Final projection. + // Stage 8: Final projection. // userGroupCounts: [{k,v}] array → plain object (stored as Map in Mongoose). // usageCount: union all per-group datasetId sets, count distinct IDs. { @@ -220,6 +213,7 @@ function buildPipeline(sourceType) { module.exports = { async up(db) { + const BATCH_SIZE = 10000; const start = Date.now(); const elapsed = () => `${((Date.now() - start) / 1000).toFixed(1)}s`; @@ -245,24 +239,44 @@ module.exports = { `[${elapsed()}] Processing ${total.toLocaleString()} documents from ${collection}...`, ); - const timer = setInterval( - () => - console.log( - `[${elapsed()}] ${collection} migration still running...`, - ), - 5000, - ); + let lastId = null; + let processed = 0; + + while (true) { + const match = { + scientificMetadata: { $exists: true, $type: "object" }, + ...(lastId && { _id: { $gt: lastId } }), + }; + + const batch = await db + .collection(collection) + .find(match) + .sort({ _id: 1 }) + .limit(BATCH_SIZE) + .project({ _id: 1 }) + .toArray(); + + if (batch.length === 0) break; + + const batchIds = batch.map((d) => d._id); - try { await db .collection(collection) - .aggregate(buildPipeline(collection), { - allowDiskUse: true, - maxTimeMS: 0, - }) + .aggregate( + [ + { $match: { _id: { $in: batchIds } } }, + ...buildPipeline(collection).slice(1), + ], + { allowDiskUse: true, maxTimeMS: 0 }, + ) .toArray(); - } finally { - clearInterval(timer); + + lastId = batch[batch.length - 1]._id; + processed += batch.length; + + console.log( + `[${elapsed()}] ${collection}: ${processed.toLocaleString()}/${total.toLocaleString()}`, + ); } console.log(`[${elapsed()}] ✅ ${collection} done`); From 40a0c6d363f4cf23b9ac36b2868b7a71d1fe1984 Mon Sep 17 00:00:00 2001 From: junjiequan Date: Tue, 28 Apr 2026 14:27:07 +0200 Subject: [PATCH 14/29] address comment --- src/metadata-keys/metadatakeys.service.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/metadata-keys/metadatakeys.service.ts b/src/metadata-keys/metadatakeys.service.ts index 26844b547..1b6e097c8 100644 --- a/src/metadata-keys/metadatakeys.service.ts +++ b/src/metadata-keys/metadatakeys.service.ts @@ -196,8 +196,8 @@ export class MetadataKeysService { * Called when a dataset is updated. * * Diffs the old and new metadata to produce three disjoint key sets: - * - added: only in newDoc → insertManyFromSource - * - removed: only in oldDoc → deleteMany + * - added: only in newDoc(after change) → insertManyFromSource + * - removed: only in oldDoc(before change) → deleteMany * - shared: in both → updateSharedKeys (handles group / isPublished * / humanReadableName changes) * From b5446eddbef2cd5602c1a00cd01cea7cca7794f1 Mon Sep 17 00:00:00 2001 From: junjiequan Date: Tue, 28 Apr 2026 15:20:00 +0200 Subject: [PATCH 15/29] minor fix --- ...420145401-sync-dataset-scientificMetadata-to-metadatakeys.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js b/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js index 799c6b031..bd3a92ede 100644 --- a/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js +++ b/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js @@ -111,7 +111,7 @@ function buildPipeline(sourceType) { }, }, - // Stage 10: Merge into MetadataKeys. + // Stage 9: Merge into MetadataKeys. // whenMatched handles the (future) case where multiple SOURCE_COLLECTIONS // produce the same _id. Not possible today since sourceType is part of // the _id, but kept correct for when more collections are added. From d51d0699737f88116d52208334f90d42dc261004 Mon Sep 17 00:00:00 2001 From: junjiequan Date: Wed, 29 Apr 2026 11:13:54 +0200 Subject: [PATCH 16/29] Revert "set ownerGroup as required with nonEmpty value" This reverts commit 8320494449e9de3c4c1c033474127bb03e3ce5db. --- src/common/dto/ownable.dto.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/common/dto/ownable.dto.ts b/src/common/dto/ownable.dto.ts index c585e6072..b3a09e95f 100644 --- a/src/common/dto/ownable.dto.ts +++ b/src/common/dto/ownable.dto.ts @@ -1,5 +1,5 @@ import { ApiProperty } from "@nestjs/swagger"; -import { IsNotEmpty, IsOptional, IsString } from "class-validator"; +import { IsOptional, IsString } from "class-validator"; export class OwnableDto { @ApiProperty({ @@ -8,7 +8,6 @@ export class OwnableDto { description: "Name of the group owning this item.", }) @IsString() - @IsNotEmpty() readonly ownerGroup: string; @ApiProperty({ From ff1a457204f659ba1cb6df46c89e973d5ce4c28b Mon Sep 17 00:00:00 2001 From: junjiequan Date: Wed, 29 Apr 2026 14:55:54 +0200 Subject: [PATCH 17/29] revert existingDataset in findByIdAndUpdate --- src/datasets/datasets.service.ts | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/datasets/datasets.service.ts b/src/datasets/datasets.service.ts index 53443083f..876c9c6d7 100644 --- a/src/datasets/datasets.service.ts +++ b/src/datasets/datasets.service.ts @@ -520,6 +520,13 @@ export class DatasetsService { | PartialUpdateDatasetWithHistoryDto, unmodifiedSince?: Date, ): Promise { + const existingDataset = await this.datasetModel.findOne({ pid: id }).exec(); + // check if we were able to find the dataset + if (!existingDataset) { + // no luck. we need to create a new dataset + throw new NotFoundException(`Dataset #${id} not found`); + } + const username = (this.request.user as JWTUser).username; // NOTE: When doing findByIdAndUpdate in mongoose it does reset the subdocuments to default values if no value is provided From 43ea1142f5c0ce2f678497fc80cb4597cee8268c Mon Sep 17 00:00:00 2001 From: junjiequan Date: Wed, 29 Apr 2026 14:56:36 +0200 Subject: [PATCH 18/29] remove unnecessary comment --- src/datasets/datasets.service.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/datasets/datasets.service.ts b/src/datasets/datasets.service.ts index 876c9c6d7..4748d3140 100644 --- a/src/datasets/datasets.service.ts +++ b/src/datasets/datasets.service.ts @@ -521,9 +521,7 @@ export class DatasetsService { unmodifiedSince?: Date, ): Promise { const existingDataset = await this.datasetModel.findOne({ pid: id }).exec(); - // check if we were able to find the dataset if (!existingDataset) { - // no luck. we need to create a new dataset throw new NotFoundException(`Dataset #${id} not found`); } From 921d11b840061053c9ffa5de96233348e512d696 Mon Sep 17 00:00:00 2001 From: junjiequan Date: Thu, 30 Apr 2026 10:34:21 +0200 Subject: [PATCH 19/29] add back existingDataset & modify unit test accordingly --- src/datasets/datasets.service.spec.ts | 9 ++++++--- src/datasets/datasets.service.ts | 6 +++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/datasets/datasets.service.spec.ts b/src/datasets/datasets.service.spec.ts index 745b77545..c8d087a53 100644 --- a/src/datasets/datasets.service.spec.ts +++ b/src/datasets/datasets.service.spec.ts @@ -178,11 +178,11 @@ describe("DatasetsService", () => { ).toBe("Already Encoded"); }); - it("should throw NotFoundException if no document is found for update and no unmodifiedSince is provided", async () => { + it("should throw NotFoundException if no document is found", async () => { const updateDto = { datasetName: "Updated Name" }; - model.findOneAndUpdate = jest + model.findOne = jest .fn() - .mockReturnValue({ exec: jest.fn().mockReturnValue(null) }); + .mockReturnValue({ exec: jest.fn().mockResolvedValue(null) }); await expect( service.findByIdAndUpdate("testId", updateDto), ).rejects.toThrow(NotFoundException); @@ -191,6 +191,9 @@ describe("DatasetsService", () => { it("should throw PreconditionedFailed if no patched dataset is returned (indicating a concurrent modification)", async () => { const updateDto = { datasetName: "Updated Name" }; const unmodifiedSince = new Date("2021-11-11T12:29:02.083Z"); + model.findOne = jest + .fn() + .mockReturnValue({ exec: jest.fn().mockResolvedValue(mockDataset) }); model.findOneAndUpdate = jest .fn() .mockReturnValue({ exec: jest.fn().mockReturnValue(null) }); diff --git a/src/datasets/datasets.service.ts b/src/datasets/datasets.service.ts index 4748d3140..1603eac94 100644 --- a/src/datasets/datasets.service.ts +++ b/src/datasets/datasets.service.ts @@ -520,13 +520,13 @@ export class DatasetsService { | PartialUpdateDatasetWithHistoryDto, unmodifiedSince?: Date, ): Promise { + const username = (this.request.user as JWTUser).username; + const existingDataset = await this.datasetModel.findOne({ pid: id }).exec(); if (!existingDataset) { throw new NotFoundException(`Dataset #${id} not found`); } - const username = (this.request.user as JWTUser).username; - // NOTE: When doing findByIdAndUpdate in mongoose it does reset the subdocuments to default values if no value is provided // https://stackoverflow.com/questions/57324321/mongoose-overwriting-data-in-mongodb-with-default-values-in-subdocuments let queryFilter: FilterQuery = { pid: id }; @@ -545,7 +545,7 @@ export class DatasetsService { // check if we were able to find the dataset (matching the precondition, if supplied) and update it if (!patchedDataset) { if (!unmodifiedSince) { - throw new NotFoundException(`Dataset #${id} not found`); + throw new NotFoundException(`Dataset #${id} failed to update.`); } throw new PreconditionFailedException( `Dataset #${id} has been modified on the server since ${unmodifiedSince.toUTCString()}.`, From d90c3cdeadb4d48cdc4e0aecf7fc4ab4d56d75f4 Mon Sep 17 00:00:00 2001 From: junjiequan Date: Thu, 30 Apr 2026 14:29:18 +0200 Subject: [PATCH 20/29] fix migration pipe --- ...420145401-sync-dataset-scientificMetadata-to-metadatakeys.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js b/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js index bd3a92ede..b41877970 100644 --- a/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js +++ b/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js @@ -265,7 +265,7 @@ module.exports = { .aggregate( [ { $match: { _id: { $in: batchIds } } }, - ...buildPipeline(collection).slice(1), + ...buildPipeline(collection), ], { allowDiskUse: true, maxTimeMS: 0 }, ) From 1aee8bcfe28b423ec80aa74e8b1e38d0ee8ddca1 Mon Sep 17 00:00:00 2001 From: junjiequan Date: Thu, 30 Apr 2026 15:35:22 +0200 Subject: [PATCH 21/29] include MAX_USER_GROUPS_PER_METADATA_KEY logic for metdataKeys migration script --- ...aset-scientificMetadata-to-metadatakeys.md | 11 ++ ...aset-scientificMetadata-to-metadatakeys.js | 119 +++++++++++------- 2 files changed, 82 insertions(+), 48 deletions(-) diff --git a/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md b/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md index ffe4215fc..c753c9cc1 100644 --- a/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md +++ b/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md @@ -255,6 +255,17 @@ Output (2 documents — one per unique key) > This is not triggered today since `sourceType` is part of `_id`, making collisions between collections impossible. It is kept correct for when `Proposal`, `Sample`, or other collections are added as sources. +**Group cap behaviour** + +When merging, if the existing document's `usageCount` exceeds `MAX_USER_GROUPS_PER_METADATA_KEY`: + +- `userGroups` is frozen — no new groups are added +- `userGroupCounts` is frozen — no new counts are merged +- `isPublished` is forced to `true` — key remains publicly discoverable +- `usageCount` continues to increment regardless + +This prevents unbounded document growth while ensuring heavily-used keys remain visible to all users via `isPublished: true`. + ```js // whenMatched userGroupCounts example // existing: { "group-1": 3, "group-2": 1 } diff --git a/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js b/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js index b41877970..e6ed37bbf 100644 --- a/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js +++ b/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js @@ -1,4 +1,6 @@ const SOURCE_COLLECTIONS = ["Dataset"]; +const BATCH_SIZE = 10000; +const MAX_USER_GROUPS_PER_METADATA_KEY = 1000; function buildPipeline(sourceType) { return [ @@ -131,73 +133,95 @@ function buildPipeline(sourceType) { updatedBy: { $literal: "migration" }, updatedAt: { $toDate: "$$NOW" }, - // Merge group arrays + isPublished: { + $cond: { + if: { + $gt: ["$usageCount", MAX_USER_GROUPS_PER_METADATA_KEY], + }, + then: true, + else: { $or: ["$isPublished", "$$new.isPublished"] }, + }, + }, + userGroups: { - $setUnion: ["$userGroups", "$$new.userGroups"], + $cond: { + if: { + $gt: ["$usageCount", MAX_USER_GROUPS_PER_METADATA_KEY], + }, + then: "$userGroups", + else: { $setUnion: ["$userGroups", "$$new.userGroups"] }, + }, }, // Additively merge userGroupCounts — sum counts per group // across both the existing and incoming documents. userGroupCounts: { - $arrayToObject: { - $map: { - input: { - $setUnion: [ - { - $map: { - input: { $objectToArray: "$userGroupCounts" }, - as: "e", - in: "$$e.k", - }, - }, - { - $map: { - input: { - $objectToArray: "$$new.userGroupCounts", + $cond: { + if: { + $gt: ["$usageCount", MAX_USER_GROUPS_PER_METADATA_KEY], + }, + then: "$userGroupCounts", + else: { + $arrayToObject: { + $map: { + input: { + $setUnion: [ + { + $map: { + input: { + $objectToArray: "$userGroupCounts", + }, + as: "e", + in: "$$e.k", + }, }, - as: "e", - in: "$$e.k", - }, + { + $map: { + input: { + $objectToArray: "$$new.userGroupCounts", + }, + as: "e", + in: "$$e.k", + }, + }, + ], }, - ], - }, - as: "group", - in: { - k: "$$group", - v: { - $add: [ - { - $ifNull: [ + as: "group", + in: { + k: "$$group", + v: { + $add: [ { - $getField: { - field: "$$group", - input: "$userGroupCounts", - }, + $ifNull: [ + { + $getField: { + field: "$$group", + input: "$userGroupCounts", + }, + }, + 0, + ], }, - 0, - ], - }, - { - $ifNull: [ { - $getField: { - field: "$$group", - input: "$$new.userGroupCounts", - }, + $ifNull: [ + { + $getField: { + field: "$$group", + input: "$$new.userGroupCounts", + }, + }, + 0, + ], }, - 0, ], }, - ], + }, }, }, }, }, }, - // Any source being published makes the key published - isPublished: { $or: ["$isPublished", "$$new.isPublished"] }, - // Add incoming dataset count to existing usageCount: { $add: ["$usageCount", "$$new.usageCount"] }, }, @@ -213,7 +237,6 @@ function buildPipeline(sourceType) { module.exports = { async up(db) { - const BATCH_SIZE = 10000; const start = Date.now(); const elapsed = () => `${((Date.now() - start) / 1000).toFixed(1)}s`; From c6905fa1677923560ad0d28b0f8cf7e488068115 Mon Sep 17 00:00:00 2001 From: junjiequan Date: Mon, 4 May 2026 17:05:58 +0200 Subject: [PATCH 22/29] update seed migration script --- ...aset-scientificMetadata-to-metadatakeys.js | 144 +++++++++--------- 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js b/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js index e6ed37bbf..712ca4328 100644 --- a/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js +++ b/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js @@ -1,6 +1,5 @@ const SOURCE_COLLECTIONS = ["Dataset"]; const BATCH_SIZE = 10000; -const MAX_USER_GROUPS_PER_METADATA_KEY = 1000; function buildPipeline(sourceType) { return [ @@ -87,12 +86,26 @@ function buildPipeline(sourceType) { }, }, + { + $addFields: { + metaKeyId: "$_id", // metaKeyId is needed for the $merge stage for deduplication, but will be removed in the final updateMany + generatedId: { + $function: { + body: "function() { return UUID().toString().replace('UUID(\"', '').replace('\")', ''); }", + args: [], + lang: "js", + }, + }, + }, + }, + // Stage 8: Final projection. // userGroupCounts: [{k,v}] array → plain object (stored as Map in Mongoose). // usageCount: union all per-group datasetId sets, count distinct IDs. { $project: { - _id: 1, + _id: "$generatedId", + metaKeyId: 1, key: 1, sourceType: { $literal: sourceType }, humanReadableName: 1, @@ -120,7 +133,7 @@ function buildPipeline(sourceType) { { $merge: { into: "MetadataKeys", - on: "_id", + on: "metaKeyId", whenMatched: [ { $replaceWith: { @@ -128,94 +141,71 @@ function buildPipeline(sourceType) { "$$new", { // Preserve original audit fields + _id: "$_id", createdAt: { $ifNull: ["$createdAt", "$$new.createdAt"] }, createdBy: { $ifNull: ["$createdBy", "$$new.createdBy"] }, updatedBy: { $literal: "migration" }, updatedAt: { $toDate: "$$NOW" }, - - isPublished: { - $cond: { - if: { - $gt: ["$usageCount", MAX_USER_GROUPS_PER_METADATA_KEY], - }, - then: true, - else: { $or: ["$isPublished", "$$new.isPublished"] }, - }, - }, - + isPublished: { $or: ["$isPublished", "$$new.isPublished"] }, userGroups: { - $cond: { - if: { - $gt: ["$usageCount", MAX_USER_GROUPS_PER_METADATA_KEY], - }, - then: "$userGroups", - else: { $setUnion: ["$userGroups", "$$new.userGroups"] }, - }, + $setUnion: ["$userGroups", "$$new.userGroups"], }, // Additively merge userGroupCounts — sum counts per group // across both the existing and incoming documents. userGroupCounts: { - $cond: { - if: { - $gt: ["$usageCount", MAX_USER_GROUPS_PER_METADATA_KEY], - }, - then: "$userGroupCounts", - else: { - $arrayToObject: { - $map: { - input: { - $setUnion: [ - { - $map: { - input: { - $objectToArray: "$userGroupCounts", - }, - as: "e", - in: "$$e.k", - }, + $arrayToObject: { + $map: { + input: { + $setUnion: [ + { + $map: { + input: { + $objectToArray: "$userGroupCounts", }, - { - $map: { - input: { - $objectToArray: "$$new.userGroupCounts", - }, - as: "e", - in: "$$e.k", - }, + as: "e", + in: "$$e.k", + }, + }, + { + $map: { + input: { + $objectToArray: "$$new.userGroupCounts", }, - ], + as: "e", + in: "$$e.k", + }, }, - as: "group", - in: { - k: "$$group", - v: { - $add: [ + ], + }, + as: "group", + in: { + k: "$$group", + v: { + $add: [ + { + $ifNull: [ { - $ifNull: [ - { - $getField: { - field: "$$group", - input: "$userGroupCounts", - }, - }, - 0, - ], + $getField: { + field: "$$group", + input: "$userGroupCounts", + }, }, + 0, + ], + }, + { + $ifNull: [ { - $ifNull: [ - { - $getField: { - field: "$$group", - input: "$$new.userGroupCounts", - }, - }, - 0, - ], + $getField: { + field: "$$group", + input: "$$new.userGroupCounts", + }, }, + 0, ], }, - }, + ], }, }, }, @@ -242,6 +232,11 @@ module.exports = { // Wipe MetadataKeys collection first to ensure a clean state const deleted = await db.collection("MetadataKeys").deleteMany({}); + + await db + .collection("MetadataKeys") + .createIndex({ metaKeyId: 1 }, { unique: true }); + console.log( `[${elapsed()}] Cleared ${deleted.deletedCount} existing MetadataKeys`, ); @@ -305,6 +300,11 @@ module.exports = { console.log(`[${elapsed()}] ✅ ${collection} done`); } + await db.collection("MetadataKeys").dropIndex("metaKeyId_1"); + await db + .collection("MetadataKeys") + .updateMany({}, [{ $set: { id: "$_id" } }, { $unset: ["metaKeyId"] }]); + const result = await db.collection("MetadataKeys").countDocuments(); console.log( `[${elapsed()}] Migration completed — Total MetadataKeys: ${result.toLocaleString()}`, From 0aeb977643029f75fe48189fce689e506f1eee8f Mon Sep 17 00:00:00 2001 From: junjiequan Date: Mon, 4 May 2026 17:17:02 +0200 Subject: [PATCH 23/29] Replace concat _id with compound filter (sourceType + key + humanReadableName) --- .../metadatakeys.service.spec.ts | 104 ++++++++---------- src/metadata-keys/metadatakeys.service.ts | 92 ++++++++-------- 2 files changed, 92 insertions(+), 104 deletions(-) diff --git a/src/metadata-keys/metadatakeys.service.spec.ts b/src/metadata-keys/metadatakeys.service.spec.ts index 7f66ca359..e2ac659ab 100644 --- a/src/metadata-keys/metadatakeys.service.spec.ts +++ b/src/metadata-keys/metadatakeys.service.spec.ts @@ -7,7 +7,7 @@ const modelMock = { aggregate: jest .fn() .mockReturnValue({ exec: jest.fn().mockResolvedValue([]) }), - bulkWrite: jest.fn().mockResolvedValue({}), + findOneAndUpdate: jest.fn().mockResolvedValue({}), updateMany: jest.fn().mockResolvedValue({}), deleteMany: jest.fn().mockResolvedValue({}), }; @@ -122,89 +122,71 @@ describe("MetadataKeysService", () => { it("does nothing when metadata is empty", async () => { await service.insertManyFromSource({ ...BASE_DOC, metadata: {} }); - expect(modelMock.bulkWrite).not.toHaveBeenCalled(); + expect(modelMock.findOneAndUpdate).not.toHaveBeenCalled(); }); - it("calls bulkWrite with one updateOne per metadata key", async () => { + it("calls findOneAndUpdate once per metadata key", async () => { await service.insertManyFromSource(BASE_DOC); - - expect(modelMock.bulkWrite).toHaveBeenCalledTimes(1); - const [ops] = modelMock.bulkWrite.mock.calls[0]; - expect(ops).toHaveLength(2); - expect(ops.every((op: object) => "updateOne" in op)).toBe(true); - }); - - it("builds correct _id from sourceType + key + humanReadableName", async () => { - await service.insertManyFromSource(BASE_DOC); - - const [ops] = modelMock.bulkWrite.mock.calls[0]; - const ids = ops.map( - (op: { updateOne: { filter: { _id: string } } }) => - op.updateOne.filter._id, - ); - - expect(ids).toContain("Dataset_temperature_Temperature"); - expect(ids).toContain("Dataset_pressure_"); // no human_name + expect(modelMock.findOneAndUpdate).toHaveBeenCalledTimes(2); }); - it("sets upsert: true on every operation", async () => { + it("builds correct filter from sourceType + key + humanReadableName", async () => { await service.insertManyFromSource(BASE_DOC); - - const [ops] = modelMock.bulkWrite.mock.calls[0]; - expect( - ops.every( - (op: { updateOne: { upsert: boolean } }) => - op.updateOne.upsert === true, - ), - ).toBe(true); + const calls = modelMock.findOneAndUpdate.mock.calls; + const filters = calls.map((call: unknown[]) => call[0]); + expect(filters).toContainEqual({ + sourceType: "Dataset", + key: "temperature", + humanReadableName: "Temperature", + }); + expect(filters).toContainEqual({ + sourceType: "Dataset", + key: "pressure", + humanReadableName: "", + }); }); it("increments usageCount and per-group counts", async () => { await service.insertManyFromSource(BASE_DOC); - - const [ops] = modelMock.bulkWrite.mock.calls[0]; - const { $inc } = ops[0].updateOne.update; - - expect($inc.usageCount).toBe(1); - expect($inc["userGroupCounts.group-1"]).toBe(1); - expect($inc["userGroupCounts.group-2"]).toBe(1); + const [, update] = modelMock.findOneAndUpdate.mock.calls[0]; + expect(update.$inc.usageCount).toBe(1); + expect(update.$inc["userGroupCounts.group-1"]).toBe(1); + expect(update.$inc["userGroupCounts.group-2"]).toBe(1); }); it("adds userGroups via $addToSet", async () => { await service.insertManyFromSource(BASE_DOC); - - const [ops] = modelMock.bulkWrite.mock.calls[0]; - const { $addToSet } = ops[0].updateOne.update; - - expect($addToSet.userGroups.$each).toEqual(["group-1", "group-2"]); + const [, update] = modelMock.findOneAndUpdate.mock.calls[0]; + expect(update.$addToSet.userGroups.$each).toEqual(["group-1", "group-2"]); }); it("sets isPublished when dataset is published", async () => { await service.insertManyFromSource({ ...BASE_DOC, isPublished: true }); - - const [ops] = modelMock.bulkWrite.mock.calls[0]; - expect(ops[0].updateOne.update.$set.isPublished).toBe(true); + const [, update] = modelMock.findOneAndUpdate.mock.calls[0]; + expect(update.$set.isPublished).toBe(true); }); it("does not set isPublished when dataset is not published", async () => { await service.insertManyFromSource({ ...BASE_DOC, isPublished: false }); - - const [ops] = modelMock.bulkWrite.mock.calls[0]; - expect(ops[0].updateOne.update.$set.isPublished).toBeUndefined(); + const [, update] = modelMock.findOneAndUpdate.mock.calls[0]; + expect(update.$set.isPublished).toBeUndefined(); }); it("uses $setOnInsert for immutable fields", async () => { await service.insertManyFromSource(BASE_DOC); - - const [ops] = modelMock.bulkWrite.mock.calls[0]; - const { $setOnInsert } = ops[0].updateOne.update; - - expect($setOnInsert).toMatchObject({ + const [, update] = modelMock.findOneAndUpdate.mock.calls[0]; + expect(update.$setOnInsert).toMatchObject({ key: expect.any(String), sourceType: "Dataset", humanReadableName: expect.any(String), }); }); + + it("sets upsert: true", async () => { + await service.insertManyFromSource(BASE_DOC); + const [, , options] = modelMock.findOneAndUpdate.mock.calls[0]; + expect(options.upsert).toBe(true); + }); }); // ------------------------------------------------------------------------- @@ -244,19 +226,27 @@ describe("MetadataKeysService", () => { expect(firstUpdate.$inc["userGroupCounts.group-2"]).toBe(-1); }); - it("targets correct _ids based on metadata keys and humanReadableName", async () => { + it("targets correct filter based on metadata keys and humanReadableName", async () => { await service.deleteMany(BASE_DOC); const [firstFilter] = modelMock.updateMany.mock.calls[0]; - expect(firstFilter._id.$in).toContain("Dataset_temperature_Temperature"); - expect(firstFilter._id.$in).toContain("Dataset_pressure_"); + expect(firstFilter.$or).toContainEqual({ + sourceType: "Dataset", + key: "temperature", + humanReadableName: "Temperature", + }); + expect(firstFilter.$or).toContainEqual({ + sourceType: "Dataset", + key: "pressure", + humanReadableName: "", + }); }); it("deletes documents where usageCount <= 0", async () => { await service.deleteMany(BASE_DOC); const [deleteFilter] = modelMock.deleteMany.mock.calls[0]; - expect(deleteFilter.usageCount).toEqual({ $lte: 0 }); + expect(deleteFilter.$and[1].usageCount).toEqual({ $lte: 0 }); }); it("recompute stage uses $set with $objectToArray on userGroupCounts", async () => { diff --git a/src/metadata-keys/metadatakeys.service.ts b/src/metadata-keys/metadatakeys.service.ts index 1b6e097c8..bcb1978ab 100644 --- a/src/metadata-keys/metadatakeys.service.ts +++ b/src/metadata-keys/metadatakeys.service.ts @@ -28,6 +28,14 @@ export type MetadataSourceDoc = { const RECOMPUTE_USER_GROUPS_STAGE = [ { $set: { + userGroupCounts: { + $arrayToObject: { + $filter: { + input: { $objectToArray: "$userGroupCounts" }, + cond: { $gt: ["$$this.v", 0] }, + }, + }, + }, userGroups: { $map: { input: { @@ -110,39 +118,28 @@ export class MetadataKeysService { const ops = Object.entries(metadata).map(([key, entry]) => { const humanReadableName = (entry as ScientificMetadataEntry).human_name ?? ""; - const id = this.buildId(sourceType, key, humanReadableName); - - return { - updateOne: { - filter: { _id: id }, - update: { - $setOnInsert: addCreatedByFields( - { - _id: id as unknown as MetadataKeyDocument["_id"], - id, - key, - sourceType, - humanReadableName, - }, - "system", - ), - $set: { - // Only ever set to true inline. The false case is handled - // by the reconciliation cronjob since it transitions rarely. - ...(isPublished && { isPublished: true }), - }, - $addToSet: { userGroups: { $each: userGroups } }, - $inc: { - usageCount: 1, - ...this.groupCountDeltas(userGroups, 1), - }, + + return this.metadataKeyModel.findOneAndUpdate( + { sourceType, key, humanReadableName }, + { + $setOnInsert: addCreatedByFields( + { key, sourceType, humanReadableName }, + "system", + ), + $set: { + ...(isPublished && { isPublished: true }), + }, + $addToSet: { userGroups: { $each: userGroups } }, + $inc: { + usageCount: 1, + ...this.groupCountDeltas(userGroups, 1), }, - upsert: true, }, - }; + { upsert: true }, + ); }); - await this.metadataKeyModel.bulkWrite(ops); + await Promise.all(ops); Logger.log( `Upserted MetadataKeys for ${sourceType}: ${Object.keys(metadata).join(", ")}`, @@ -158,34 +155,34 @@ export class MetadataKeysService { * (drops any group whose count reached zero) * 3. Delete entries no longer referenced by any dataset * - * usageCount is the authoritative deletion signal because a dataset - * with no userGroups and isPublished: false would be invisible to - * both userGroupCounts and isPublished checks. + */ async deleteMany(doc: MetadataSourceDoc): Promise { if (isEmpty(doc.metadata)) return; const { sourceType, userGroups, metadata } = doc; const keys = Object.keys(metadata); - const ids = keys.map((key) => { + const filters = keys.map((key) => { const humanReadableName = (metadata[key] as ScientificMetadataEntry)?.human_name ?? ""; - return this.buildId(sourceType, key, humanReadableName); + return this.buildFilter(sourceType, key, humanReadableName); }); - const filter = { _id: { $in: ids } }; + const queryFilter = { $or: filters }; - await this.metadataKeyModel.updateMany(filter, { + await this.metadataKeyModel.updateMany(queryFilter, { $inc: { usageCount: -1, ...this.groupCountDeltas(userGroups, -1), }, }); - await this.metadataKeyModel.updateMany(filter, RECOMPUTE_USER_GROUPS_STAGE); + await this.metadataKeyModel.updateMany( + queryFilter, + RECOMPUTE_USER_GROUPS_STAGE, + ); await this.metadataKeyModel.deleteMany({ - ...filter, - usageCount: { $lte: 0 }, + $and: [queryFilter, { usageCount: { $lte: 0 } }], }); Logger.log( @@ -236,12 +233,12 @@ export class MetadataKeysService { ]); } - private buildId( + private buildFilter( sourceType: string, key: string, humanReadableName: string, - ): string { - return `${sourceType}_${key}_${humanReadableName}`; + ): FilterQuery { + return { sourceType, key, humanReadableName }; } /** @@ -307,14 +304,15 @@ export class MetadataKeysService { humanNameUnchangedKeys.length > 0 && (hasGroupChanges || publishedFlippedOn) ) { - const ids = humanNameUnchangedKeys.map((k) => { + const filters = humanNameUnchangedKeys.map((k) => { const humanReadableName = (newDoc.metadata[k] as ScientificMetadataEntry)?.human_name ?? ""; - return this.buildId(sourceType, k, humanReadableName); + return this.buildFilter(sourceType, k, humanReadableName); }); - const filter = { _id: { $in: ids } }; - await this.metadataKeyModel.updateMany(filter, { + const queryFilter = { $or: filters }; + + await this.metadataKeyModel.updateMany(queryFilter, { ...(addedGroups.length > 0 && { $addToSet: { userGroups: { $each: addedGroups } }, }), @@ -329,7 +327,7 @@ export class MetadataKeysService { // Recompute userGroups to drop groups whose count reached zero if (removedGroups.length > 0) { await this.metadataKeyModel.updateMany( - filter, + queryFilter, RECOMPUTE_USER_GROUPS_STAGE, ); } From 3b10fdc4d0b42a2bfd2312fcf120e2b5970ceae7 Mon Sep 17 00:00:00 2001 From: junjiequan Date: Mon, 11 May 2026 12:39:28 +0200 Subject: [PATCH 24/29] improve migration script and documentation --- ...aset-scientificMetadata-to-metadatakeys.md | 458 +++++++++++++----- ...aset-scientificMetadata-to-metadatakeys.js | 120 +---- 2 files changed, 331 insertions(+), 247 deletions(-) diff --git a/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md b/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md index c753c9cc1..250483db1 100644 --- a/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md +++ b/docs/developer-guide/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.md @@ -38,12 +38,13 @@ Each `MetadataKey` document tracks: --- -## Output shape +## MetadataKey shape ```js // MetadataKey document { - _id: "Dataset_temperature_Temperature", // ${sourceType}_${key}_${humanReadableName} + _id: "550e8400-e29b-41d4-a716-446655440000", + id:"550e8400-e29b-41d4-a716-446655440000", key: "temperature", humanReadableName: "Temperature", sourceType: "Dataset", @@ -60,189 +61,365 @@ Each `MetadataKey` document tracks: ## Migration Pipeline walkthrough -The migration pipeline uses two datasets as a running example throughout: - -| | ownerGroup | accessGroups | isPublished | scientificMetadata keys | -| --------- | ---------- | ------------ | ----------- | ----------------------- | -| Dataset A | group-1 | [group-2] | true | temperature, pressure | -| Dataset B | group-1 | [] | false | temperature | +It builds a MetadataKeys collection by extracting and aggregating scientific metadata keys from datasets. Each document in MetadataKeys represents one unique metadata key, enriched with access group membership, usage counts, and publication status. --- -### Stage 1 — Flatten scientificMetadata +### Stage 1 — Flatten scientificMetadata into an array + +```js +{ + $project: { + datasetId: "$_id", + ownerGroup: 1, + accessGroups: 1, + isPublished: 1, + metaArr: { $objectToArray: "$scientificMetadata" }, + }, +} +``` + +**What it does:** Converts the scientificMetadata object into an array of {k, v} pairs so it can be unwound in the next stage. Preserves \_id as datasetId for later use in usage counting. -Expose `_id` as `datasetId` and convert `scientificMetadata` from an object to an array of `{k, v}` pairs using `$objectToArray`. +**Input** ```js -// Input { - _id: "uuid-A", - ownerGroup: "group-1", - accessGroups: ["group-2"], - isPublished: true, - scientificMetadata: { - temperature: { value: 100, unit: "C", human_name: "Temperature" }, - pressure: { value: 1, unit: "bar" }, + "_id": "ds1", + "ownerGroup": "groupA", + "accessGroups": ["groupB"], + "isPublished": false, + "scientificMetadata": { + "temperature": { "human_name": "Temperature", "value": 100 }, + "pressure": { "human_name": "Pressure", "value": 200 } } } +``` -// Output +**Output** + +```js { - datasetId: "uuid-A", - ownerGroup: "group-1", - accessGroups: ["group-2"], - isPublished: true, - metaArr: [ - { k: "temperature", v: { value: 100, unit: "C", human_name: "Temperature" } }, - { k: "pressure", v: { value: 1, unit: "bar" } }, + "datasetId": "ds1", + "ownerGroup": "groupA", + "accessGroups": ["groupB"], + "isPublished": false, + "metaArr": [ + { "k": "temperature", "v": { "human_name": "Temperature", "value": 100 } }, + { "k": "pressure", "v": { "human_name": "Pressure", "value": 200 } } ] } ``` --- -### Stage 2 — Unwind metaArr +### Stage 2 — One document per metadata key -One document per `(dataset, metadata key)`. +```js +{ + $unwind: "$metaArr"; +} +``` + +**What it does:** Produces one document per metadata key entry. A dataset with N metadata keys becomes N documents. +**Input (from stage1)** + +```js +{ + "datasetId": "ds1", + "metaArr": [ + { "k": "temperature", "v": { "human_name": "Temperature" } }, + { "k": "pressure", "v": { "human_name": "Pressure" } } + ] +} ``` -Input: 1 document (Dataset A) with metaArr of length 2 -Output: 2 documents -{ datasetId: "uuid-A", ..., metaArr: { k: "temperature", v: { human_name: "Temperature", ... } } } -{ datasetId: "uuid-A", ..., metaArr: { k: "pressure", v: { ... } } } +**Output** + +```js +{ "datasetId": "ds1", "metaArr": { "k": "temperature", "v": { "human_name": "Temperature" } } } +{ "datasetId": "ds1", "metaArr": { "k": "pressure", "v": { "human_name": "Pressure" } } } ``` --- -### Stage 3 — Shape each (dataset, key) document - -Extract `key`, `humanReadableName`, and compute `userGroups` as the union of `ownerGroup` and `accessGroups`. +### Stage 3 — Shape each document (datasetId+key) with HRM and userGroups ```js -// Input (one of the two documents from stage 3) -{ datasetId: "uuid-A", ownerGroup: "group-1", accessGroups: ["group-2"], isPublished: true, metaArr: { k: "temperature", v: { human_name: "Temperature" } } } +{ + $project: { + datasetId: 1, + key: "$metaArr.k", + isPublished: 1, + humanReadableName: { $ifNull: ["$metaArr.v.human_name", ""] }, + userGroups: { + $setUnion: [["$ownerGroup"], { $ifNull: ["$accessGroups", []] }], + }, + }, +} +``` -// Output (all documents after both datasets are processed) -{ datasetId: "uuid-A", key: "temperature", isPublished: true, humanReadableName: "Temperature", userGroups: ["group-1", "group-2"] } -{ datasetId: "uuid-A", key: "pressure", isPublished: true, humanReadableName: "", userGroups: ["group-1", "group-2"] } -{ datasetId: "uuid-B", key: "temperature", isPublished: false, humanReadableName: "Temperature", userGroups: ["group-1"] } +**What it does:** Extracts the key name and human-readable name. Computes userGroups as the union of ownerGroup and accessGroups — every group that has access to this dataset. + +**Input (from stage2)** + +```js +{ + "datasetId": "ds1", + "ownerGroup": "groupA", + "accessGroups": ["groupB"], + "isPublished": false, + "metaArr": { "k": "temperature", "v": { "human_name": "Temperature" } } +} ``` -> `ownerGroup` is a mandatory field, so no null fallback is needed. `accessGroups` is optional so it falls back to `[]`. +**output** + +```js +{ + "datasetId": "ds1", + "key": "temperature", + "humanReadableName": "Temperature", + "isPublished": false, + "userGroups": ["groupA", "groupB"] +} +``` --- -### Stage 4 — Unwind userGroups +### Stage 4 — One document per (dataset+key+group) + +```js +{ + $unwind: { + path: "$userGroups", + }, +} +``` -One document per `(dataset, key, group)`. This is the pivot that makes per-group counting possible. +**What it does:** Split userGroups so each group gets its own document. This allows grouping by (key, group) in Stage 6. -`preserveNullAndEmptyArrays: true` keeps datasets with no groups so `usageCount` stays accurate. +**Input (from stage3)** +```js +{ + "datasetId": "ds1", + "key": "temperature", + "userGroups": ["groupA", "groupB"] +} ``` -Input -{ datasetId: "uuid-A", key: "temperature", humanReadableName: "Temperature", isPublished: true, userGroups: ["group-1", "group-2"] } -Output -{ datasetId: "uuid-A", key: "temperature", humanReadableName: "Temperature", isPublished: true, userGroups: "group-1" } -{ datasetId: "uuid-A", key: "temperature", humanReadableName: "Temperature", isPublished: true, userGroups: "group-2" } -{ datasetId: "uuid-A", key: "pressure", humanReadableName: "", isPublished: true, userGroups: "group-1" } -{ datasetId: "uuid-A", key: "pressure", humanReadableName: "", isPublished: true, userGroups: "group-2" } -{ datasetId: "uuid-B", key: "temperature", humanReadableName: "Temperature", isPublished: false, userGroups: "group-1" } +**output** + +```js +{ "datasetId": "ds1", "key": "temperature", "userGroups": "groupA" } +{ "datasetId": "ds1", "key": "temperature", "userGroups": "groupB" } ``` --- -### Stage 5 — Filter null userGroups +### Stage 5 — Group by (metaKeyId, group) -Drop documents where `userGroups` is `null`. This only occurs when `preserveNullAndEmptyArrays` retains a document from a dataset that had no groups at all. +```js +{ + $group: { + _id: { + metaKeyId: { $concat: [`${sourceType}_`, "$key", "_", "$humanReadableName"] }, + group: "$userGroups", + }, + key: { $first: "$key" }, + humanReadableName: { $first: "$humanReadableName" }, + isPublished: { $max: "$isPublished" }, + groupCount: { $sum: 1 }, + datasetIds: { $addToSet: "$datasetId" }, + }, +} +``` +**What it does:** Groups by (metadata key, group) pair. Computes: + +- `metaKeyId` is a stable, deterministic identifier derived from `${sourceType}_${key}_${humanReadableName}` used as the merge key in Stage 9 to prevent duplicate documents across pipeline runs. +- `groupCount` indicates how many datasets with this group use this key +- `datasetIds` includes distinct dataset IDs for this group, used later to count unique datasets across all groups without double-counting + +**Input (from stage4)** + +```js +{ "datasetId": "ds1", "key": "temperature", "humanReadableName": "Temperature", "userGroups": "groupA", "isPublished": false } +{ "datasetId": "ds2", "key": "temperature", "humanReadableName": "Temperature", "userGroups": "groupA", "isPublished": true } +{ "datasetId": "ds1", "key": "temperature", "humanReadableName": "Temperature", "userGroups": "groupB", "isPublished": false } ``` -Input: stream from stage 5, some may have userGroups: null -Output: only documents where userGroups is a real group name + +**Output** + +```js +{ + "_id": { "metaKeyId": "dataset_temperature_Temperature", "group": "groupA" }, + "key": "temperature", + "humanReadableName": "Temperature", + "isPublished": true, + "groupCount": 2, + "datasetIds": ["ds1", "ds2"] +} +{ + "_id": { "metaKeyId": "dataset_temperature_Temperature", "group": "groupB" }, + "key": "temperature", + "humanReadableName": "Temperature", + "isPublished": false, + "groupCount": 1, + "datasetIds": ["ds1"] +} ``` --- -### Stage 6 — Group by (metaKeyId, group) - -Each bucket is one unique `(key, humanReadableName, group)` combination. -`groupCount` = how many datasets with this group reference this key. +### Stage 6 — Group by metaKeyId +```js +{ + $group: { + _id: "$_id.metaKeyId", + key: { $first: "$key" }, + humanReadableName: { $first: "$humanReadableName" }, + isPublished: { $max: "$isPublished" }, + userGroups: { $push: "$_id.group" }, + userGroupCountsArr: { $push: { k: "$_id.group", v: "$groupCount" } }, + datasetIdSets: { $push: "$datasetIds" }, + }, +} ``` -Input (5 documents from stage 6) -{ datasetId: "uuid-A", key: "temperature", humanReadableName: "Temperature", isPublished: true, userGroups: "group-1" } -{ datasetId: "uuid-A", key: "temperature", humanReadableName: "Temperature", isPublished: true, userGroups: "group-2" } -{ datasetId: "uuid-A", key: "pressure", humanReadableName: "", isPublished: true, userGroups: "group-1" } -{ datasetId: "uuid-A", key: "pressure", humanReadableName: "", isPublished: true, userGroups: "group-2" } -{ datasetId: "uuid-B", key: "temperature", humanReadableName: "Temperature", isPublished: false, userGroups: "group-1" } -Output (4 buckets — one per unique metaKeyId + group) -{ _id: { metaKeyId: "Dataset_temperature_Temperature", group: "group-1" }, key: "temperature", humanReadableName: "Temperature", isPublished: true, groupCount: 2 } -{ _id: { metaKeyId: "Dataset_temperature_Temperature", group: "group-2" }, key: "temperature", humanReadableName: "Temperature", isPublished: true, groupCount: 1 } -{ _id: { metaKeyId: "Dataset_pressure_", group: "group-1" }, key: "pressure", humanReadableName: "", isPublished: true, groupCount: 1 } -{ _id: { metaKeyId: "Dataset_pressure_", group: "group-2" }, key: "pressure", humanReadableName: "", isPublished: true, groupCount: 1 } +**What it does:** Reassembles one document per metadata key by collecting all per-group data. datasetIdSets is a list of per-group dataset ID sets — merged in Stage 8 to compute total unique dataset count. + +**Input (from stage5)** + +```js +{ "_id": { "metaKeyId": "dataset_temperature_Temperature", "group": "groupA" }, "groupCount": 2, "datasetIds": ["ds1", "ds2"] } +{ "_id": { "metaKeyId": "dataset_temperature_Temperature", "group": "groupB" }, "groupCount": 1, "datasetIds": ["ds1"] } ``` -> `isPublished: $max` means the field is `true` if **any** contributing dataset is published. +**Output** + +```js +{ + "_id": "dataset_temperature_Temperature", + "key": "temperature", + "humanReadableName": "Temperature", + "isPublished": true, + "userGroups": ["groupA", "groupB"], + "userGroupCountsArr": [ + { "k": "groupA", "v": 2 }, + { "k": "groupB", "v": 1 } + ], + "datasetIdSets": [["ds1", "ds2"], ["ds1"]] +} +``` --- -### Stage 7 — Group by metaKeyId +### Stage 7 — Add generated UUID -Reassemble one document per metadata key by collecting the per-group buckets from stage 7. +```js +{ + $addFields: { + metaKeyId: "$_id", + generatedId: { + $function: { + body: "function() { return UUID().toString().replace('UUID(\"', '').replace('\")', ''); }", + args: [], + lang: "js", + }, + }, + }, +} +``` +**What it does:** Saves `_id` which at this point is `${sourceType}_${key}_${humanReadableName}` as metaKeyId before it gets replaced. Generates a UUID for \_id to support future document splitting when a document approaches MongoDB's 16MB size limit, while metaKeyId remains the stable merge key for Stage 8. + +**Input (from stage7)** + +```json +{ "_id": "dataset_temperature_Temperature", ... } ``` -Input (4 buckets from stage 7) -{ _id: { metaKeyId: "Dataset_temperature_Temperature", group: "group-1" }, groupCount: 2, ... } -{ _id: { metaKeyId: "Dataset_temperature_Temperature", group: "group-2" }, groupCount: 1, ... } -{ _id: { metaKeyId: "Dataset_pressure_", group: "group-1" }, groupCount: 1, ... } -{ _id: { metaKeyId: "Dataset_pressure_", group: "group-2" }, groupCount: 1, ... } -Output (2 documents — one per unique key) -{ - _id: "Dataset_temperature_Temperature", - key: "temperature", humanReadableName: "Temperature", isPublished: true, - userGroups: ["group-1", "group-2"], - userGroupCountsArr: [{ k: "group-1", v: 2 }, { k: "group-2", v: 1 }], - datasetIdSets: [["uuid-A", "uuid-B"], ["uuid-A"]] -} +**Output** + +```json { - _id: "Dataset_pressure_", - key: "pressure", humanReadableName: "", isPublished: true, - userGroups: ["group-1", "group-2"], - userGroupCountsArr: [{ k: "group-1", v: 1 }, { k: "group-2", v: 1 }], - datasetIdSets: [["uuid-A"], ["uuid-A"]] + "_id": "dataset_temperature_Temperature", + "metaKeyId": "dataset_temperature_Temperature", + "generatedId": "550e8400-e29b-41d4-a716-446655440000", + ... } ``` --- -### Stage 8 — Final projection +### Stage 8 — Project final document shape + +```js +{ + $project: { + _id: "$generatedId", + metaKeyId: 1, + key: 1, + sourceType: { $literal: sourceType }, + humanReadableName: 1, + isPublished: 1, + userGroups: 1, + userGroupCounts: { $arrayToObject: "$userGroupCountsArr" }, + usageCount: { + $size: { + $reduce: { + input: "$datasetIdSets", + initialValue: [], + in: { $setUnion: ["$$value", "$$this"] }, + }, + }, + }, + createdBy: { $literal: "migration" }, + createdAt: { $toDate: "$$NOW" }, + }, +} +``` + +**What it does:** Produces the final document shape for MetadataKeys. Converts userGroupCountsArr to an object map. Computes usageCount by merging all per-group datasetIdSets into a single set and counting — this avoids double-counting datasets that belong to multiple groups. -- `userGroupCounts`: converts `[{k, v}]` array to a plain object (Mongoose stores this as a `Map`) -- `usageCount`: unions all per-group `datasetId` sets and counts distinct IDs +**Why set union for usageCount:** +```js +datasetIds: ["ds1", "ds2"]; //groupA +datasetIds: ["ds1"]; // groupB +union: ["ds1", "ds2"]; // usageCount = 2 (not 3) ``` -// usageCount for temperature: -// datasetIdSets = [["uuid-A", "uuid-B"], ["uuid-A"]] -// union = ["uuid-A", "uuid-B"] -// count = 2 + +**Input (from stage7)** + +```js +{ + "generatedId": "550e8400-e29b-41d4-a716-446655440000", + "metaKeyId": "dataset_temperature_Temperature", + "userGroupCountsArr": [{ "k": "groupA", "v": 2 }, { "k": "groupB", "v": 1 }], + "datasetIdSets": [["ds1", "ds2"], ["ds1"]] +} ``` +**Output** + ```js -// Output (final document written to MetadataKeys) { - _id: "Dataset_temperature_Temperature", - key: "temperature", - sourceType: "Dataset", - humanReadableName: "Temperature", - isPublished: true, - userGroups: ["group-1", "group-2"], - userGroupCounts: { "group-1": 2, "group-2": 1 }, - usageCount: 2, - createdBy: "migration", - createdAt: ISODate("...") + "_id": "550e8400-e29b-41d4-a716-446655440000", + "metaKeyId": "dataset_temperature_Temperature", + "key": "temperature", + "sourceType": "dataset", + "humanReadableName": "Temperature", + "isPublished": true, + "userGroups": ["groupA", "groupB"], + "userGroupCounts": { "groupA": 2, "groupB": 1 }, + "usageCount": 2, + "createdBy": "migration", + "createdAt": "2026-05-07T00:00:00.000Z" } ``` @@ -250,31 +427,54 @@ Output (2 documents — one per unique key) ### Stage 9 — Merge into MetadataKeys -- `whenNotMatched: insert` — new documents are inserted as-is -- `whenMatched` — additively merges counts when two `SOURCE_COLLECTIONS` produce the same `_id` - -> This is not triggered today since `sourceType` is part of `_id`, making collisions between collections impossible. It is kept correct for when `Proposal`, `Sample`, or other collections are added as sources. +```js +{ + $merge: { + into: "MetadataKeys", + on: "metaKeyId", + whenMatched: [ + { + $replaceWith: { + $mergeObjects: [ + "$$new", + { _id: "$_id" } + ] + } + } + ], + whenNotMatched: "insert", + }, +} +``` -**Group cap behaviour** +**What it does:** Upserts each document into `MetadataKeys` using `metaKeyId` as the match key. On match, replaces the existing document entirely with the incoming one, preserving only `_id` since MongoDB does not allow changing it. -When merging, if the existing document's `usageCount` exceeds `MAX_USER_GROUPS_PER_METADATA_KEY`: +**Input (from Stage 9)** -- `userGroups` is frozen — no new groups are added -- `userGroupCounts` is frozen — no new counts are merged -- `isPublished` is forced to `true` — key remains publicly discoverable -- `usageCount` continues to increment regardless +```json +{ + "_id": "550e8400-e29b-41d4-a716-446655440000", + "metaKeyId": "dataset_temperature_Temperature", + "userGroups": ["groupA", "groupB"], + "userGroupCounts": { "groupA": 2, "groupB": 1 }, + "usageCount": 2, + "isPublished": true +} +``` -This prevents unbounded document growth while ensuring heavily-used keys remain visible to all users via `isPublished: true`. +**Output (inserted or replaced):** -```js -// whenMatched userGroupCounts example -// existing: { "group-1": 3, "group-2": 1 } -// incoming: { "group-1": 2, "group-3": 5 } -// result: { "group-1": 5, "group-2": 1, "group-3": 5 } +```json +{ + "_id": "550e8400-e29b-41d4-a716-446655440000", + "metaKeyId": "dataset_temperature_Temperature", + "userGroups": ["groupA", "groupB"], + "userGroupCounts": { "groupA": 2, "groupB": 1 }, + "usageCount": 2, + "isPublished": true +} ``` ---- - ## Running the migration ```bash diff --git a/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js b/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js index 712ca4328..a42626c7c 100644 --- a/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js +++ b/migrations/20260420145401-sync-dataset-scientificMetadata-to-metadatakeys.js @@ -3,8 +3,6 @@ const BATCH_SIZE = 10000; function buildPipeline(sourceType) { return [ - // Stage 1: Flatten scientificMetadata into an array of key/value pairs. - // Preserve _id as datasetId so we can count unique datasets later. { $project: { datasetId: "$_id", @@ -14,12 +12,7 @@ function buildPipeline(sourceType) { metaArr: { $objectToArray: "$scientificMetadata" }, }, }, - - // Stage 2: One document per (dataset, metadata key) { $unwind: "$metaArr" }, - - // Stage 3: Shape each (dataset, key) document. - // userGroups is the union of ownerGroup + accessGroups for this dataset. { $project: { datasetId: 1, @@ -31,27 +24,11 @@ function buildPipeline(sourceType) { }, }, }, - - // Stage 4: One document per (dataset, key, group). { $unwind: { path: "$userGroups", - preserveNullAndEmptyArrays: true, - }, - }, - - // Stage 5: Filter out null and empty-string groups. - // Note: datasets with no valid groups are excluded from usageCount. - // This is acceptable since ownerGroup is a required field in practice. - { - $match: { - userGroups: { $nin: [null, ""] }, }, }, - - // Stage 6: Group by (metaKeyId, group). - // groupCount = how many datasets with this group use this key. - // datasetIds = set of distinct dataset IDs (for accurate usageCount). { $group: { _id: { @@ -67,11 +44,6 @@ function buildPipeline(sourceType) { datasetIds: { $addToSet: "$datasetId" }, }, }, - - // Stage 7: Group by metaKeyId to reassemble one document per metadata key. - // userGroupCountsArr will become the userGroupCounts Map. - // datasetIdSets is a list of per-group dataset ID sets — merged in the - // next stage to compute total unique dataset count (usageCount). { $group: { _id: "$_id.metaKeyId", @@ -85,10 +57,9 @@ function buildPipeline(sourceType) { datasetIdSets: { $push: "$datasetIds" }, }, }, - { $addFields: { - metaKeyId: "$_id", // metaKeyId is needed for the $merge stage for deduplication, but will be removed in the final updateMany + metaKeyId: "$_id", generatedId: { $function: { body: "function() { return UUID().toString().replace('UUID(\"', '').replace('\")', ''); }", @@ -98,10 +69,6 @@ function buildPipeline(sourceType) { }, }, }, - - // Stage 8: Final projection. - // userGroupCounts: [{k,v}] array → plain object (stored as Map in Mongoose). - // usageCount: union all per-group datasetId sets, count distinct IDs. { $project: { _id: "$generatedId", @@ -125,11 +92,6 @@ function buildPipeline(sourceType) { createdAt: { $toDate: "$$NOW" }, }, }, - - // Stage 9: Merge into MetadataKeys. - // whenMatched handles the (future) case where multiple SOURCE_COLLECTIONS - // produce the same _id. Not possible today since sourceType is part of - // the _id, but kept correct for when more collections are added. { $merge: { into: "MetadataKeys", @@ -137,85 +99,7 @@ function buildPipeline(sourceType) { whenMatched: [ { $replaceWith: { - $mergeObjects: [ - "$$new", - { - // Preserve original audit fields - _id: "$_id", - createdAt: { $ifNull: ["$createdAt", "$$new.createdAt"] }, - createdBy: { $ifNull: ["$createdBy", "$$new.createdBy"] }, - updatedBy: { $literal: "migration" }, - updatedAt: { $toDate: "$$NOW" }, - isPublished: { $or: ["$isPublished", "$$new.isPublished"] }, - userGroups: { - $setUnion: ["$userGroups", "$$new.userGroups"], - }, - - // Additively merge userGroupCounts — sum counts per group - // across both the existing and incoming documents. - userGroupCounts: { - $arrayToObject: { - $map: { - input: { - $setUnion: [ - { - $map: { - input: { - $objectToArray: "$userGroupCounts", - }, - as: "e", - in: "$$e.k", - }, - }, - { - $map: { - input: { - $objectToArray: "$$new.userGroupCounts", - }, - as: "e", - in: "$$e.k", - }, - }, - ], - }, - as: "group", - in: { - k: "$$group", - v: { - $add: [ - { - $ifNull: [ - { - $getField: { - field: "$$group", - input: "$userGroupCounts", - }, - }, - 0, - ], - }, - { - $ifNull: [ - { - $getField: { - field: "$$group", - input: "$$new.userGroupCounts", - }, - }, - 0, - ], - }, - ], - }, - }, - }, - }, - }, - - // Add incoming dataset count to existing - usageCount: { $add: ["$usageCount", "$$new.usageCount"] }, - }, - ], + $mergeObjects: ["$$new", { _id: "$_id" }], }, }, ], From 32f95400b1fda14b2d4e9c1fc6dfca8052895ec2 Mon Sep 17 00:00:00 2001 From: junjiequan Date: Mon, 11 May 2026 13:41:02 +0200 Subject: [PATCH 25/29] improve metadataKeys doc update logic and add unit tests based on comments --- .../metadatakeys.service.spec.ts | 357 ++++++------------ src/metadata-keys/metadatakeys.service.ts | 299 ++++----------- 2 files changed, 185 insertions(+), 471 deletions(-) diff --git a/src/metadata-keys/metadatakeys.service.spec.ts b/src/metadata-keys/metadatakeys.service.spec.ts index e2ac659ab..fe959638e 100644 --- a/src/metadata-keys/metadatakeys.service.spec.ts +++ b/src/metadata-keys/metadatakeys.service.spec.ts @@ -1,3 +1,4 @@ +/* eslint-disable @typescript-eslint/no-explicit-any */ import { getModelToken } from "@nestjs/mongoose"; import { Test, TestingModule } from "@nestjs/testing"; import { MetadataKeysService, MetadataSourceDoc } from "./metadatakeys.service"; @@ -7,7 +8,7 @@ const modelMock = { aggregate: jest .fn() .mockReturnValue({ exec: jest.fn().mockResolvedValue([]) }), - findOneAndUpdate: jest.fn().mockResolvedValue({}), + bulkWrite: jest.fn().mockResolvedValue({}), updateMany: jest.fn().mockResolvedValue({}), deleteMany: jest.fn().mockResolvedValue({}), }; @@ -86,6 +87,44 @@ describe("MetadataKeysService", () => { ); }); + it("applies default sort when no limits provided", async () => { + await service.findAll({}, {}); + + const [pipeline] = modelMock.aggregate.mock.calls[0]; + expect(pipeline).toEqual( + expect.arrayContaining([ + expect.objectContaining({ $sort: { createdAt: -1 } }), + ]), + ); + }); + + it("applies default sort when limits provided without sort", async () => { + await service.findAll({ limits: { limit: 10, skip: 0 } }, {}); + + const [pipeline] = modelMock.aggregate.mock.calls[0]; + expect(pipeline).toEqual( + expect.arrayContaining([ + expect.objectContaining({ $sort: { createdAt: -1 } }), + ]), + ); + }); + + it("uses provided sort when specified", async () => { + await service.findAll( + { limits: { limit: 10, skip: 0, sort: { key: "asc" } } }, + {}, + ); + + const [pipeline] = modelMock.aggregate.mock.calls[0]; + expect(pipeline).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + $sort: expect.objectContaining({ key: 1 }), + }), + ]), + ); + }); + it("omits $project stage when no fields specified", async () => { await service.findAll({}, {}); @@ -94,12 +133,15 @@ describe("MetadataKeysService", () => { expect(hasProject).toBe(false); }); - it("omits $sort stage when no sort specified", async () => { - await service.findAll({ limits: { limit: 10, skip: 0 } }, {}); + it("sort stage comes before skip and limit", async () => { + await service.findAll({}, {}); const [pipeline] = modelMock.aggregate.mock.calls[0]; - const hasSort = pipeline.some((s: object) => "$sort" in s); - expect(hasSort).toBe(false); + const sortIndex = pipeline.findIndex((s: object) => "$sort" in s); + const skipIndex = pipeline.findIndex((s: object) => "$skip" in s); + const limitIndex = pipeline.findIndex((s: object) => "$limit" in s); + expect(sortIndex).toBeLessThan(skipIndex); + expect(sortIndex).toBeLessThan(limitIndex); }); it("returns aggregation results", async () => { @@ -122,18 +164,21 @@ describe("MetadataKeysService", () => { it("does nothing when metadata is empty", async () => { await service.insertManyFromSource({ ...BASE_DOC, metadata: {} }); - expect(modelMock.findOneAndUpdate).not.toHaveBeenCalled(); + expect(modelMock.bulkWrite).not.toHaveBeenCalled(); }); - it("calls findOneAndUpdate once per metadata key", async () => { + it("calls bulkWrite with one op per metadata key", async () => { await service.insertManyFromSource(BASE_DOC); - expect(modelMock.findOneAndUpdate).toHaveBeenCalledTimes(2); + + const [ops] = modelMock.bulkWrite.mock.calls[0]; + expect(ops).toHaveLength(2); }); it("builds correct filter from sourceType + key + humanReadableName", async () => { await service.insertManyFromSource(BASE_DOC); - const calls = modelMock.findOneAndUpdate.mock.calls; - const filters = calls.map((call: unknown[]) => call[0]); + + const [ops] = modelMock.bulkWrite.mock.calls[0]; + const filters = ops.map((op: any) => op.updateOne.filter); expect(filters).toContainEqual({ sourceType: "Dataset", key: "temperature", @@ -148,7 +193,9 @@ describe("MetadataKeysService", () => { it("increments usageCount and per-group counts", async () => { await service.insertManyFromSource(BASE_DOC); - const [, update] = modelMock.findOneAndUpdate.mock.calls[0]; + + const [ops] = modelMock.bulkWrite.mock.calls[0]; + const { update } = ops[0].updateOne; expect(update.$inc.usageCount).toBe(1); expect(update.$inc["userGroupCounts.group-1"]).toBe(1); expect(update.$inc["userGroupCounts.group-2"]).toBe(1); @@ -156,25 +203,33 @@ describe("MetadataKeysService", () => { it("adds userGroups via $addToSet", async () => { await service.insertManyFromSource(BASE_DOC); - const [, update] = modelMock.findOneAndUpdate.mock.calls[0]; + + const [ops] = modelMock.bulkWrite.mock.calls[0]; + const { update } = ops[0].updateOne; expect(update.$addToSet.userGroups.$each).toEqual(["group-1", "group-2"]); }); it("sets isPublished when dataset is published", async () => { await service.insertManyFromSource({ ...BASE_DOC, isPublished: true }); - const [, update] = modelMock.findOneAndUpdate.mock.calls[0]; - expect(update.$set.isPublished).toBe(true); + + const [ops] = modelMock.bulkWrite.mock.calls[0]; + const { update } = ops[0].updateOne; + expect(update.$max.isPublished).toBe(true); }); it("does not set isPublished when dataset is not published", async () => { await service.insertManyFromSource({ ...BASE_DOC, isPublished: false }); - const [, update] = modelMock.findOneAndUpdate.mock.calls[0]; - expect(update.$set.isPublished).toBeUndefined(); + + const [ops] = modelMock.bulkWrite.mock.calls[0]; + const { update } = ops[0].updateOne; + expect(update.$max?.isPublished).toBeFalsy(); }); it("uses $setOnInsert for immutable fields", async () => { await service.insertManyFromSource(BASE_DOC); - const [, update] = modelMock.findOneAndUpdate.mock.calls[0]; + + const [ops] = modelMock.bulkWrite.mock.calls[0]; + const { update } = ops[0].updateOne; expect(update.$setOnInsert).toMatchObject({ key: expect.any(String), sourceType: "Dataset", @@ -184,8 +239,9 @@ describe("MetadataKeysService", () => { it("sets upsert: true", async () => { await service.insertManyFromSource(BASE_DOC); - const [, , options] = modelMock.findOneAndUpdate.mock.calls[0]; - expect(options.upsert).toBe(true); + + const [ops] = modelMock.bulkWrite.mock.calls[0]; + expect(ops[0].updateOne.upsert).toBe(true); }); }); @@ -197,12 +253,17 @@ describe("MetadataKeysService", () => { it("does nothing when metadata is empty", async () => { await service.deleteMany({ ...BASE_DOC, metadata: {} }); + expect(modelMock.bulkWrite).not.toHaveBeenCalled(); expect(modelMock.updateMany).not.toHaveBeenCalled(); expect(modelMock.deleteMany).not.toHaveBeenCalled(); }); - it("runs three operations in order: decrement → recompute → delete", async () => { + it("runs operations in order: decrement → recompute → delete", async () => { const callOrder: string[] = []; + modelMock.bulkWrite.mockImplementation(() => { + callOrder.push("bulkWrite"); + return Promise.resolve({}); + }); modelMock.updateMany.mockImplementation(() => { callOrder.push("updateMany"); return Promise.resolve({}); @@ -214,16 +275,17 @@ describe("MetadataKeysService", () => { await service.deleteMany(BASE_DOC); - expect(callOrder).toEqual(["updateMany", "updateMany", "deleteMany"]); + expect(callOrder).toEqual(["bulkWrite", "updateMany", "deleteMany"]); }); it("decrements usageCount and per-group counts", async () => { await service.deleteMany(BASE_DOC); - const [, firstUpdate] = modelMock.updateMany.mock.calls[0]; - expect(firstUpdate.$inc.usageCount).toBe(-1); - expect(firstUpdate.$inc["userGroupCounts.group-1"]).toBe(-1); - expect(firstUpdate.$inc["userGroupCounts.group-2"]).toBe(-1); + const [ops] = modelMock.bulkWrite.mock.calls[0]; + const { update } = ops[0].updateOne; + expect(update.$inc.usageCount).toBe(-1); + expect(update.$inc["userGroupCounts.group-1"]).toBe(-1); + expect(update.$inc["userGroupCounts.group-2"]).toBe(-1); }); it("targets correct filter based on metadata keys and humanReadableName", async () => { @@ -249,12 +311,18 @@ describe("MetadataKeysService", () => { expect(deleteFilter.$and[1].usageCount).toEqual({ $lte: 0 }); }); - it("recompute stage uses $set with $objectToArray on userGroupCounts", async () => { + it("recompute stage uses $set with userGroups defined", async () => { await service.deleteMany(BASE_DOC); - // second updateMany call is the RECOMPUTE_USER_GROUPS_STAGE - const [, recomputeStage] = modelMock.updateMany.mock.calls[1]; - expect(recomputeStage[0].$set.userGroups).toBeDefined(); + const [, recomputeStage] = modelMock.updateMany.mock.calls[0]; + expect(recomputeStage[1].$set.userGroups).toBeDefined(); + }); + + it("does not set upsert on decrement ops", async () => { + await service.deleteMany(BASE_DOC); + + const [ops] = modelMock.bulkWrite.mock.calls[0]; + expect(ops[0].updateOne.upsert).toBe(false); }); }); @@ -263,233 +331,50 @@ describe("MetadataKeysService", () => { // ------------------------------------------------------------------------- describe("replaceManyFromSource", () => { - it("calls insertManyFromSource for keys only in newDoc", async () => { - const insertSpy = jest - .spyOn(service, "insertManyFromSource") - .mockResolvedValue(); - - const oldDoc = { - ...BASE_DOC, - metadata: { temperature: { human_name: "Temperature" } }, - }; - const newDoc = { - ...BASE_DOC, - metadata: { - temperature: { human_name: "Temperature" }, - wavelength: {}, - }, - }; - - await service.replaceManyFromSource(oldDoc, newDoc); - - expect(insertSpy).toHaveBeenCalledWith( - expect.objectContaining({ metadata: { wavelength: {} } }), - ); - }); - - it("calls deleteMany for keys only in oldDoc", async () => { - const deleteSpy = jest.spyOn(service, "deleteMany").mockResolvedValue(); - - const oldDoc = { - ...BASE_DOC, - metadata: { temperature: { human_name: "Temperature" }, pressure: {} }, - }; - const newDoc = { - ...BASE_DOC, - metadata: { temperature: { human_name: "Temperature" } }, - }; - - await service.replaceManyFromSource(oldDoc, newDoc); - - expect(deleteSpy).toHaveBeenCalledWith( - expect.objectContaining({ metadata: { pressure: {} } }), - ); - }); - - it("does not call deleteMany or insertManyFromSource for shared unchanged keys", async () => { + it("calls deleteMany with oldDoc then insertManyFromSource with newDoc", async () => { const deleteSpy = jest.spyOn(service, "deleteMany").mockResolvedValue(); const insertSpy = jest .spyOn(service, "insertManyFromSource") .mockResolvedValue(); - const doc = { - ...BASE_DOC, - metadata: { temperature: { human_name: "Temperature" } }, - }; - - await service.replaceManyFromSource(doc, doc); + await service.replaceManyFromSource(BASE_DOC, BASE_DOC); - expect(deleteSpy).not.toHaveBeenCalled(); - expect(insertSpy).not.toHaveBeenCalled(); + expect(deleteSpy).toHaveBeenCalledWith(BASE_DOC); + expect(insertSpy).toHaveBeenCalledWith(BASE_DOC); }); - it("handles all three buckets simultaneously", async () => { - const deleteSpy = jest.spyOn(service, "deleteMany").mockResolvedValue(); - const insertSpy = jest - .spyOn(service, "insertManyFromSource") - .mockResolvedValue(); - - const oldDoc = { - ...BASE_DOC, - metadata: { temperature: { human_name: "Temperature" }, pressure: {} }, - }; - const newDoc = { - ...BASE_DOC, - metadata: { - temperature: { human_name: "Temperature" }, - wavelength: {}, - }, - }; - - await service.replaceManyFromSource(oldDoc, newDoc); - - // pressure removed → deleteMany - expect(deleteSpy).toHaveBeenCalledWith( - expect.objectContaining({ metadata: { pressure: {} } }), - ); - // wavelength added → insertManyFromSource - expect(insertSpy).toHaveBeenCalledWith( - expect.objectContaining({ metadata: { wavelength: {} } }), - ); - }); - - it("does nothing when both docs have empty metadata", async () => { - const deleteSpy = jest.spyOn(service, "deleteMany").mockResolvedValue(); - const insertSpy = jest + it("calls deleteMany before insertManyFromSource", async () => { + const callOrder: string[] = []; + jest.spyOn(service, "deleteMany").mockImplementation(async () => { + callOrder.push("deleteMany"); + }); + jest .spyOn(service, "insertManyFromSource") - .mockResolvedValue(); - - const empty = { ...BASE_DOC, metadata: {} }; - await service.replaceManyFromSource(empty, empty); - - expect(deleteSpy).not.toHaveBeenCalled(); - expect(insertSpy).not.toHaveBeenCalled(); - }); - }); - - // ------------------------------------------------------------------------- - // updateSharedKeys (tested via replaceManyFromSource) - // ------------------------------------------------------------------------- - - describe("updateSharedKeys (via replaceManyFromSource)", () => { - it("increments added groups and decrements removed groups", async () => { - const oldDoc: MetadataSourceDoc = { - ...BASE_DOC, - userGroups: ["group-1"], - metadata: { temperature: { human_name: "Temperature" } }, - }; - const newDoc: MetadataSourceDoc = { - ...BASE_DOC, - userGroups: ["group-2"], - metadata: { temperature: { human_name: "Temperature" } }, - }; - - await service.replaceManyFromSource(oldDoc, newDoc); - - const [, update] = modelMock.updateMany.mock.calls[0]; - expect(update.$inc["userGroupCounts.group-2"]).toBe(1); - expect(update.$inc["userGroupCounts.group-1"]).toBe(-1); - }); - - it("runs RECOMPUTE_USER_GROUPS_STAGE when groups are removed", async () => { - const oldDoc: MetadataSourceDoc = { - ...BASE_DOC, - userGroups: ["group-1", "group-2"], - metadata: { temperature: { human_name: "Temperature" } }, - }; - const newDoc: MetadataSourceDoc = { - ...BASE_DOC, - userGroups: ["group-1"], - metadata: { temperature: { human_name: "Temperature" } }, - }; - - await service.replaceManyFromSource(oldDoc, newDoc); - - // two updateMany calls: one for the $inc, one for RECOMPUTE - expect(modelMock.updateMany).toHaveBeenCalledTimes(2); - }); - - it("does not run RECOMPUTE_USER_GROUPS_STAGE when only groups are added", async () => { - const oldDoc: MetadataSourceDoc = { - ...BASE_DOC, - userGroups: ["group-1"], - metadata: { temperature: { human_name: "Temperature" } }, - }; - const newDoc: MetadataSourceDoc = { - ...BASE_DOC, - userGroups: ["group-1", "group-2"], - metadata: { temperature: { human_name: "Temperature" } }, - }; - - await service.replaceManyFromSource(oldDoc, newDoc); - - // only one updateMany — no RECOMPUTE needed when nothing is removed - expect(modelMock.updateMany).toHaveBeenCalledTimes(1); - }); - - it("sets isPublished when it flips from false to true", async () => { - const oldDoc: MetadataSourceDoc = { - ...BASE_DOC, - isPublished: false, - metadata: { temperature: { human_name: "Temperature" } }, - }; - const newDoc: MetadataSourceDoc = { - ...BASE_DOC, - isPublished: true, - metadata: { temperature: { human_name: "Temperature" } }, - }; + .mockImplementation(async () => { + callOrder.push("insertManyFromSource"); + }); - await service.replaceManyFromSource(oldDoc, newDoc); + await service.replaceManyFromSource(BASE_DOC, BASE_DOC); - const [, update] = modelMock.updateMany.mock.calls[0]; - expect(update.$set.isPublished).toBe(true); + expect(callOrder).toEqual(["deleteMany", "insertManyFromSource"]); }); - it("does not touch isPublished when it flips from true to false", async () => { - const oldDoc: MetadataSourceDoc = { - ...BASE_DOC, - isPublished: true, - metadata: { temperature: { human_name: "Temperature" } }, - }; - const newDoc: MetadataSourceDoc = { - ...BASE_DOC, - isPublished: false, - metadata: { temperature: { human_name: "Temperature" } }, - }; - - await service.replaceManyFromSource(oldDoc, newDoc); - - // no group changes and no publishedFlippedOn → updateMany not called at all - expect(modelMock.updateMany).not.toHaveBeenCalled(); - }); - - it("treats humanReadableName change as delete + insert", async () => { - const deleteSpy = jest.spyOn(service, "deleteMany").mockResolvedValue(); - const insertSpy = jest - .spyOn(service, "insertManyFromSource") - .mockResolvedValue(); - - const oldDoc: MetadataSourceDoc = { + it("net usageCount is zero for unchanged keys", async () => { + const doc = { ...BASE_DOC, metadata: { temperature: { human_name: "Temperature" } }, }; - const newDoc: MetadataSourceDoc = { - ...BASE_DOC, - metadata: { temperature: { human_name: "Temp (renamed)" } }, - }; - await service.replaceManyFromSource(oldDoc, newDoc); + await service.replaceManyFromSource(doc, doc); - expect(deleteSpy).toHaveBeenCalledWith( - expect.objectContaining({ - metadata: { temperature: { human_name: "Temperature" } }, - }), + const allOps = modelMock.bulkWrite.mock.calls.flatMap( + ([ops]: any) => ops, ); - expect(insertSpy).toHaveBeenCalledWith( - expect.objectContaining({ - metadata: { temperature: { human_name: "Temp (renamed)" } }, - }), + const totalUsageCountDelta = allOps.reduce( + (sum: number, op: any) => sum + op.updateOne.update.$inc.usageCount, + 0, ); + expect(totalUsageCountDelta).toBe(0); }); }); }); diff --git a/src/metadata-keys/metadatakeys.service.ts b/src/metadata-keys/metadatakeys.service.ts index bcb1978ab..fedbd0963 100644 --- a/src/metadata-keys/metadatakeys.service.ts +++ b/src/metadata-keys/metadatakeys.service.ts @@ -36,14 +36,13 @@ const RECOMPUTE_USER_GROUPS_STAGE = [ }, }, }, + }, + }, + { + $set: { userGroups: { $map: { - input: { - $filter: { - input: { $objectToArray: "$userGroupCounts" }, - cond: { $gt: ["$$this.v", 0] }, - }, - }, + input: { $objectToArray: "$userGroupCounts" }, as: "entry", in: "$$entry.k", }, @@ -65,10 +64,11 @@ export class MetadataKeysService { ): Promise { const whereFilter: FilterQuery = filter.where ?? {}; const fieldsProjection: string[] = filter.fields ?? {}; - const limits: QueryOptions = filter.limits ?? { - limit: 100, - skip: 0, - sort: { createdAt: "desc" }, + + const limits: QueryOptions = { + limit: filter.limits?.limit ?? 100, + skip: filter.limits?.skip ?? 0, + sort: filter.limits?.sort ?? { createdAt: "desc" }, }; const pipeline: PipelineStage[] = [ @@ -100,157 +100,79 @@ export class MetadataKeysService { return data; } - /** - * Called when a dataset is created or gains new metadata keys. - * - * For each key: - * - Creates a new MetadataKey entry if none exists (upsert) - * - Increments usageCount and per-group reference counts - * - Adds new groups to the userGroups query array - * - Sets isPublished if the source dataset is published (never unsets inline) - * - Updates humanReadableName to the latest value - */ async insertManyFromSource(doc: MetadataSourceDoc): Promise { - if (isEmpty(doc.metadata)) return; - - const { sourceType, userGroups, isPublished, metadata } = doc; - - const ops = Object.entries(metadata).map(([key, entry]) => { - const humanReadableName = - (entry as ScientificMetadataEntry).human_name ?? ""; - - return this.metadataKeyModel.findOneAndUpdate( - { sourceType, key, humanReadableName }, - { - $setOnInsert: addCreatedByFields( - { key, sourceType, humanReadableName }, - "system", - ), - $set: { - ...(isPublished && { isPublished: true }), - }, - $addToSet: { userGroups: { $each: userGroups } }, - $inc: { - usageCount: 1, - ...this.groupCountDeltas(userGroups, 1), - }, - }, - { upsert: true }, - ); - }); - - await Promise.all(ops); + await this.adjustCounts(doc, 1); + } - Logger.log( - `Upserted MetadataKeys for ${sourceType}: ${Object.keys(metadata).join(", ")}`, - ); + async deleteMany(doc: MetadataSourceDoc): Promise { + await this.adjustCounts(doc, -1); } - /** - * Called when a dataset is deleted or loses metadata keys. - * - * Three-step process to ensure groups are safely removed: - * 1. Decrement usageCount and per-group reference counts - * 2. Recompute userGroups array from the updated counts - * (drops any group whose count reached zero) - * 3. Delete entries no longer referenced by any dataset - * + async replaceManyFromSource( + oldDoc: MetadataSourceDoc, + newDoc: MetadataSourceDoc, + ): Promise { + await this.deleteMany(oldDoc); + await this.insertManyFromSource(newDoc); + } - */ - async deleteMany(doc: MetadataSourceDoc): Promise { + private async adjustCounts( + doc: MetadataSourceDoc, + delta: 1 | -1, + ): Promise { if (isEmpty(doc.metadata)) return; - const { sourceType, userGroups, metadata } = doc; - const keys = Object.keys(metadata); - const filters = keys.map((key) => { + const { sourceType, userGroups, isPublished, metadata } = doc; + + const filters = Object.entries(metadata).map(([key, entry]) => { const humanReadableName = - (metadata[key] as ScientificMetadataEntry)?.human_name ?? ""; - return this.buildFilter(sourceType, key, humanReadableName); + (entry as ScientificMetadataEntry).human_name ?? ""; + return { sourceType, key, humanReadableName }; }); + const queryFilter = { $or: filters }; - await this.metadataKeyModel.updateMany(queryFilter, { - $inc: { - usageCount: -1, - ...this.groupCountDeltas(userGroups, -1), + const ops = filters.map(({ sourceType, key, humanReadableName }) => ({ + updateOne: { + filter: { sourceType, key, humanReadableName }, + update: { + $inc: { + usageCount: delta, + ...this.groupCountDeltas(userGroups, delta), + }, + ...(delta === 1 && { + $max: { isPublished }, + $addToSet: { userGroups: { $each: userGroups } }, + $setOnInsert: addCreatedByFields( + { key, sourceType, humanReadableName }, + "system", + ), + }), + }, + upsert: delta === 1, }, - }); + })); - await this.metadataKeyModel.updateMany( - queryFilter, - RECOMPUTE_USER_GROUPS_STAGE, - ); + await this.metadataKeyModel.bulkWrite(ops); - await this.metadataKeyModel.deleteMany({ - $and: [queryFilter, { usageCount: { $lte: 0 } }], - }); + if (delta === -1) { + // UpdateMany is necessary here because the bulkWrite above only decrements userGroupCounts + // but cannot remove zero-count groups from userGroups in the same operation. + await this.metadataKeyModel.updateMany( + queryFilter, + RECOMPUTE_USER_GROUPS_STAGE, + ); + + await this.metadataKeyModel.deleteMany({ + $and: [queryFilter, { usageCount: { $lte: 0 } }], + }); + } Logger.log( - `Decremented or deleted MetadataKeys for ${sourceType}: ${keys.join(", ")}`, + `${delta === 1 ? "Upserted" : "Decremented or deleted"} MetadataKeys for ${sourceType}: ${Object.keys(metadata).join(", ")}`, ); } - /** - * Called when a dataset is updated. - * - * Diffs the old and new metadata to produce three disjoint key sets: - * - added: only in newDoc(after change) → insertManyFromSource - * - removed: only in oldDoc(before change) → deleteMany - * - shared: in both → updateSharedKeys (handles group / isPublished - * / humanReadableName changes) - * - * The three sets are disjoint by _id so Promise.all is safe. - */ - async replaceManyFromSource( - oldDoc: MetadataSourceDoc, - newDoc: MetadataSourceDoc, - ): Promise { - const oldKeys = Object.keys(oldDoc.metadata ?? {}); - const newKeys = Object.keys(newDoc.metadata ?? {}); - - const addedKeys = newKeys.filter((k) => !oldKeys.includes(k)); - const removedKeys = oldKeys.filter((k) => !newKeys.includes(k)); - const sharedKeys = newKeys.filter((k) => oldKeys.includes(k)); - - await Promise.all([ - removedKeys.length > 0 && - this.deleteMany({ - ...oldDoc, - metadata: Object.fromEntries( - removedKeys.map((k) => [k, oldDoc.metadata[k]]), - ), - }), - - addedKeys.length > 0 && - this.insertManyFromSource({ - ...newDoc, - metadata: Object.fromEntries( - addedKeys.map((k) => [k, newDoc.metadata[k]]), - ), - }), - - sharedKeys.length > 0 && - this.updateSharedKeys(sharedKeys, oldDoc, newDoc), - ]); - } - - private buildFilter( - sourceType: string, - key: string, - humanReadableName: string, - ): FilterQuery { - return { sourceType, key, humanReadableName }; - } - /** - * Builds $inc paths for per-group reference counts. - * - * groupCountDeltas(["groupA", "groupB"], 1) - * => { "userGroupCounts.groupA": 1, "userGroupCounts.groupB": 1 } - * - * MongoDB's dot-notation $inc creates the key if missing and increments - * if it already exists, making this safe for both first-time and - * subsequent upserts. - */ private groupCountDeltas( groups: string[], delta: 1 | -1, @@ -259,97 +181,4 @@ export class MetadataKeysService { groups.map((g) => [`userGroupCounts.${g}`, delta]), ); } - - /** - * Handles keys present in both the old and new dataset version. - * - * Checks three things that may have changed independently: - * 1. userGroups — increments added groups, decrements removed groups, - * then recomputes the userGroups array from the updated counts - * 2. isPublished — only sets true inline; false is left to the cronjob - * 3. humanReadableName — updated per-key where it differs - */ - private async updateSharedKeys( - sharedKeys: string[], - oldDoc: MetadataSourceDoc, - newDoc: MetadataSourceDoc, - ): Promise { - const { sourceType } = newDoc; - const humanNameChangedKeys: string[] = []; - const humanNameUnchangedKeys: string[] = []; - - for (const k of sharedKeys) { - const oldHumanName = - (oldDoc.metadata[k] as ScientificMetadataEntry)?.human_name ?? ""; - const newHumanName = - (newDoc.metadata[k] as ScientificMetadataEntry)?.human_name ?? ""; - - if (oldHumanName === newHumanName) { - humanNameUnchangedKeys.push(k); - } else { - humanNameChangedKeys.push(k); - } - } - - const addedGroups = newDoc.userGroups.filter( - (g) => !oldDoc.userGroups.includes(g), - ); - const removedGroups = oldDoc.userGroups.filter( - (g) => !newDoc.userGroups.includes(g), - ); - const publishedFlippedOn = !oldDoc.isPublished && newDoc.isPublished; - const hasGroupChanges = addedGroups.length > 0 || removedGroups.length > 0; - - if ( - humanNameUnchangedKeys.length > 0 && - (hasGroupChanges || publishedFlippedOn) - ) { - const filters = humanNameUnchangedKeys.map((k) => { - const humanReadableName = - (newDoc.metadata[k] as ScientificMetadataEntry)?.human_name ?? ""; - return this.buildFilter(sourceType, k, humanReadableName); - }); - - const queryFilter = { $or: filters }; - - await this.metadataKeyModel.updateMany(queryFilter, { - ...(addedGroups.length > 0 && { - $addToSet: { userGroups: { $each: addedGroups } }, - }), - $inc: { - ...(addedGroups.length > 0 && this.groupCountDeltas(addedGroups, 1)), - ...(removedGroups.length > 0 && - this.groupCountDeltas(removedGroups, -1)), - }, - ...(publishedFlippedOn && { $set: { isPublished: true } }), - }); - - // Recompute userGroups to drop groups whose count reached zero - if (removedGroups.length > 0) { - await this.metadataKeyModel.updateMany( - queryFilter, - RECOMPUTE_USER_GROUPS_STAGE, - ); - } - } - - // humanReadableName is part of _id, so a change means a different document. - // Treat it as delete the old entry + insert the new one. - if (humanNameChangedKeys.length > 0) { - await Promise.all([ - this.deleteMany({ - ...oldDoc, - metadata: Object.fromEntries( - humanNameChangedKeys.map((k) => [k, oldDoc.metadata[k]]), - ), - }), - this.insertManyFromSource({ - ...newDoc, - metadata: Object.fromEntries( - humanNameChangedKeys.map((k) => [k, newDoc.metadata[k]]), - ), - }), - ]); - } - } } From 665f3925fecc934c43f671b9960b78a6025e5d48 Mon Sep 17 00:00:00 2001 From: junjiequan Date: Mon, 11 May 2026 14:05:59 +0200 Subject: [PATCH 26/29] set createdBy, updatedBy on insert and updatedAt on every upsert in MetadataKeys --- src/metadata-keys/metadatakeys.service.spec.ts | 12 ------------ src/metadata-keys/metadatakeys.service.ts | 8 ++++---- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/src/metadata-keys/metadatakeys.service.spec.ts b/src/metadata-keys/metadatakeys.service.spec.ts index fe959638e..812be350a 100644 --- a/src/metadata-keys/metadatakeys.service.spec.ts +++ b/src/metadata-keys/metadatakeys.service.spec.ts @@ -225,18 +225,6 @@ describe("MetadataKeysService", () => { expect(update.$max?.isPublished).toBeFalsy(); }); - it("uses $setOnInsert for immutable fields", async () => { - await service.insertManyFromSource(BASE_DOC); - - const [ops] = modelMock.bulkWrite.mock.calls[0]; - const { update } = ops[0].updateOne; - expect(update.$setOnInsert).toMatchObject({ - key: expect.any(String), - sourceType: "Dataset", - humanReadableName: expect.any(String), - }); - }); - it("sets upsert: true", async () => { await service.insertManyFromSource(BASE_DOC); diff --git a/src/metadata-keys/metadatakeys.service.ts b/src/metadata-keys/metadatakeys.service.ts index fedbd0963..ce2da4f5d 100644 --- a/src/metadata-keys/metadatakeys.service.ts +++ b/src/metadata-keys/metadatakeys.service.ts @@ -136,6 +136,9 @@ export class MetadataKeysService { updateOne: { filter: { sourceType, key, humanReadableName }, update: { + $set: { + updatedAt: new Date(), + }, $inc: { usageCount: delta, ...this.groupCountDeltas(userGroups, delta), @@ -143,10 +146,7 @@ export class MetadataKeysService { ...(delta === 1 && { $max: { isPublished }, $addToSet: { userGroups: { $each: userGroups } }, - $setOnInsert: addCreatedByFields( - { key, sourceType, humanReadableName }, - "system", - ), + $setOnInsert: addCreatedByFields({}, "system"), }), }, upsert: delta === 1, From fb5dfb07e7792c9e978d11b180d96997541631f2 Mon Sep 17 00:00:00 2001 From: junjiequan Date: Mon, 11 May 2026 14:14:09 +0200 Subject: [PATCH 27/29] fix minor typescript import errors --- src/datasets/datasets.service.ts | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/datasets/datasets.service.ts b/src/datasets/datasets.service.ts index 1603eac94..48f75d2f4 100644 --- a/src/datasets/datasets.service.ts +++ b/src/datasets/datasets.service.ts @@ -67,13 +67,14 @@ import { MetadataSourceDoc, } from "src/metadata-keys/metadatakeys.service"; import { OpensearchService } from "src/opensearch/opensearch.service"; -import type { IndexSettings } from "@opensearch-project/opensearch/api/_types/indices._common"; -import type { TypeMapping } from "@opensearch-project/opensearch/api/_types/_common.mapping"; -import { BulkStats } from "@opensearch-project/opensearch/lib/Helpers"; + import { DatasetOpenSearchDto } from "src/opensearch/dto/dataset-opensearch.dto"; import { plainToInstance } from "class-transformer"; import { DATASET_OPENSEARCH_PROJECTION } from "../opensearch/utils/dataset-opensearch.utils"; import { withOCCFilter } from "./utils/occ-util"; +import { BulkStats } from "@opensearch-project/opensearch/lib/Helpers.js"; +import { IndexSettings } from "@opensearch-project/opensearch/api/_types/indices._common.js"; +import { TypeMapping } from "@opensearch-project/opensearch/api/_types/_common.mapping.js"; @Injectable({ scope: Scope.REQUEST }) export class DatasetsService { From 6ff971d3e64d6de51bf4ff7c92b5f7af4322f567 Mon Sep 17 00:00:00 2001 From: junjiequan Date: Mon, 11 May 2026 14:14:54 +0200 Subject: [PATCH 28/29] reorder imports --- src/datasets/datasets.service.ts | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/datasets/datasets.service.ts b/src/datasets/datasets.service.ts index 48f75d2f4..bd2a32204 100644 --- a/src/datasets/datasets.service.ts +++ b/src/datasets/datasets.service.ts @@ -67,15 +67,13 @@ import { MetadataSourceDoc, } from "src/metadata-keys/metadatakeys.service"; import { OpensearchService } from "src/opensearch/opensearch.service"; - +import { BulkStats } from "@opensearch-project/opensearch/lib/Helpers.js"; +import { IndexSettings } from "@opensearch-project/opensearch/api/_types/indices._common.js"; +import { TypeMapping } from "@opensearch-project/opensearch/api/_types/_common.mapping.js"; import { DatasetOpenSearchDto } from "src/opensearch/dto/dataset-opensearch.dto"; import { plainToInstance } from "class-transformer"; import { DATASET_OPENSEARCH_PROJECTION } from "../opensearch/utils/dataset-opensearch.utils"; import { withOCCFilter } from "./utils/occ-util"; -import { BulkStats } from "@opensearch-project/opensearch/lib/Helpers.js"; -import { IndexSettings } from "@opensearch-project/opensearch/api/_types/indices._common.js"; -import { TypeMapping } from "@opensearch-project/opensearch/api/_types/_common.mapping.js"; - @Injectable({ scope: Scope.REQUEST }) export class DatasetsService { private readonly osDefaultIndex: string; From 1d4f99ad2f7448f0f0eddcf0df42b0fc3570ef46 Mon Sep 17 00:00:00 2001 From: junjiequan Date: Mon, 11 May 2026 15:07:18 +0200 Subject: [PATCH 29/29] add shared createMetadataKeysInstance function to reduce duplication --- src/common/utils.ts | 32 +++++++++++++++++ src/datasets/datasets.service.ts | 50 +++++++++++++------------- src/instruments/instruments.service.ts | 43 ++++++++++------------ src/proposals/proposals.service.ts | 48 ++++++++++--------------- src/samples/samples.service.ts | 39 +++++++++----------- 5 files changed, 112 insertions(+), 100 deletions(-) diff --git a/src/common/utils.ts b/src/common/utils.ts index 8c6b67ea0..90eb96f75 100644 --- a/src/common/utils.ts +++ b/src/common/utils.ts @@ -12,6 +12,7 @@ import { import { ScientificRelation } from "./scientific-relation.enum"; import { DatasetType } from "src/datasets/types/dataset-type.enum"; import { isPlainObject, mapValues, omit, pickBy, some } from "lodash"; +import { MetadataSourceDoc } from "src/metadata-keys/metadatakeys.service"; // add Å to mathjs accepted units as equivalent to angstrom const isAlphaOriginal = Unit.isValidAlpha; @@ -1343,3 +1344,34 @@ export function parseDate(dateString?: string): Date | undefined { const parsedDate = new Date(dateString); return isNaN(parsedDate.getTime()) ? undefined : parsedDate; } + +export function createMetadataKeysInstance( + sourceType: string, + doc: { + ownerGroup?: string; + accessGroups?: string[]; + isPublished?: boolean; + scientificMetadata?: Record; + metadata?: Record; + customMetadata?: Record; + sampleCharacteristics?: Record; + }, +): MetadataSourceDoc { + return { + sourceType, + userGroups: Array.from( + new Set( + [doc.ownerGroup, ...(doc.accessGroups ?? [])].filter( + Boolean, + ) as string[], + ), + ), + isPublished: doc.isPublished ?? false, + metadata: + doc.scientificMetadata ?? + doc.metadata ?? + doc.customMetadata ?? + doc.sampleCharacteristics ?? + {}, + }; +} diff --git a/src/datasets/datasets.service.ts b/src/datasets/datasets.service.ts index bd2a32204..2b44783e4 100644 --- a/src/datasets/datasets.service.ts +++ b/src/datasets/datasets.service.ts @@ -36,6 +36,7 @@ import { parsePipelineProjection, parsePipelineSort, decodeMetadataKeyStrings, + createMetadataKeysInstance, } from "src/common/utils"; import { DatasetsAccessService } from "./datasets-access.service"; import { CreateDatasetDto } from "./dto/create-dataset.dto"; @@ -62,10 +63,7 @@ import { DatasetLookupKeysEnum, } from "./types/dataset-lookup"; import { ProposalsService } from "src/proposals/proposals.service"; -import { - MetadataKeysService, - MetadataSourceDoc, -} from "src/metadata-keys/metadatakeys.service"; +import { MetadataKeysService } from "src/metadata-keys/metadatakeys.service"; import { OpensearchService } from "src/opensearch/opensearch.service"; import { BulkStats } from "@opensearch-project/opensearch/lib/Helpers.js"; import { IndexSettings } from "@opensearch-project/opensearch/api/_types/indices._common.js"; @@ -99,20 +97,6 @@ export class DatasetsService { this.configService.get("opensearch.dataSyncBatchSize") || 1000; } - private createMetadataKeysInstance( - doc: UpdateQuery, - ): MetadataSourceDoc { - const source: MetadataSourceDoc = { - sourceType: this.datasetModel.collection.name, - userGroups: Array.from( - new Set([doc.ownerGroup, ...(doc.accessGroups ?? [])].filter(Boolean)), - ), - isPublished: doc.isPublished || false, - metadata: doc.scientificMetadata ?? {}, - }; - return source; - } - addLookupFields( pipeline: PipelineStage[], datasetLookupFields?: (DatasetLookupKeysEnum | IDatasetRelation)[], @@ -217,7 +201,10 @@ export class DatasetsService { } this.metadataKeysService.insertManyFromSource( - this.createMetadataKeysInstance(savedDataset), + createMetadataKeysInstance( + this.datasetModel.collection.name, + savedDataset, + ), ); return savedDataset; @@ -502,8 +489,14 @@ export class DatasetsService { } await this.metadataKeysService.replaceManyFromSource( - this.createMetadataKeysInstance(existingDataset), - this.createMetadataKeysInstance(updatedDataset), + createMetadataKeysInstance( + this.datasetModel.collection.name, + existingDataset, + ), + createMetadataKeysInstance( + this.datasetModel.collection.name, + updatedDataset, + ), ); // we were able to find the dataset and update it return updatedDataset; @@ -560,8 +553,14 @@ export class DatasetsService { } await this.metadataKeysService.replaceManyFromSource( - this.createMetadataKeysInstance(existingDataset), - this.createMetadataKeysInstance(patchedDataset), + createMetadataKeysInstance( + this.datasetModel.collection.name, + existingDataset, + ), + createMetadataKeysInstance( + this.datasetModel.collection.name, + patchedDataset, + ), ); // we were able to find the dataset and update it return patchedDataset; @@ -591,7 +590,10 @@ export class DatasetsService { // delete metadata keys associated with this dataset await this.metadataKeysService.deleteMany( - this.createMetadataKeysInstance(deletedDataset), + createMetadataKeysInstance( + this.datasetModel.collection.name, + deletedDataset, + ), ); return deletedDataset; diff --git a/src/instruments/instruments.service.ts b/src/instruments/instruments.service.ts index c6568c52b..8b410e9cb 100644 --- a/src/instruments/instruments.service.ts +++ b/src/instruments/instruments.service.ts @@ -6,13 +6,14 @@ import { PreconditionFailedException, } from "@nestjs/common"; import { InjectModel } from "@nestjs/mongoose"; -import { FilterQuery, Model, UpdateQuery } from "mongoose"; +import { FilterQuery, Model } from "mongoose"; import { IFilters } from "src/common/interfaces/common.interface"; import { CountApiResponse } from "src/common/types"; import { parseLimitFilters, addCreatedByFields, addUpdatedByField, + createMetadataKeysInstance, } from "src/common/utils"; import { CreateInstrumentDto } from "./dto/create-instrument.dto"; import { PartialUpdateInstrumentDto } from "./dto/update-instrument.dto"; @@ -20,10 +21,7 @@ import { Instrument, InstrumentDocument } from "./schemas/instrument.schema"; import { JWTUser } from "src/auth/interfaces/jwt-user.interface"; import { REQUEST } from "@nestjs/core"; import { Request } from "express"; -import { - MetadataKeysService, - MetadataSourceDoc, -} from "src/metadata-keys/metadatakeys.service"; +import { MetadataKeysService } from "src/metadata-keys/metadatakeys.service"; import { withOCCFilter } from "src/datasets/utils/occ-util"; @Injectable({ scope: Scope.REQUEST }) @@ -35,29 +33,17 @@ export class InstrumentsService { @Inject(REQUEST) private request: Request, ) {} - private createMetadataKeysInstance( - doc: UpdateQuery, - ): MetadataSourceDoc { - const source: MetadataSourceDoc = { - sourceType: this.instrumentModel.collection.name, - userGroups: Array.from( - new Set([doc.ownerGroup, ...(doc.accessGroups ?? [])].filter(Boolean)), - ), - isPublished: doc.isPublished || false, - metadata: doc.customMetadata ?? {}, - }; - return source; - } - async create(createInstrumentDto: CreateInstrumentDto): Promise { const username = (this.request.user as JWTUser).username; const createdInstrument = new this.instrumentModel( addCreatedByFields(createInstrumentDto, username), ); const savedInstrument = await createdInstrument.save(); - await this.metadataKeysService.insertManyFromSource( - this.createMetadataKeysInstance(savedInstrument), + createMetadataKeysInstance(this.instrumentModel.collection.name, { + ...savedInstrument.toObject(), + isPublished: true, + }), ); return savedInstrument; @@ -141,8 +127,14 @@ export class InstrumentsService { } await this.metadataKeysService.replaceManyFromSource( - this.createMetadataKeysInstance(existingInstrument), - this.createMetadataKeysInstance(updatedInstrument), + createMetadataKeysInstance(this.instrumentModel.collection.name, { + ...existingInstrument.toObject(), + isPublished: true, + }), + createMetadataKeysInstance(this.instrumentModel.collection.name, { + ...updatedInstrument.toObject(), + isPublished: true, + }), ); return updatedInstrument; @@ -160,7 +152,10 @@ export class InstrumentsService { } await this.metadataKeysService.deleteMany( - this.createMetadataKeysInstance(deletedInstrument), + createMetadataKeysInstance( + this.instrumentModel.collection.name, + deletedInstrument, + ), ); return deletedInstrument; diff --git a/src/proposals/proposals.service.ts b/src/proposals/proposals.service.ts index ba55d30b9..6dcdf02bd 100644 --- a/src/proposals/proposals.service.ts +++ b/src/proposals/proposals.service.ts @@ -9,13 +9,7 @@ import { import { REQUEST } from "@nestjs/core"; import { Request } from "express"; import { InjectModel } from "@nestjs/mongoose"; -import { - FilterQuery, - Model, - PipelineStage, - QueryOptions, - UpdateQuery, -} from "mongoose"; +import { FilterQuery, Model, PipelineStage, QueryOptions } from "mongoose"; import { IFacets, IFilters } from "src/common/interfaces/common.interface"; import { createFullfacetPipeline, @@ -26,6 +20,7 @@ import { parsePipelineSort, addCreatedByFields, addUpdatedByField, + createMetadataKeysInstance, } from "src/common/utils"; import { isEmpty } from "lodash"; import { @@ -43,10 +38,7 @@ import { IProposalFields } from "./interfaces/proposal-filters.interface"; import { ProposalClass, ProposalDocument } from "./schemas/proposal.schema"; import { JWTUser } from "src/auth/interfaces/jwt-user.interface"; import { CreateMeasurementPeriodDto } from "./dto/create-measurement-period.dto"; -import { - MetadataKeysService, - MetadataSourceDoc, -} from "src/metadata-keys/metadatakeys.service"; +import { MetadataKeysService } from "src/metadata-keys/metadatakeys.service"; import { withOCCFilter } from "src/datasets/utils/occ-util"; @Injectable({ scope: Scope.REQUEST }) @@ -177,20 +169,6 @@ export class ProposalsService { } } - private createMetadataKeysInstance( - doc: UpdateQuery, - ): MetadataSourceDoc { - const source: MetadataSourceDoc = { - sourceType: this.proposalModel.collection.name, - userGroups: Array.from( - new Set([doc.ownerGroup, ...(doc.accessGroups ?? [])].filter(Boolean)), - ), - isPublished: doc.isPublished || false, - metadata: doc.metadata ?? {}, - }; - return source; - } - async create(createProposalDto: CreateProposalDto): Promise { const username = (this.request.user as JWTUser).username; if (createProposalDto.MeasurementPeriodList) { @@ -208,7 +186,10 @@ export class ProposalsService { const savedProposal = await createdProposal.save(); this.metadataKeysService.insertManyFromSource( - this.createMetadataKeysInstance(savedProposal), + createMetadataKeysInstance( + this.proposalModel.collection.name, + savedProposal, + ), ); return savedProposal; } @@ -326,8 +307,14 @@ export class ProposalsService { } await this.metadataKeysService.replaceManyFromSource( - this.createMetadataKeysInstance(existingProposal), - this.createMetadataKeysInstance(updatedProposal), + createMetadataKeysInstance( + this.proposalModel.collection.name, + existingProposal, + ), + createMetadataKeysInstance( + this.proposalModel.collection.name, + updatedProposal, + ), ); return updatedProposal; @@ -345,7 +332,10 @@ export class ProposalsService { } await this.metadataKeysService.deleteMany( - this.createMetadataKeysInstance(deletedProposal), + createMetadataKeysInstance( + this.proposalModel.collection.name, + deletedProposal, + ), ); return deletedProposal; diff --git a/src/samples/samples.service.ts b/src/samples/samples.service.ts index 9f3771431..4bc6d6580 100644 --- a/src/samples/samples.service.ts +++ b/src/samples/samples.service.ts @@ -9,7 +9,7 @@ import { ConfigService } from "@nestjs/config"; import { REQUEST } from "@nestjs/core"; import { Request } from "express"; import { InjectModel } from "@nestjs/mongoose"; -import { FilterQuery, Model, QueryOptions, UpdateQuery } from "mongoose"; +import { FilterQuery, Model, QueryOptions } from "mongoose"; import { JWTUser } from "src/auth/interfaces/jwt-user.interface"; import { IFilters } from "src/common/interfaces/common.interface"; import { @@ -19,6 +19,7 @@ import { extractMetadataKeys, parseLimitFilters, decodeMetadataKeyStrings, + createMetadataKeysInstance, } from "src/common/utils"; import { CreateSampleDto } from "./dto/create-sample.dto"; import { PartialUpdateSampleDto } from "./dto/update-sample.dto"; @@ -26,10 +27,7 @@ import { ISampleFields } from "./interfaces/sample-filters.interface"; import { SampleClass, SampleDocument } from "./schemas/sample.schema"; import { CountApiResponse } from "src/common/types"; import { OutputSampleDto } from "./dto/output-sample.dto"; -import { - MetadataKeysService, - MetadataSourceDoc, -} from "src/metadata-keys/metadatakeys.service"; +import { MetadataKeysService } from "src/metadata-keys/metadatakeys.service"; import { withOCCFilter } from "src/datasets/utils/occ-util"; @Injectable({ scope: Scope.REQUEST }) @@ -41,20 +39,6 @@ export class SamplesService { @Inject(REQUEST) private request: Request, ) {} - private createMetadataKeysInstance( - doc: UpdateQuery, - ): MetadataSourceDoc { - const source: MetadataSourceDoc = { - sourceType: this.sampleModel.collection.name, - userGroups: Array.from( - new Set([doc.ownerGroup, ...(doc.accessGroups ?? [])].filter(Boolean)), - ), - isPublished: doc.isPublished || false, - metadata: doc.sampleCharacteristics ?? {}, - }; - return source; - } - async create(createSampleDto: CreateSampleDto): Promise { const username = (this.request.user as JWTUser).username; const createdSample = new this.sampleModel( @@ -63,7 +47,7 @@ export class SamplesService { const savedSample = await createdSample.save(); this.metadataKeysService.insertManyFromSource( - this.createMetadataKeysInstance(savedSample), + createMetadataKeysInstance(this.sampleModel.collection.name, savedSample), ); return savedSample; @@ -223,8 +207,14 @@ export class SamplesService { } await this.metadataKeysService.replaceManyFromSource( - this.createMetadataKeysInstance(existingSample), - this.createMetadataKeysInstance(updatedSample), + createMetadataKeysInstance( + this.sampleModel.collection.name, + existingSample, + ), + createMetadataKeysInstance( + this.sampleModel.collection.name, + updatedSample, + ), ); return updatedSample; @@ -242,7 +232,10 @@ export class SamplesService { } this.metadataKeysService.deleteMany( - this.createMetadataKeysInstance(deletedSample), + createMetadataKeysInstance( + this.sampleModel.collection.name, + deletedSample, + ), ); return deletedSample; }