From 3fb5fbd3604870b3644c553a7d95f73dc11e4fca Mon Sep 17 00:00:00 2001 From: Chris Bizon Date: Tue, 6 Sep 2022 09:57:07 -0400 Subject: [PATCH 1/7] node normalization --- docs/normalize_nodes.md | 29 +++ .../messages/01_premerged_message.json | 170 ++++++++++++ .../messages/02_postmerged_message.json | 245 ++++++++++++++++++ operations/normalize_nodes.yml | 15 ++ requirements.txt | 2 + 5 files changed, 461 insertions(+) create mode 100644 docs/normalize_nodes.md create mode 100644 examples/normalize_nodes/messages/01_premerged_message.json create mode 100644 examples/normalize_nodes/messages/02_postmerged_message.json create mode 100644 operations/normalize_nodes.yml diff --git a/docs/normalize_nodes.md b/docs/normalize_nodes.md new file mode 100644 index 0000000..a818e2f --- /dev/null +++ b/docs/normalize_nodes.md @@ -0,0 +1,29 @@ +# normalize nodes + +This operation updates the identifiers on qgraph and kgraph nodes to their preferred identifiers, and adds equivalent identifiers in a property for knodes. When two kgraph nodes normalize to the same preferred identifier, the two knodes are merged. The new node contain the union of the properties of the two original nodes. All edges attached to either of the two original nodes are now subsequently attached to the new merged knode. Qnodes are not merged, so that the structure of the query can be preserved. The updates to kgraph node identifiers also necessitates the updating of result node bindings. 
+ +### examples + +- [input](../examples/normalize_nodes/messages/01_premerged_message.json), [output](../examples/normalize_nodes/messages/02_postmerged_message.json) + +### input requirements + +None + +### output guarantees + +None + +### allowed changes + +- modify qnodes +- modify knodes +- remove knodes +- modify kedges +- modify node bindings + +### parameters + +```yaml +[] +``` diff --git a/examples/normalize_nodes/messages/01_premerged_message.json b/examples/normalize_nodes/messages/01_premerged_message.json new file mode 100644 index 0000000..83398dc --- /dev/null +++ b/examples/normalize_nodes/messages/01_premerged_message.json @@ -0,0 +1,170 @@ +{ + "message": { + "query_graph": { + "nodes": { + "n1": { + "ids": ["HGNC:11603"], + "categories": [ + "biolink:Gene" + ] + }, + "n2": { + "ids": ["NCBIGene:9496"], + "categories": [ + "biolink:Gene" + ] + }, + "n3": { + "ids": ["MONDO:0005002"], + "categories": [ + "biolink:Disease" + ] + }, + "n4": { + "ids": ["DOID:3083"], + "categories": [ + "biolink:Disease" + ] + }, + "n5": { + "categories": [ + "biolink:Disease" + ] + } + }, + "edges": { + "e1": { + "subject": "n1", + "object": "n3" + }, + "e2": { + "subject": "n2", + "object": "n4", + "predicates": ["biolink:related_to"] + }, + "e3": { + "subject": "n1", + "object": "n5" + } + } + }, + "knowledge_graph": { + "nodes": { + "HGNC:11603": { + "name": "TBX4", + "categories": [ + "biolink:Gene" + ] + }, + "NCBIGene:9496": { + "name": "T-box transcription factor 4", + "categories": [ + "biolink:Gene" + ] + }, + "MONDO:0005002": { + "name": "chronic obstructive pulmonary disease", + "categories": [ + "biolink:Disease" + ] + }, + "DOID:3083": { + "name": "chronic obstructive pulmonary disease", + "categories": [ + "biolink:Disease" + ] + }, + "UMLS:CN202575": { + "name": "heritable pulmonary arterial hypertension", + "categories": [ + "biolink:Disease" + ] + } + }, + "edges": { + "a8575c4e-61a6-428a-bf09-fcb3e8d1644d": { + "subject": "HGNC:11603", + "object": 
"MONDO:0005002", + "predicate": "biolink:related_to" + }, + "2d38345a-e9bf-4943-accb-dccba351dd04": { + "subject": "NCBIGene:9496", + "object": "DOID:3083", + "predicate": "biolink:related_to" + }, + "044a7916-fba9-4b4f-ae48-f0815b0b222d": { + "subject": "HGNC:11603", + "object": "UMLS:CN202575", + "predicate": "biolink:related_to" + } + } + }, + "results": [ + { + "node_bindings": { + "n1": [ + { + "id": "HGNC:11603" + } + ], + "n3": [ + { + "id": "MONDO:0005002" + } + ] + }, + "edge_bindings": { + "e1": [ + { + "id": "a8575c4e-61a6-428a-bf09-fcb3e8d1644d" + } + ] + } + }, + { + "node_bindings": { + "n2": [ + { + "id": "NCBIGene:9496" + } + ], + "n4": [ + { + "id": "DOID:3083" + } + ] + }, + "edge_bindings": { + "e2": [ + { + "id": "2d38345a-e9bf-4943-accb-dccba351dd04" + } + ] + } + }, + { + "node_bindings": { + "n1": [ + { + "id": "HGNC:11603" + } + ], + "n5": [ + { + "id": "UMLS:CN202575" + } + ] + }, + "edge_bindings": { + "e3": [ + { + "id": "044a7916-fba9-4b4f-ae48-f0815b0b222d" + } + ] + } + } + ] + }, + "logs": null, + "status": null +} diff --git a/examples/normalize_nodes/messages/02_postmerged_message.json b/examples/normalize_nodes/messages/02_postmerged_message.json new file mode 100644 index 0000000..caa340a --- /dev/null +++ b/examples/normalize_nodes/messages/02_postmerged_message.json @@ -0,0 +1,245 @@ +{ "workflow": null, + "message": { + "query_graph": { + "nodes": { + "n1": { + "ids": ["NCBIGene:9496"], + "categories": [ + "biolink:Gene" + ], + "is_set": false, + "constraints": [] + }, + "n2": { + "ids": ["NCBIGene:9496"], + "categories": [ + "biolink:Gene" + ], + "is_set": false, + "constraints": [] + }, + "n3": { + "ids": ["MONDO:0005002"], + "categories": [ + "biolink:Disease" + ], + "is_set": false, + "constraints": [] + }, + "n4": { + "ids": ["MONDO:0005002"], + "categories": [ + "biolink:Disease" + ], + "is_set": false, + "constraints": [] + }, + "n5": { + "categories": [ + "biolink:Disease" + ], + "ids": null, + "is_set": false, + 
"constraints": [] + } + }, + "edges": { + "e1": { + "subject": "n1", + "object": "n3", + "predicates": null, + "constraints": [] + }, + "e2": { + "subject": "n2", + "object": "n4", + "predicates": ["biolink:related_to"], + "constraints": [] + }, + "e3": { + "subject": "n1", + "object": "n5", + "predicates": null, + "constraints": [] + } + } + }, + "knowledge_graph": { + "nodes": { + "NCBIGene:9496": { + "attributes": [ + { + "attribute_type_id": "biolink:same_as", + "attribute_source": null, + "original_attribute_name": "equivalent_identifiers", + "value_url": null, + "value": [ + "NCBIGene:9496", + "ENSEMBL:ENSG00000121075", + "HGNC:11603", + "UniProtKB:P57082" + ], + "description": null, + "value_type_id": "EDAM:data_0006", + "attributes": null + } + ], + "categories": [ + "biolink:Gene", + "biolink:NamedThing", + "biolink:BiologicalEntity", + "biolink:MolecularEntity", + "biolink:GenomicEntity", + "biolink:MacromolecularMachine", + "biolink:GeneOrGeneProduct" + ], + "name": "TBX4" + }, + "MONDO:0005002": { + "attributes": [ + { + "attribute_type_id": "biolink:same_as", + "attribute_source": null, + "original_attribute_name": "equivalent_identifiers", + "value_url": null, + "value": [ + "MONDO:0005002", + "DOID:3083", + "EFO:0000341", + "UMLS:C0024117", + "MESH:D029424", + "NCIT:C3199", + "SNOMEDCT:13645005", + "HP:0006510" + ], + "description": null, + "value_type_id": "EDAM:data_0006", + "attributes": null + } + ], + "categories": [ + "biolink:Disease", + "biolink:NamedThing", + "biolink:BiologicalEntity", + "biolink:DiseaseOrPhenotypicFeature" + ], + "name": "chronic obstructive pulmonary disease" + }, + "MONDO:0017148": { + "attributes": [ + { + "attribute_type_id": "biolink:same_as", + "attribute_source": null, + "original_attribute_name": "equivalent_identifiers", + "value_url": null, + "value": [ + "MONDO:0017148", + "ORPHANET:275777", + "UMLS:CN202575", + "NCIT:C121945", + "SNOMEDCT:697897003" + ], + "description": null, + "value_type_id": 
"EDAM:data_0006", + "attributes": null + } + ], + "categories": [ + "biolink:Disease", + "biolink:DiseaseOrPhenotypicFeature", + "biolink:BiologicalEntity", + "biolink:NamedThing" + ], + "name": "heritable pulmonary arterial hypertension" + } + }, + "edges": { + "044a7916-fba9-4b4f-ae48-f0815b0b222d": { + "subject": "NCBIGene:9496", + "object": "MONDO:0017148", + "predicate": "biolink:related_to", + "attributes": null + }, + "a8575c4e-61a6-428a-bf09-fcb3e8d1644d": { + "subject": "NCBIGene:9496", + "object": "MONDO:0005002", + "predicate": "biolink:related_to", + "attributes": null + } + } + }, + "results": [ + { + "node_bindings": { + "n1": [ + { + "id": "NCBIGene:9496" + } + ], + "n3": [ + { + "id": "MONDO:0005002" + } + ] + }, + "edge_bindings": { + "e1": [ + { + "id": "a8575c4e-61a6-428a-bf09-fcb3e8d1644d", + "attributes": null + } + ] + }, + "score": null + }, + { + "node_bindings": { + "n2": [ + { + "id": "NCBIGene:9496" + } + ], + "n4": [ + { + "id": "MONDO:0005002" + } + ] + }, + "edge_bindings": { + "e2": [ + { + "id": "a8575c4e-61a6-428a-bf09-fcb3e8d1644d", + "attributes": null + } + ] + }, + "score": null + }, + { + "node_bindings": { + "n1": [ + { + "id": "NCBIGene:9496" + } + ], + "n5": [ + { + "id": "MONDO:0017148" + } + ] + }, + "edge_bindings": { + "e3": [ + { + "id": "044a7916-fba9-4b4f-ae48-f0815b0b222d", + "attributes": null + } + ] + }, + "score": null + } + ] + }, + "logs": null, + "status": null +} diff --git a/operations/normalize_nodes.yml b/operations/normalize_nodes.yml new file mode 100644 index 0000000..9f788e8 --- /dev/null +++ b/operations/normalize_nodes.yml @@ -0,0 +1,15 @@ +id: normalize_nodes +name: normalize nodes +description: This operation updates the identifiers on qgraph and kgraph nodes to their preferred identifiers, and adds equivalent identifiers in a property for knodes. When two kgraph nodes normalize to the same preferred identifier, the two knodes are merged. 
The new node contain the union of the properties of the two original nodes. All edges attached to either of the two original nodes are now subsequently attached to the new merged knode. Qnodes are not merged, so that the structure of the query can be preserved. The updates to kgraph node identifiers also necessitates the updating of result node bindings. +examples: +- input: normalize_nodes/messages/01_premerged_message.json + output: normalize_nodes/messages/02_postmerged_message.json +input_requirements: [] +output_guarantees: [] +allowed_changes: +- modify qnodes +- modify knodes +- remove knodes +- modify kedges +- modify node bindings +parameters: [] diff --git a/requirements.txt b/requirements.txt index 6b02114..fa8e814 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ +#jinja installs markupsafe w/o a version, new versions fail. +MarkupSafe<2.1.0 jinja2==2.11.3 jsonschema==3.2.0 pytest==6.2.3 From 9d08ec6bb2ffe4f86fab6fb896bd146c6d074c66 Mon Sep 17 00:00:00 2001 From: Chris Bizon Date: Tue, 6 Sep 2022 09:58:05 -0400 Subject: [PATCH 2/7] index --- docs/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/index.md b/docs/index.md index 5f9cac4..ae7ce88 100644 --- a/docs/index.md +++ b/docs/index.md @@ -19,6 +19,7 @@ - [filter_results_top_n](./filter_results_top_n.md) - [lookup](./lookup.md) - [lookup_and_score](./lookup_and_score.md) +- [normalize_nodes](./normalize_nodes.md) - [overlay](./overlay.md) - [overlay_compute_jaccard](./overlay_compute_jaccard.md) - [overlay_compute_ngd](./overlay_compute_ngd.md) From 5d3f4bf5168e247bbc61abf7bdb976f4c15a4c22 Mon Sep 17 00:00:00 2001 From: Chris Bizon Date: Thu, 8 Sep 2022 12:14:45 -0400 Subject: [PATCH 3/7] updated PR --- ...1_premerged_message.json => 01_prenormalized_message.json} | 0 ...postmerged_message.json => 02_postnormalized_message.json} | 0 operations/normalize_nodes.yml | 4 ++-- 3 files changed, 2 insertions(+), 2 deletions(-) rename 
examples/normalize_nodes/messages/{01_premerged_message.json => 01_prenormalized_message.json} (100%) rename examples/normalize_nodes/messages/{02_postmerged_message.json => 02_postnormalized_message.json} (100%) diff --git a/examples/normalize_nodes/messages/01_premerged_message.json b/examples/normalize_nodes/messages/01_prenormalized_message.json similarity index 100% rename from examples/normalize_nodes/messages/01_premerged_message.json rename to examples/normalize_nodes/messages/01_prenormalized_message.json diff --git a/examples/normalize_nodes/messages/02_postmerged_message.json b/examples/normalize_nodes/messages/02_postnormalized_message.json similarity index 100% rename from examples/normalize_nodes/messages/02_postmerged_message.json rename to examples/normalize_nodes/messages/02_postnormalized_message.json diff --git a/operations/normalize_nodes.yml b/operations/normalize_nodes.yml index 9f788e8..9567673 100644 --- a/operations/normalize_nodes.yml +++ b/operations/normalize_nodes.yml @@ -2,8 +2,8 @@ id: normalize_nodes name: normalize nodes description: This operation updates the identifiers on qgraph and kgraph nodes to their preferred identifiers, and adds equivalent identifiers in a property for knodes. When two kgraph nodes normalize to the same preferred identifier, the two knodes are merged. The new node contain the union of the properties of the two original nodes. All edges attached to either of the two original nodes are now subsequently attached to the new merged knode. Qnodes are not merged, so that the structure of the query can be preserved. The updates to kgraph node identifiers also necessitates the updating of result node bindings. 
examples: -- input: normalize_nodes/messages/01_premerged_message.json - output: normalize_nodes/messages/02_postmerged_message.json +- input: normalize_nodes/messages/01_prenormalized_message.json +- output: normalize_nodes/messages/02_postnormalized_message.json input_requirements: [] output_guarantees: [] allowed_changes: From ef3a17c5ccb79951755d2312bbd8ae1034718200 Mon Sep 17 00:00:00 2001 From: Chris Bizon Date: Thu, 8 Sep 2022 12:15:41 -0400 Subject: [PATCH 4/7] regenerate docs --- docs/normalize_nodes.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/normalize_nodes.md b/docs/normalize_nodes.md index a818e2f..3b3bc61 100644 --- a/docs/normalize_nodes.md +++ b/docs/normalize_nodes.md @@ -4,7 +4,8 @@ This operation updates the identifiers on qgraph and kgraph nodes to their prefe ### examples -- [input](../examples/normalize_nodes/messages/01_premerged_message.json), [output](../examples/normalize_nodes/messages/02_postmerged_message.json) +- [input](../examples/normalize_nodes/messages/01_prenormalized_message.json), [output](../examples/) +- [input](../examples/), [output](../examples/normalize_nodes/messages/02_postnormalized_message.json) ### input requirements From 5e6d48a00159cbf06048239397fdd73a907683d4 Mon Sep 17 00:00:00 2001 From: Chris Bizon Date: Thu, 8 Sep 2022 13:00:41 -0400 Subject: [PATCH 5/7] nonunique --- operations/normalize_nodes.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/operations/normalize_nodes.yml b/operations/normalize_nodes.yml index 9567673..eb94b93 100644 --- a/operations/normalize_nodes.yml +++ b/operations/normalize_nodes.yml @@ -1,6 +1,7 @@ id: normalize_nodes name: normalize nodes description: This operation updates the identifiers on qgraph and kgraph nodes to their preferred identifiers, and adds equivalent identifiers in a property for knodes. When two kgraph nodes normalize to the same preferred identifier, the two knodes are merged. 
The new node contain the union of the properties of the two original nodes. All edges attached to either of the two original nodes are now subsequently attached to the new merged knode. Qnodes are not merged, so that the structure of the query can be preserved. The updates to kgraph node identifiers also necessitates the updating of result node bindings. +unique: false examples: - input: normalize_nodes/messages/01_prenormalized_message.json - output: normalize_nodes/messages/02_postnormalized_message.json From 1de99c7d1eb9ccc3af4c24ad070fcf61fc29ed01 Mon Sep 17 00:00:00 2001 From: Chris Bizon Date: Thu, 8 Sep 2022 13:13:34 -0400 Subject: [PATCH 6/7] updated schema/docs --- docs/normalize_nodes.md | 2 +- operations/normalize_nodes.yml | 2 +- schema/operation.json | 61 ++++++++++++++++++++++++++++++++-- schema/workflow.json | 4 +-- 4 files changed, 63 insertions(+), 6 deletions(-) diff --git a/docs/normalize_nodes.md b/docs/normalize_nodes.md index 3b3bc61..c0494e0 100644 --- a/docs/normalize_nodes.md +++ b/docs/normalize_nodes.md @@ -26,5 +26,5 @@ None ### parameters ```yaml -[] +{} ``` diff --git a/operations/normalize_nodes.yml b/operations/normalize_nodes.yml index eb94b93..a853f60 100644 --- a/operations/normalize_nodes.yml +++ b/operations/normalize_nodes.yml @@ -13,4 +13,4 @@ allowed_changes: - remove knodes - modify kedges - modify node bindings -parameters: [] +parameters: {} diff --git a/schema/operation.json b/schema/operation.json index 05b5167..b62ed26 100644 --- a/schema/operation.json +++ b/schema/operation.json @@ -1,6 +1,6 @@ { "$schema": "https://json-schema.org/draft-07/schema", - "$id": "https://standards.ncats.io/operation/1.3.2/schema", + "$id": "https://standards.ncats.io/operation/1.4.2/schema", "anyOf": [ { "$ref": "#/$defs/OperationAnnotate" @@ -59,6 +59,9 @@ { "$ref": "#/$defs/OperationLookupAndScore" }, + { + "$ref": "#/$defs/OperationNormalizeNodes" + }, { "$ref": "#/$defs/OperationOverlay" }, @@ -595,7 +598,8 @@ }, "description": "List 
of operation providers (by infores ID) that may be used to complete operation. No others will be used. A full list of operation providers for each operation with infores ID's is available through the '/services' endpoint of the workflow runner.", "example": [ - "infores:aragorn" ], + "infores:aragorn" + ], "minLength": 1 } } @@ -1496,6 +1500,59 @@ ], "additionalProperties": false }, + "OperationNormalizeNodes": { + "type": "object", + "description": "This operation updates the identifiers on qgraph and kgraph nodes to their preferred identifiers, and adds equivalent identifiers in a property for knodes. When two kgraph nodes normalize to the same preferred identifier, the two knodes are merged. The new node contain the union of the properties of the two original nodes. All edges attached to either of the two original nodes are now subsequently attached to the new merged knode. Qnodes are not merged, so that the structure of the query can be preserved. The updates to kgraph node identifiers also necessitates the updating of result node bindings.", + "properties": { + "id": { + "type": "string", + "enum": [ + "normalize_nodes" + ] + }, + "unique": false, + "parameters": {}, + "runner_parameters": { + "type": "object", + "oneOf": [ + { + "properties": { + "allowlist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of operation providers (by infores ID) that may be used to complete operation. No others will be used. A full list of operation providers for each operation with infores ID's is available through the '/services' endpoint of the workflow runner.", + "example": [ + "infores:aragorn" + ], + "minLength": 1 + } + } + }, + { + "properties": { + "denylist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of operation providers (by infores ID) that may not be used to complete operation. All others will be used. 
A full list of operation providers for each operation with infores ID's is available through the '/services' endpoint of the workflow runner.", + "example": [ + "infores:aragorn" + ], + "minLength": 1 + } + } + } + ] + } + }, + "required": [ + "id" + ], + "additionalProperties": false + }, "OperationOverlay": { "type": "object", "description": "This operation adds additional qedges and/or kedges and/or result edge bindings.", diff --git a/schema/workflow.json b/schema/workflow.json index ecc36ba..f08bb5b 100644 --- a/schema/workflow.json +++ b/schema/workflow.json @@ -1,8 +1,8 @@ { "$schema": "https://json-schema.org/draft-07/schema", - "$id": "https://standards.ncats.io/workflow/1.3.2/schema", + "$id": "https://standards.ncats.io/workflow/1.4.2/schema", "type": "array", "items": { - "$ref": "https://standards.ncats.io/operation/1.3.2/schema" + "$ref": "https://standards.ncats.io/operation/1.4.2/schema" } } \ No newline at end of file From c5a3be016432ad805c8fac57c731e714203e9c2a Mon Sep 17 00:00:00 2001 From: Chris Bizon Date: Thu, 8 Sep 2022 13:18:44 -0400 Subject: [PATCH 7/7] Updated README with brief change instructions --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 1890800..68d339c 100644 --- a/README.md +++ b/README.md @@ -13,3 +13,6 @@ Initially, design considerations will be discussed and voted on. Eventually, the # Workflow Runner The following repo will, after you stand it up, provide an endpoint to which you can post workflows and registered ARAs will execute the operations they have implemented. https://github.com/NCATSTranslator/workflow-runner + +# New Operations +Creation of a new operation involves generation of the operation definition in the `operations` directory, then rebuilding the docs with `docs/build/generate_docs.py` and the schema with `schema/build/generate_schema.py`.