diff --git a/README.md b/README.md index 1890800..68d339c 100644 --- a/README.md +++ b/README.md @@ -13,3 +13,6 @@ Initially, design considerations will be discussed and voted on. Eventually, the # Workflow Runner The following repo will, after you stand it up, provide an endpoint to which you can post workflows and registered ARAs will execute the operations they have implemented. https://github.com/NCATSTranslator/workflow-runner + +# New Operations +To create a new operation, add its definition to the `operations` directory, then rebuild the docs with `docs/build/generate_docs.py` and the schema with `schema/build/generate_schema.py`. diff --git a/docs/index.md b/docs/index.md index 5f9cac4..ae7ce88 100644 --- a/docs/index.md +++ b/docs/index.md @@ -19,6 +19,7 @@ - [filter_results_top_n](./filter_results_top_n.md) - [lookup](./lookup.md) - [lookup_and_score](./lookup_and_score.md) +- [normalize_nodes](./normalize_nodes.md) - [overlay](./overlay.md) - [overlay_compute_jaccard](./overlay_compute_jaccard.md) - [overlay_compute_ngd](./overlay_compute_ngd.md) diff --git a/docs/normalize_nodes.md b/docs/normalize_nodes.md new file mode 100644 index 0000000..c0494e0 --- /dev/null +++ b/docs/normalize_nodes.md @@ -0,0 +1,30 @@ +# normalize nodes + +This operation updates the identifiers on qgraph and kgraph nodes to their preferred identifiers, and adds equivalent identifiers in a property for knodes. When two kgraph nodes normalize to the same preferred identifier, the two knodes are merged. The new node contains the union of the properties of the two original nodes. All edges attached to either of the two original nodes are now subsequently attached to the new merged knode. Qnodes are not merged, so that the structure of the query can be preserved. The updates to kgraph node identifiers also necessitate the updating of result node bindings. 
+ +### examples + +- [input](../examples/normalize_nodes/messages/01_prenormalized_message.json), [output](../examples/) +- [input](../examples/), [output](../examples/normalize_nodes/messages/02_postnormalized_message.json) + +### input requirements + +None + +### output guarantees + +None + +### allowed changes + +- modify qnodes +- modify knodes +- remove knodes +- modify kedges +- modify node bindings + +### parameters + +```yaml +{} +``` diff --git a/examples/normalize_nodes/messages/01_prenormalized_message.json b/examples/normalize_nodes/messages/01_prenormalized_message.json new file mode 100644 index 0000000..83398dc --- /dev/null +++ b/examples/normalize_nodes/messages/01_prenormalized_message.json @@ -0,0 +1,170 @@ +{ + "message": { + "query_graph": { + "nodes": { + "n1": { + "ids": ["HGNC:11603"], + "categories": [ + "biolink:Gene" + ] + }, + "n2": { + "ids": ["NCBIGene:9496"], + "categories": [ + "biolink:Gene" + ] + }, + "n3": { + "ids": ["MONDO:0005002"], + "categories": [ + "biolink:Disease" + ] + }, + "n4": { + "ids": ["DOID:3083"], + "categories": [ + "biolink:Disease" + ] + }, + "n5": { + "categories": [ + "biolink:Disease" + ] + } + }, + "edges": { + "e1": { + "subject": "n1", + "object": "n3" + }, + "e2": { + "subject": "n2", + "object": "n4", + "predicates": ["biolink:related_to"] + }, + "e3": { + "subject": "n1", + "object": "n5" + } + } + }, + "knowledge_graph": { + "nodes": { + "HGNC:11603": { + "name": "TBX4", + "categories": [ + "biolink:Gene" + ] + }, + "NCBIGene:9496": { + "name": "T-box transcription factor 4", + "categories": [ + "biolink:Gene" + ] + }, + "MONDO:0005002": { + "name": "chronic obstructive pulmonary disease", + "categories": [ + "biolink:Disease" + ] + }, + "DOID:3083": { + "name": "chronic obstructive pulmonary disease", + "categories": [ + "biolink:Disease" + ] + }, + "UMLS:CN202575": { + "name": "heritable pulmonary arterial hypertension", + "categories": [ + "biolink:Disease" + ] + } + }, + "edges": { + 
"a8575c4e-61a6-428a-bf09-fcb3e8d1644d": { + "subject": "HGNC:11603", + "object": "MONDO:0005002", + "predicate": "biolink:related_to" + }, + "2d38345a-e9bf-4943-accb-dccba351dd04": { + "subject": "NCBIGene:9496", + "object": "DOID:3083", + "predicate": "biolink:related_to" + }, + "044a7916-fba9-4b4f-ae48-f0815b0b222d": { + "subject": "HGNC:11603", + "object": "UMLS:CN202575", + "predicate": "biolink:related_to" + } + } + }, + "results": [ + { + "node_bindings": { + "n1": [ + { + "id": "HGNC:11603" + } + ], + "n3": [ + { + "id": "MONDO:0005002" + } + ] + }, + "edge_bindings": { + "e1": [ + { + "id": "a8575c4e-61a6-428a-bf09-fcb3e8d1644d" + } + ] + } + }, + { + "node_bindings": { + "n2": [ + { + "id": "NCBIGene:9496" + } + ], + "n4": [ + { + "id": "DOID:3083" + } + ] + }, + "edge_bindings": { + "e2": [ + { + "id": "2d38345a-e9bf-4943-accb-dccba351dd04" + } + ] + } + }, + { + "node_bindings": { + "n1": [ + { + "id": "HGNC:11603" + } + ], + "n5": [ + { + "id": "UMLS:CN202575" + } + ] + }, + "edge_bindings": { + "e3": [ + { + "id": "044a7916-fba9-4b4f-ae48-f0815b0b222d" + } + ] + } + } + ] + }, + "logs": null, + "status": null +} diff --git a/examples/normalize_nodes/messages/02_postnormalized_message.json b/examples/normalize_nodes/messages/02_postnormalized_message.json new file mode 100644 index 0000000..caa340a --- /dev/null +++ b/examples/normalize_nodes/messages/02_postnormalized_message.json @@ -0,0 +1,245 @@ +{ "workflow": null, + "message": { + "query_graph": { + "nodes": { + "n1": { + "ids": ["NCBIGene:9496"], + "categories": [ + "biolink:Gene" + ], + "is_set": false, + "constraints": [] + }, + "n2": { + "ids": ["NCBIGene:9496"], + "categories": [ + "biolink:Gene" + ], + "is_set": false, + "constraints": [] + }, + "n3": { + "ids": ["MONDO:0005002"], + "categories": [ + "biolink:Disease" + ], + "is_set": false, + "constraints": [] + }, + "n4": { + "ids": ["MONDO:0005002"], + "categories": [ + "biolink:Disease" + ], + "is_set": false, + "constraints": [] + }, + 
"n5": { + "categories": [ + "biolink:Disease" + ], + "ids": null, + "is_set": false, + "constraints": [] + } + }, + "edges": { + "e1": { + "subject": "n1", + "object": "n3", + "predicates": null, + "constraints": [] + }, + "e2": { + "subject": "n2", + "object": "n4", + "predicates": ["biolink:related_to"], + "constraints": [] + }, + "e3": { + "subject": "n1", + "object": "n5", + "predicates": null, + "constraints": [] + } + } + }, + "knowledge_graph": { + "nodes": { + "NCBIGene:9496": { + "attributes": [ + { + "attribute_type_id": "biolink:same_as", + "attribute_source": null, + "original_attribute_name": "equivalent_identifiers", + "value_url": null, + "value": [ + "NCBIGene:9496", + "ENSEMBL:ENSG00000121075", + "HGNC:11603", + "UniProtKB:P57082" + ], + "description": null, + "value_type_id": "EDAM:data_0006", + "attributes": null + } + ], + "categories": [ + "biolink:Gene", + "biolink:NamedThing", + "biolink:BiologicalEntity", + "biolink:MolecularEntity", + "biolink:GenomicEntity", + "biolink:MacromolecularMachine", + "biolink:GeneOrGeneProduct" + ], + "name": "TBX4" + }, + "MONDO:0005002": { + "attributes": [ + { + "attribute_type_id": "biolink:same_as", + "attribute_source": null, + "original_attribute_name": "equivalent_identifiers", + "value_url": null, + "value": [ + "MONDO:0005002", + "DOID:3083", + "EFO:0000341", + "UMLS:C0024117", + "MESH:D029424", + "NCIT:C3199", + "SNOMEDCT:13645005", + "HP:0006510" + ], + "description": null, + "value_type_id": "EDAM:data_0006", + "attributes": null + } + ], + "categories": [ + "biolink:Disease", + "biolink:NamedThing", + "biolink:BiologicalEntity", + "biolink:DiseaseOrPhenotypicFeature" + ], + "name": "chronic obstructive pulmonary disease" + }, + "MONDO:0017148": { + "attributes": [ + { + "attribute_type_id": "biolink:same_as", + "attribute_source": null, + "original_attribute_name": "equivalent_identifiers", + "value_url": null, + "value": [ + "MONDO:0017148", + "ORPHANET:275777", + "UMLS:CN202575", + 
"NCIT:C121945", + "SNOMEDCT:697897003" + ], + "description": null, + "value_type_id": "EDAM:data_0006", + "attributes": null + } + ], + "categories": [ + "biolink:Disease", + "biolink:DiseaseOrPhenotypicFeature", + "biolink:BiologicalEntity", + "biolink:NamedThing" + ], + "name": "heritable pulmonary arterial hypertension" + } + }, + "edges": { + "044a7916-fba9-4b4f-ae48-f0815b0b222d": { + "subject": "NCBIGene:9496", + "object": "MONDO:0017148", + "predicate": "biolink:related_to", + "attributes": null + }, + "a8575c4e-61a6-428a-bf09-fcb3e8d1644d": { + "subject": "NCBIGene:9496", + "object": "MONDO:0005002", + "predicate": "biolink:related_to", + "attributes": null + } + } + }, + "results": [ + { + "node_bindings": { + "n1": [ + { + "id": "NCBIGene:9496" + } + ], + "n3": [ + { + "id": "MONDO:0005002" + } + ] + }, + "edge_bindings": { + "e1": [ + { + "id": "a8575c4e-61a6-428a-bf09-fcb3e8d1644d", + "attributes": null + } + ] + }, + "score": null + }, + { + "node_bindings": { + "n2": [ + { + "id": "NCBIGene:9496" + } + ], + "n4": [ + { + "id": "MONDO:0005002" + } + ] + }, + "edge_bindings": { + "e2": [ + { + "id": "a8575c4e-61a6-428a-bf09-fcb3e8d1644d", + "attributes": null + } + ] + }, + "score": null + }, + { + "node_bindings": { + "n1": [ + { + "id": "NCBIGene:9496" + } + ], + "n5": [ + { + "id": "MONDO:0017148" + } + ] + }, + "edge_bindings": { + "e3": [ + { + "id": "044a7916-fba9-4b4f-ae48-f0815b0b222d", + "attributes": null + } + ] + }, + "score": null + } + ] + }, + "logs": null, + "status": null +} diff --git a/operations/normalize_nodes.yml b/operations/normalize_nodes.yml new file mode 100644 index 0000000..a853f60 --- /dev/null +++ b/operations/normalize_nodes.yml @@ -0,0 +1,16 @@ +id: normalize_nodes +name: normalize nodes +description: This operation updates the identifiers on qgraph and kgraph nodes to their preferred identifiers, and adds equivalent identifiers in a property for knodes. 
When two kgraph nodes normalize to the same preferred identifier, the two knodes are merged. The new node contains the union of the properties of the two original nodes. All edges attached to either of the two original nodes are now subsequently attached to the new merged knode. Qnodes are not merged, so that the structure of the query can be preserved. The updates to kgraph node identifiers also necessitate the updating of result node bindings. +unique: false +examples: +- input: normalize_nodes/messages/01_prenormalized_message.json +  output: normalize_nodes/messages/02_postnormalized_message.json +input_requirements: [] +output_guarantees: [] +allowed_changes: +- modify qnodes +- modify knodes +- remove knodes +- modify kedges +- modify node bindings +parameters: {} diff --git a/requirements.txt b/requirements.txt index 6b02114..fa8e814 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ +# jinja2 2.11.3 installs MarkupSafe without a version pin, and MarkupSafe >= 2.1.0 breaks it. +MarkupSafe<2.1.0 jinja2==2.11.3 jsonschema==3.2.0 pytest==6.2.3 diff --git a/schema/operation.json b/schema/operation.json index 05b5167..b62ed26 100644 --- a/schema/operation.json +++ b/schema/operation.json @@ -1,6 +1,6 @@ { "$schema": "https://json-schema.org/draft-07/schema", - "$id": "https://standards.ncats.io/operation/1.3.2/schema", + "$id": "https://standards.ncats.io/operation/1.4.2/schema", "anyOf": [ { "$ref": "#/$defs/OperationAnnotate" @@ -59,6 +59,9 @@ { "$ref": "#/$defs/OperationLookupAndScore" }, + { + "$ref": "#/$defs/OperationNormalizeNodes" + }, { "$ref": "#/$defs/OperationOverlay" }, @@ -595,7 +598,8 @@ }, "description": "List of operation providers (by infores ID) that may be used to complete operation. No others will be used. 
A full list of operation providers for each operation with infores ID's is available through the '/services' endpoint of the workflow runner.", "example": [ - "infores:aragorn" ], + "infores:aragorn" + ], "minLength": 1 } } @@ -1496,6 +1500,59 @@ ], "additionalProperties": false }, + "OperationNormalizeNodes": { + "type": "object", + "description": "This operation updates the identifiers on qgraph and kgraph nodes to their preferred identifiers, and adds equivalent identifiers in a property for knodes. When two kgraph nodes normalize to the same preferred identifier, the two knodes are merged. The new node contains the union of the properties of the two original nodes. All edges attached to either of the two original nodes are now subsequently attached to the new merged knode. Qnodes are not merged, so that the structure of the query can be preserved. The updates to kgraph node identifiers also necessitate the updating of result node bindings.", + "properties": { + "id": { + "type": "string", + "enum": [ + "normalize_nodes" + ] + }, + "unique": false, + "parameters": {}, + "runner_parameters": { + "type": "object", + "oneOf": [ + { + "properties": { + "allowlist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of operation providers (by infores ID) that may be used to complete operation. No others will be used. A full list of operation providers for each operation with infores ID's is available through the '/services' endpoint of the workflow runner.", + "example": [ + "infores:aragorn" + ], + "minLength": 1 + } + } + }, + { + "properties": { + "denylist": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of operation providers (by infores ID) that may not be used to complete operation. All others will be used. 
A full list of operation providers for each operation with infores ID's is available through the '/services' endpoint of the workflow runner.", + "example": [ + "infores:aragorn" + ], + "minLength": 1 + } + } + } + ] + } + }, + "required": [ + "id" + ], + "additionalProperties": false + }, "OperationOverlay": { "type": "object", "description": "This operation adds additional qedges and/or kedges and/or result edge bindings.", diff --git a/schema/workflow.json b/schema/workflow.json index ecc36ba..f08bb5b 100644 --- a/schema/workflow.json +++ b/schema/workflow.json @@ -1,8 +1,8 @@ { "$schema": "https://json-schema.org/draft-07/schema", - "$id": "https://standards.ncats.io/workflow/1.3.2/schema", + "$id": "https://standards.ncats.io/workflow/1.4.2/schema", "type": "array", "items": { - "$ref": "https://standards.ncats.io/operation/1.3.2/schema" + "$ref": "https://standards.ncats.io/operation/1.4.2/schema" } } \ No newline at end of file