diff --git a/plugins/life-science-research/.codex-plugin/plugin.json b/plugins/life-science-research/.codex-plugin/plugin.json index 40348aa1f..2f09ca53d 100644 --- a/plugins/life-science-research/.codex-plugin/plugin.json +++ b/plugins/life-science-research/.codex-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "life-science-research", - "version": "1.0.3", + "version": "1.0.4", "description": "General life-sciences research workflows with query routing, evidence synthesis, and optional parallel subagent analysis across genetics, omics, biology, chemistry, structure, clinical evidence, and public dataset discovery.", "author": { "name": "OpenAI" diff --git a/plugins/life-science-research/README.md b/plugins/life-science-research/README.md index 9833b9817..90075aaa9 100644 --- a/plugins/life-science-research/README.md +++ b/plugins/life-science-research/README.md @@ -23,6 +23,19 @@ When a user invokes this plugin, treat it as a general research copilot for life 6. Synthesize for the user. Return a concise research answer with the key evidence, important caveats, and clear next steps. Save raw payloads only when the user asks for them. 
+## Source Presentation + +Every skill follows a shared, output-aware source-presentation contract: + +- put source links next to substantive externally sourced claims rather than relying on a trailing list of databases checked; +- render stable publication IDs, accessions, trial IDs, variant IDs, pathway IDs, structure IDs, and dataset IDs as links to authoritative records when a canonical URL is available; +- fall back to sanitized request URLs when a source has no stable human-readable record page; +- retain provenance without forcing evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or queried sources that returned no supporting evidence; +- never invent a deep link or imply that a source supports information absent from its response; and +- preserve explicitly requested raw or machine-readable output without injecting Markdown. + +The full runtime policy is in `references/source-presentation.md`, and source-specific display names and URL templates are in `references/source-links.json`. Script-backed skills expose additive `sources` metadata so the selected skill and `research-router-skill` can use consistent provenance in user-facing summaries. + ## Research Patterns This plugin is meant to support workflows like: @@ -173,3 +186,5 @@ Each subagent should receive a bounded objective and return concise findings, ca The plugin does not require plugin-local app connectors or MCP servers. The bundled skills are self-contained under `plugins/life-science-research/skills/` and generally call their own scripts or public APIs directly. This plugin should be treated as a routing and synthesis layer over those skills. A focused question may require only one skill. A broader research question may require a short multi-skill chain, and when the work splits naturally into independent lanes, optional subagent-assisted parallel analysis before final synthesis. 
+ +Run `python scripts/validate_source_presentation.py` from the plugin directory to verify that all skills are covered by the source registry, load the runtime contract, and expose provenance from script-backed workflows. diff --git a/plugins/life-science-research/references/source-links.json b/plugins/life-science-research/references/source-links.json new file mode 100644 index 000000000..d31e1289b --- /dev/null +++ b/plugins/life-science-research/references/source-links.json @@ -0,0 +1,363 @@ +{ + "schema_version": 1, + "fallback_order": [ + "canonical_record_url", + "sanitized_request_url", + "authoritative_homepage_without_linking_the_identifier" + ], + "skills": { + "alphafold-skill": { + "source_name": "AlphaFold Protein Structure Database", + "homepage_url": "https://alphafold.ebi.ac.uk/", + "record_url_templates": [ + {"identifier_type": "UniProt accession", "template": "https://alphafold.ebi.ac.uk/entry/{id}"} + ] + }, + "bgee-skill": { + "source_name": "Bgee", + "homepage_url": "https://www.bgee.org/", + "record_url_templates": [] + }, + "bindingdb-skill": { + "source_name": "BindingDB", + "homepage_url": "https://www.bindingdb.org/", + "record_url_templates": [] + }, + "biobankjapan-phewas-skill": { + "source_name": "BioBank Japan PheWeb", + "homepage_url": "https://pheweb.jp/", + "record_url_templates": [ + {"identifier_type": "variant", "template": "https://pheweb.jp/variant/{id}"}, + {"identifier_type": "phenotype", "template": "https://pheweb.jp/pheno/{id}"} + ] + }, + "biorxiv-skill": { + "source_name": "bioRxiv and medRxiv", + "homepage_url": "https://www.biorxiv.org/", + "record_url_templates": [ + {"identifier_type": "DOI", "template": "https://doi.org/{id}"} + ] + }, + "biostudies-arrayexpress-skill": { + "source_name": "BioStudies and ArrayExpress", + "homepage_url": "https://www.ebi.ac.uk/biostudies/", + "record_url_templates": [ + {"identifier_type": "study accession", "template": "https://www.ebi.ac.uk/biostudies/studies/{id}"} + ] + }, + 
"cbioportal-skill": { + "source_name": "cBioPortal", + "homepage_url": "https://www.cbioportal.org/", + "record_url_templates": [] + }, + "cellxgene-skill": { + "source_name": "CZ CELLxGENE Discover", + "homepage_url": "https://cellxgene.cziscience.com/", + "record_url_templates": [ + {"identifier_type": "collection ID", "template": "https://cellxgene.cziscience.com/collections/{id}"} + ] + }, + "chebi-skill": { + "source_name": "ChEBI", + "homepage_url": "https://www.ebi.ac.uk/chebi/", + "record_url_templates": [ + {"identifier_type": "ChEBI ID", "template": "https://www.ebi.ac.uk/chebi/searchId.do?chebiId={id}"} + ] + }, + "chembl-skill": { + "source_name": "ChEMBL", + "homepage_url": "https://www.ebi.ac.uk/chembl/", + "record_url_templates": [ + {"identifier_type": "compound ID", "template": "https://www.ebi.ac.uk/chembl/explore/compound/{id}"}, + {"identifier_type": "target ID", "template": "https://www.ebi.ac.uk/chembl/explore/target/{id}"} + ] + }, + "civic-skill": { + "source_name": "CIViC", + "homepage_url": "https://civicdb.org/", + "record_url_templates": [ + {"identifier_type": "variant ID", "template": "https://civicdb.org/variants/{id}/summary"}, + {"identifier_type": "evidence ID", "template": "https://civicdb.org/evidence/{id}/summary"} + ] + }, + "clinicaltrials-skill": { + "source_name": "ClinicalTrials.gov", + "homepage_url": "https://clinicaltrials.gov/", + "record_url_templates": [ + {"identifier_type": "NCT ID", "template": "https://clinicaltrials.gov/study/{id}"} + ] + }, + "clinvar-variation-skill": { + "source_name": "ClinVar and NCBI Variation", + "homepage_url": "https://www.ncbi.nlm.nih.gov/clinvar/", + "record_url_templates": [ + {"identifier_type": "numeric ClinVar Variation ID", "template": "https://www.ncbi.nlm.nih.gov/clinvar/variation/{id}/", "transform": "For a VCV accession, strip the VCV prefix and leading zeros before substitution."}, + {"identifier_type": "RefSNP ID", "template": "https://www.ncbi.nlm.nih.gov/snp/{id}"} + ] + 
}, + "efo-ontology-skill": { + "source_name": "Experimental Factor Ontology via OLS", + "homepage_url": "https://www.ebi.ac.uk/ols4/ontologies/efo", + "record_url_templates": [] + }, + "encode-skill": { + "source_name": "ENCODE", + "homepage_url": "https://www.encodeproject.org/", + "record_url_templates": [ + {"identifier_type": "ENCODE accession", "template": "https://www.encodeproject.org/{id}/"} + ] + }, + "ensembl-skill": { + "source_name": "Ensembl", + "homepage_url": "https://www.ensembl.org/", + "record_url_templates": [ + {"identifier_type": "stable Ensembl ID", "template": "https://www.ensembl.org/id/{id}"} + ] + }, + "epigraphdb-skill": { + "source_name": "EpiGraphDB", + "homepage_url": "https://epigraphdb.org/", + "record_url_templates": [] + }, + "eqtl-catalogue-skill": { + "source_name": "eQTL Catalogue", + "homepage_url": "https://www.ebi.ac.uk/eqtl/", + "record_url_templates": [] + }, + "eva-skill": { + "source_name": "European Variation Archive", + "homepage_url": "https://www.ebi.ac.uk/eva/", + "record_url_templates": [] + }, + "finngen-phewas-skill": { + "source_name": "FinnGen PheWeb", + "homepage_url": "https://r12.finngen.fi/", + "record_url_templates": [ + {"identifier_type": "variant", "template": "https://r12.finngen.fi/variant/{id}"}, + {"identifier_type": "phenotype", "template": "https://r12.finngen.fi/pheno/{id}"} + ] + }, + "genebass-gene-burden-skill": { + "source_name": "Genebass", + "homepage_url": "https://app.genebass.org/", + "record_url_templates": [] + }, + "gnomad-graphql-skill": { + "source_name": "gnomAD", + "homepage_url": "https://gnomad.broadinstitute.org/", + "record_url_templates": [ + {"identifier_type": "variant", "template": "https://gnomad.broadinstitute.org/variant/{id}"}, + {"identifier_type": "gene ID", "template": "https://gnomad.broadinstitute.org/gene/{id}"} + ] + }, + "gtex-eqtl-skill": { + "source_name": "GTEx Portal", + "homepage_url": "https://gtexportal.org/", + "record_url_templates": [ + 
{"identifier_type": "gene ID", "template": "https://gtexportal.org/home/gene/{id}"} + ] + }, + "gwas-catalog-skill": { + "source_name": "NHGRI-EBI GWAS Catalog", + "homepage_url": "https://www.ebi.ac.uk/gwas/", + "record_url_templates": [ + {"identifier_type": "study accession", "template": "https://www.ebi.ac.uk/gwas/studies/{id}"}, + {"identifier_type": "association accession", "template": "https://www.ebi.ac.uk/gwas/associations/{id}"}, + {"identifier_type": "variant", "template": "https://www.ebi.ac.uk/gwas/variants/{id}"} + ] + }, + "hmdb-skill": { + "source_name": "Human Metabolome Database", + "homepage_url": "https://hmdb.ca/", + "record_url_templates": [ + {"identifier_type": "HMDB ID", "template": "https://hmdb.ca/metabolites/{id}"} + ] + }, + "human-protein-atlas-skill": { + "source_name": "Human Protein Atlas", + "homepage_url": "https://www.proteinatlas.org/", + "record_url_templates": [ + {"identifier_type": "gene symbol or Ensembl gene ID", "template": "https://www.proteinatlas.org/{id}"} + ] + }, + "ipd-skill": { + "source_name": "IPD-IMGT/HLA", + "homepage_url": "https://www.ebi.ac.uk/ipd/imgt/hla/", + "record_url_templates": [] + }, + "locus-to-gene-mapper-skill": { + "source_name": "Life Science Research locus-to-gene synthesis", + "homepage_url": null, + "record_url_templates": [], + "fallback": "downstream_sources" + }, + "metabolights-skill": { + "source_name": "MetaboLights", + "homepage_url": "https://www.ebi.ac.uk/metabolights/", + "record_url_templates": [ + {"identifier_type": "MetaboLights study accession", "template": "https://www.ebi.ac.uk/metabolights/{id}"} + ] + }, + "mgnify-skill": { + "source_name": "MGnify", + "homepage_url": "https://www.ebi.ac.uk/metagenomics/", + "record_url_templates": [ + {"identifier_type": "study accession", "template": "https://www.ebi.ac.uk/metagenomics/studies/{id}"}, + {"identifier_type": "analysis accession", "template": "https://www.ebi.ac.uk/metagenomics/analyses/{id}"}, + {"identifier_type": 
"sample accession", "template": "https://www.ebi.ac.uk/metagenomics/samples/{id}"} + ] + }, + "ncbi-blast-skill": { + "source_name": "NCBI BLAST", + "homepage_url": "https://blast.ncbi.nlm.nih.gov/Blast.cgi", + "record_url_templates": [ + {"identifier_type": "BLAST RID", "template": "https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Get&RID={id}"} + ] + }, + "ncbi-clinicaltables-skill": { + "source_name": "NCBI Gene via Clinical Tables", + "homepage_url": "https://clinicaltables.nlm.nih.gov/", + "record_url_templates": [ + {"identifier_type": "NCBI Gene ID", "template": "https://www.ncbi.nlm.nih.gov/gene/{id}"} + ] + }, + "ncbi-datasets-skill": { + "source_name": "NCBI Datasets", + "homepage_url": "https://www.ncbi.nlm.nih.gov/datasets/", + "record_url_templates": [ + {"identifier_type": "genome accession", "template": "https://www.ncbi.nlm.nih.gov/datasets/genome/{id}/"}, + {"identifier_type": "NCBI Gene ID", "template": "https://www.ncbi.nlm.nih.gov/gene/{id}"} + ] + }, + "ncbi-entrez-skill": { + "source_name": "NCBI Entrez", + "homepage_url": "https://www.ncbi.nlm.nih.gov/search/", + "record_url_templates": [ + {"identifier_type": "PMID", "template": "https://pubmed.ncbi.nlm.nih.gov/{id}/"}, + {"identifier_type": "DOI", "template": "https://doi.org/{id}"}, + {"identifier_type": "NCBI Gene ID", "template": "https://www.ncbi.nlm.nih.gov/gene/{id}"}, + {"identifier_type": "GEO accession", "template": "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={id}"} + ] + }, + "ncbi-pmc-skill": { + "source_name": "PubMed Central", + "homepage_url": "https://pmc.ncbi.nlm.nih.gov/", + "record_url_templates": [ + {"identifier_type": "PMCID", "template": "https://pmc.ncbi.nlm.nih.gov/articles/{id}/"} + ] + }, + "opentargets-skill": { + "source_name": "Open Targets Platform", + "homepage_url": "https://platform.opentargets.org/", + "record_url_templates": [ + {"identifier_type": "target ID", "template": "https://platform.opentargets.org/target/{id}"}, + {"identifier_type": "disease 
ID", "template": "https://platform.opentargets.org/disease/{id}"} + ] + }, + "pharmgkb-skill": { + "source_name": "PharmGKB", + "homepage_url": "https://www.pharmgkb.org/", + "record_url_templates": [ + {"identifier_type": "chemical ID", "template": "https://www.pharmgkb.org/chemical/{id}"}, + {"identifier_type": "gene ID", "template": "https://www.pharmgkb.org/gene/{id}"}, + {"identifier_type": "variant ID", "template": "https://www.pharmgkb.org/variant/{id}"} + ] + }, + "pride-skill": { + "source_name": "PRIDE Archive", + "homepage_url": "https://www.ebi.ac.uk/pride/archive/", + "record_url_templates": [ + {"identifier_type": "PRIDE project accession", "template": "https://www.ebi.ac.uk/pride/archive/projects/{id}"} + ] + }, + "proteomexchange-skill": { + "source_name": "ProteomeXchange", + "homepage_url": "https://www.proteomexchange.org/", + "record_url_templates": [ + {"identifier_type": "ProteomeXchange accession", "template": "https://proteomecentral.proteomexchange.org/cgi/GetDataset?ID={id}"} + ] + }, + "pubchem-pug-skill": { + "source_name": "PubChem", + "homepage_url": "https://pubchem.ncbi.nlm.nih.gov/", + "record_url_templates": [ + {"identifier_type": "compound CID", "template": "https://pubchem.ncbi.nlm.nih.gov/compound/{id}"}, + {"identifier_type": "substance SID", "template": "https://pubchem.ncbi.nlm.nih.gov/substance/{id}"}, + {"identifier_type": "assay AID", "template": "https://pubchem.ncbi.nlm.nih.gov/bioassay/{id}"} + ] + }, + "quickgo-skill": { + "source_name": "QuickGO", + "homepage_url": "https://www.ebi.ac.uk/QuickGO/", + "record_url_templates": [ + {"identifier_type": "GO term", "template": "https://www.ebi.ac.uk/QuickGO/term/{id}"} + ] + }, + "rcsb-pdb-skill": { + "source_name": "RCSB Protein Data Bank", + "homepage_url": "https://www.rcsb.org/", + "record_url_templates": [ + {"identifier_type": "PDB ID", "template": "https://www.rcsb.org/structure/{id}"} + ] + }, + "reactome-skill": { + "source_name": "Reactome", + "homepage_url": 
"https://reactome.org/", + "record_url_templates": [ + {"identifier_type": "Reactome stable ID", "template": "https://reactome.org/content/detail/{id}"} + ] + }, + "research-router-skill": { + "source_name": "Downstream evidence sources", + "homepage_url": null, + "record_url_templates": [], + "fallback": "downstream_sources" + }, + "rhea-skill": { + "source_name": "Rhea", + "homepage_url": "https://www.rhea-db.org/", + "record_url_templates": [ + {"identifier_type": "Rhea numeric ID", "template": "https://www.rhea-db.org/rhea/{id}", "transform": "Strip the RHEA: prefix before substitution."} + ] + }, + "rnacentral-skill": { + "source_name": "RNAcentral", + "homepage_url": "https://rnacentral.org/", + "record_url_templates": [ + {"identifier_type": "RNAcentral ID", "template": "https://rnacentral.org/rna/{id}"} + ] + }, + "string-skill": { + "source_name": "STRING", + "homepage_url": "https://string-db.org/", + "record_url_templates": [ + {"identifier_type": "STRING protein identifier", "template": "https://string-db.org/network/{id}"} + ] + }, + "tpmi-phewas-skill": { + "source_name": "Taiwan Precision Medicine Initiative PheWeb", + "homepage_url": "https://pheweb.ibms.sinica.edu.tw/", + "record_url_templates": [ + {"identifier_type": "variant", "template": "https://pheweb.ibms.sinica.edu.tw/variant/{id}"}, + {"identifier_type": "phenotype", "template": "https://pheweb.ibms.sinica.edu.tw/pheno/{id}"} + ] + }, + "ukb-topmed-phewas-skill": { + "source_name": "UKB-TOPMed PheWeb", + "homepage_url": "https://pheweb.org/UKB-TOPMed/", + "record_url_templates": [ + {"identifier_type": "variant", "template": "https://pheweb.org/UKB-TOPMed/variant/{id}"}, + {"identifier_type": "phenotype", "template": "https://pheweb.org/UKB-TOPMed/pheno/{id}"} + ] + }, + "uniprot-skill": { + "source_name": "UniProt", + "homepage_url": "https://www.uniprot.org/", + "record_url_templates": [ + {"identifier_type": "UniProtKB accession", "template": 
"https://www.uniprot.org/uniprotkb/{id}/entry"}, + {"identifier_type": "UniRef cluster", "template": "https://www.uniprot.org/uniref/{id}"}, + {"identifier_type": "UniParc accession", "template": "https://www.uniprot.org/uniparc/{id}/entry"} + ] + } + } +} diff --git a/plugins/life-science-research/references/source-presentation.md b/plugins/life-science-research/references/source-presentation.md new file mode 100644 index 000000000..cb6c0081e --- /dev/null +++ b/plugins/life-science-research/references/source-presentation.md @@ -0,0 +1,77 @@ +# Source Presentation Contract + +This contract applies to every user-facing answer produced with the Life Science +Research plugin. Its governing rule is: **every substantive externally sourced +claim should remain traceable, but not every skill invocation needs a clickable +evidence link.** Keep structured provenance when it is available, then choose +the presentation that matches what the response actually contains. + +## Choose the presentation by output mode + +| Output mode | User-facing source behavior | +| --- | --- | +| Record or evidence lookup | Put the most specific authoritative link next to the claim or tightly related claim cluster it supports. | +| Search or result list | Link the reproducible query when useful and link only the individual records discussed materially; do not link every returned row by default. | +| Connectivity or schema check | Retain endpoint provenance in structured output or an optional source note, but do not present the endpoint as scientific evidence. | +| Source metadata or service status | Attribute the source once when the metadata matters. Do not invent a record link merely to make the source clickable. | +| Empty result or failed request | Name the attempted source and report the empty result or failure clearly. Do not imply evidentiary support or construct an unsupported record link. | +| Router or planner | Do not add example citations for the routing decision itself. 
Propagate only sources actually returned by downstream evidence work. | +| Local synthesis or derived analysis | Cite inputs that contributed evidence. Keep queried-but-empty sources in methods, provenance, or limitations rather than attaching them to a result claim. | +| Raw machine-readable output | Preserve the requested payload without injecting Markdown. Keep provenance outside the raw payload when needed. | + +## Required behavior + +1. For every substantive externally sourced claim, put a source link next to + the claim or claim cluster it supports. A trailing list of databases that + were checked is useful for completeness, but it does not replace + claim-adjacent attribution when an evidence claim was made. +2. When an authoritative public record URL can be constructed, render stable + publication IDs, accessions, trial IDs, variant IDs, pathway IDs, structure + IDs, and dataset IDs as Markdown links instead of bare identifiers. +3. Prefer the source's canonical human-readable record page. If the source has + no stable record page, use the sanitized request URL returned by the skill. + When only an authoritative source URL is available, link the source name but + leave the identifier as plain text. If none is available, name the source and + identifier without inventing a link. +4. Use the `sources` entries returned by scripts as the provenance baseline. + Preserve them even when an output mode does not call for an inline evidence + link. Do not claim that a source supports information absent from its + response. +5. Keep link density readable. One citation may support a tightly related group + of claims, but each materially different evidence claim needs its own source. +6. Preserve raw JSON, XML, FASTA, CSV, and other explicitly requested + machine-readable output. Do not inject Markdown into raw payloads. +7. Do not force a claim-adjacent link for routing, connectivity, schema-only, + empty, or failed work. 
If useful, identify these sources in a concise + `Sources checked`, methods, provenance, or limitations note. +8. Deduplicate repeated links in an optional final `Sources` list while keeping + applicable claim-adjacent links in the body. + +## Source-specific rules + +Use the current skill's entry in `source-links.json` for display names, +authoritative home pages, canonical record URL templates, and fallback behavior. +Only use a template when its identifier type and required fields match the +record at hand. URL-encode substituted values and apply any transformation +named by the template. When no matching template exists, prefer the script's +sanitized `request_url`, then its authoritative `url` without presenting that +URL as a record-specific link. + +## Synthesis rules + +When several skills contribute to one answer: + +- keep each applicable citation attached to the claim derived from that source; +- preserve source-specific disagreements instead of merging them into a single + unsupported statement; +- link the primary record rather than a generic database home page whenever + possible; +- distinguish evidence-contributing sources from sources that were queried but + returned no relevant evidence; and +- do not create new evidence links that were not supplied by a downstream skill + or defined in `source-links.json`. + +The router itself does not seed an answer with sample citations. If downstream +protein and pathway lookups return substantive records, attach each returned +record link to the corresponding protein-function or pathway claim instead of +listing bare identifiers only at the end. 
diff --git a/plugins/life-science-research/scripts/validate_source_presentation.py b/plugins/life-science-research/scripts/validate_source_presentation.py
new file mode 100644
index 000000000..2166eaa1f
--- /dev/null
+++ b/plugins/life-science-research/scripts/validate_source_presentation.py
@@ -0,0 +1,241 @@
+#!/usr/bin/env python3
+"""Validate source-link coverage and provenance contracts for this plugin."""
+
+from __future__ import annotations
+
+import ast
+import json
+from pathlib import Path
+from urllib.parse import urlsplit
+
+PLUGIN_ROOT = Path(__file__).resolve().parents[1]
+SKILLS_DIR = PLUGIN_ROOT / "skills"
+REGISTRY_PATH = PLUGIN_ROOT / "references" / "source-links.json"
+CONTRACT_PATH = PLUGIN_ROOT / "references" / "source-presentation.md"
+# NOTE(review): CONTRACT_MARKER is empty in this revision. Because "" is a
+# substring of every string, the marker check in _check_skill_text can never
+# fail. Confirm the intended literal and restore it.
+CONTRACT_MARKER = ""
+
+# Extra rules required in specific skills' SKILL.md files. They are matched
+# against whitespace-normalized text so line wrapping does not matter.
+SPECIAL_SKILL_PHRASES = {
+    "research-router-skill": (
+        "retain the downstream skill's `sources` entries",
+        "Do not seed the router with example citations",
+        "only performs a connectivity or schema check, returns no evidence, or fails",
+        "supplements rather than replaces applicable claim-adjacent links",
+        "the structured `sources` entries returned by those skills",
+    ),
+    "locus-to-gene-mapper-skill": (
+        "Distinguish evidence-contributing sources",
+        "queried sources that returned no mapping evidence",
+        "queried-but-empty sources in methods, provenance, or limitations",
+    ),
+}
+
+
+def _is_https_url(value: object) -> bool:
+    """Return True when ``value`` is a string URL with an https scheme and a host."""
+    if not isinstance(value, str):
+        return False
+    parts = urlsplit(value)
+    return parts.scheme == "https" and bool(parts.netloc)
+
+
+def _read_text(path: Path, errors: list[str]) -> str | None:
+    """Read ``path`` as UTF-8; on failure record an error and return None."""
+    try:
+        return path.read_text(encoding="utf-8")
+    except (OSError, UnicodeDecodeError) as exc:
+        errors.append(f"Could not read {path}: {exc}")
+        return None
+
+
+def _check_skill_text(skill_name: str, skill_text: str, errors: list[str]) -> None:
+    """Check one SKILL.md for the marker, conditional rules, and registry key."""
+    if CONTRACT_MARKER not in skill_text:
+        errors.append(f"{skill_name}: missing source presentation marker")
+    for required_phrase in (
+        "only for substantive external claims supported by the response",
+        "Do not force evidence links for connectivity or schema checks",
+    ):
+        if required_phrase not in skill_text:
+            errors.append(
+                f"{skill_name}: missing conditional rule: {required_phrase}"
+            )
+    expected_registry_line = (
+        f"Use the `{skill_name}` entry in `../../references/source-links.json`"
+    )
+    if expected_registry_line not in skill_text:
+        errors.append(f"{skill_name}: missing or incorrect registry key")
+
+
+def _check_registry_entry(skill_name: str, entry: object, errors: list[str]) -> None:
+    """Check the shape of one skill's entry in source-links.json."""
+    if not isinstance(entry, dict):
+        # Report malformed entries explicitly instead of skipping them silently.
+        errors.append(f"{skill_name}: registry entry must be an object")
+        return
+    source_name = entry.get("source_name")
+    if not isinstance(source_name, str) or not source_name.strip():
+        errors.append(f"{skill_name}: source_name must be a non-empty string")
+    homepage = entry.get("homepage_url")
+    if homepage is not None and not _is_https_url(homepage):
+        errors.append(f"{skill_name}: homepage_url must be HTTPS or null")
+    templates = entry.get("record_url_templates")
+    if not isinstance(templates, list):
+        errors.append(f"{skill_name}: record_url_templates must be a list")
+        return
+    for index, template_entry in enumerate(templates):
+        if not isinstance(template_entry, dict):
+            errors.append(f"{skill_name}: template {index} must be an object")
+            continue
+        identifier_type = template_entry.get("identifier_type")
+        template = template_entry.get("template")
+        if not isinstance(identifier_type, str) or not identifier_type.strip():
+            errors.append(f"{skill_name}: template {index} lacks identifier_type")
+        if not _is_https_url(template) or "{id}" not in str(template):
+            errors.append(
+                f"{skill_name}: template {index} must be HTTPS and contain {{id}}"
+            )
+        transform = template_entry.get("transform")
+        if transform is not None and (
+            not isinstance(transform, str) or not transform.strip()
+        ):
+            errors.append(
+                f"{skill_name}: template {index} transform must be a non-empty string"
+            )
+
+
+def _check_scripts(skill_name: str, scripts_dir: Path, errors: list[str]) -> None:
+    """Check a skill's scripts for provenance support, sanitizer parts, and syntax."""
+    if not scripts_dir.is_dir():
+        return
+    # Read each script once; unreadable files become errors instead of tracebacks.
+    script_texts: list[tuple[Path, str]] = []
+    for script_path in sorted(scripts_dir.glob("*.py")):
+        script_text = _read_text(script_path, errors)
+        if script_text is not None:
+            script_texts.append((script_path, script_text))
+    # Test modules are syntax-checked below but do not count as runtime code.
+    combined_runtime_text = "\n".join(
+        text for path, text in script_texts if not path.name.startswith("test_")
+    )
+    if (
+        '"sources"' not in combined_runtime_text
+        and "_attach_sources" not in combined_runtime_text
+    ):
+        errors.append(f"{skill_name}: script outputs lack provenance support")
+    if "def _sanitize_request_url" in combined_runtime_text:
+        for required_phrase in (
+            'parts.netloc.rsplit("@", 1)[-1]',
+            'urlencode(query), ""',
+            '"credential"',
+            '"sig"',
+            "SOURCE_NAME = ",
+            "_sources(SOURCE_NAME, str(response.url))",
+        ):
+            if required_phrase not in combined_runtime_text:
+                errors.append(
+                    f"{skill_name}: URL sanitizer missing: {required_phrase}"
+                )
+    for script_path, script_text in script_texts:
+        try:
+            ast.parse(script_text, filename=str(script_path))
+        except (SyntaxError, ValueError) as exc:
+            errors.append(
+                f"{skill_name}: invalid Python in {script_path.name}: {exc}"
+            )
+
+
+def validate() -> list[str]:
+    """Run every check and return error messages; an empty list means success."""
+    errors: list[str] = []
+    try:
+        registry = json.loads(REGISTRY_PATH.read_text(encoding="utf-8"))
+    except (OSError, ValueError) as exc:
+        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
+        return [f"Could not read source registry: {exc}"]
+
+    try:
+        contract_text = CONTRACT_PATH.read_text(encoding="utf-8")
+    except (OSError, UnicodeDecodeError) as exc:
+        errors.append(f"Could not read source presentation contract: {exc}")
+        contract_text = ""
+
+    contract_normalized = " ".join(contract_text.split())
+    for required_phrase in (
+        "every substantive externally sourced claim should remain traceable",
+        "Connectivity or schema check",
+        "Empty result or failed request",
+        "Router or planner",
+        "Local synthesis or derived analysis",
+        "queried but returned no relevant evidence",
+    ):
+        if required_phrase not in contract_normalized:
+            errors.append(f"source-presentation.md: missing rule: {required_phrase}")
+
+    try:
+        skill_dirs = sorted(path for path in SKILLS_DIR.iterdir() if path.is_dir())
+    except OSError as exc:
+        return errors + [f"Could not list skills directory: {exc}"]
+    skill_names = {path.name for path in skill_dirs}
+    # A registry whose top level is not an object has no usable `skills` map.
+    registry_skills = registry.get("skills") if isinstance(registry, dict) else None
+    if not isinstance(registry_skills, dict):
+        return errors + ["source-links.json must contain a `skills` object"]
+
+    registry_names = set(registry_skills)
+    for missing in sorted(skill_names - registry_names):
+        errors.append(f"Registry is missing skill: {missing}")
+    for unknown in sorted(registry_names - skill_names):
+        errors.append(f"Registry contains unknown skill: {unknown}")
+
+    skill_texts: dict[str, str] = {}
+    for skill_dir in skill_dirs:
+        skill_name = skill_dir.name
+        skill_text = _read_text(skill_dir / "SKILL.md", errors)
+        if skill_text is not None:
+            skill_texts[skill_name] = skill_text
+            _check_skill_text(skill_name, skill_text, errors)
+        # Entry and script checks run even when SKILL.md could not be read.
+        if skill_name in registry_skills:
+            _check_registry_entry(skill_name, registry_skills[skill_name], errors)
+        _check_scripts(skill_name, skill_dir / "scripts", errors)
+
+    for skill_name, required_phrases in SPECIAL_SKILL_PHRASES.items():
+        if skill_name not in skill_names:
+            errors.append(f"{skill_name}: skill directory is missing")
+            continue
+        skill_text = skill_texts.get(skill_name)
+        if skill_text is None:
+            continue  # the unreadable SKILL.md was already reported in the loop
+        normalized = " ".join(skill_text.split())
+        for required_phrase in required_phrases:
+            if required_phrase not in normalized:
+                errors.append(f"{skill_name}: missing rule: {required_phrase}")
+
+    return errors
+
+
+def main() -> int:
+    """Print validation errors or a success summary and return the exit code."""
+    errors = validate()
+    if errors:
+        for item in errors:
+            print(f"ERROR: {item}")
+        return 1
+    skill_count = len([path for path in SKILLS_DIR.iterdir() if path.is_dir()])
+    script_skill_count = len(
+        [path for path in SKILLS_DIR.iterdir() if (path / "scripts").is_dir()]
+    )
+    print(
+        f"Source presentation validation passed: {skill_count} skills, "
+        f"{script_skill_count} script-backed skills."
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/plugins/life-science-research/skills/alphafold-skill/SKILL.md b/plugins/life-science-research/skills/alphafold-skill/SKILL.md
index 61d66801b..fcde6b04f 100644
--- a/plugins/life-science-research/skills/alphafold-skill/SKILL.md
+++ b/plugins/life-science-research/skills/alphafold-skill/SKILL.md
@@ -3,6 +3,15 @@ name: alphafold-skill
 description: Submit compact AlphaFold Protein Structure Database API requests for prediction, UniProt summary, sequence summary, and annotation lookups. Use when a user wants AlphaFold metadata or concise structure summaries
 ---
 
+## Source presentation
+
+- Follow `../../references/source-presentation.md` for every final user-facing answer.
+- Use the `alphafold-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates.
+- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response.
+- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence.
+- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links.
+- Preserve explicitly requested raw or machine-readable output without injecting Markdown links.
+
 ## Operating rules
 - Use `scripts/rest_request.py` for all AlphaFold API calls.
 - Use `base_url=https://alphafold.ebi.ac.uk/api`.
@@ -38,4 +47,4 @@ echo '{"base_url":"https://alphafold.ebi.ac.uk/api","path":"prediction/Q5VSL9"}'
 ```
 
 ## References
-- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`.
+- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. diff --git a/plugins/life-science-research/skills/alphafold-skill/scripts/rest_request.py b/plugins/life-science-research/skills/alphafold-skill/scripts/rest_request.py index 4de3cf105..eb94243dd 100644 --- a/plugins/life-science-research/skills/alphafold-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/alphafold-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "AlphaFold Protein Structure Database" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return 
urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/bgee-skill/SKILL.md b/plugins/life-science-research/skills/bgee-skill/SKILL.md index 5882216d7..28fcdeae6 100644 --- a/plugins/life-science-research/skills/bgee-skill/SKILL.md +++ b/plugins/life-science-research/skills/bgee-skill/SKILL.md @@ -3,6 +3,15 @@ name: bgee-skill description: Submit compact Bgee SPARQL requests for healthy wild-type expression metadata and ontology-aware lookup patterns. Use when a user wants concise Bgee summaries; save raw results only on request. --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `bgee-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. 
+- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/sparql_request.py` for all Bgee SPARQL work. - Start with small `SELECT` or `ASK` queries and add `LIMIT` early. @@ -33,4 +42,4 @@ echo '{"query":"ASK {}"}' | python scripts/sparql_request.py ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/sparql_request.py`. +- Keep runtime imports limited to this file, `scripts/sparql_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. diff --git a/plugins/life-science-research/skills/bgee-skill/scripts/sparql_request.py b/plugins/life-science-research/skills/bgee-skill/scripts/sparql_request.py index c1ad18398..99e75cf3c 100644 --- a/plugins/life-science-research/skills/bgee-skill/scripts/sparql_request.py +++ b/plugins/life-science-research/skills/bgee-skill/scripts/sparql_request.py @@ -3,6 +3,7 @@ from __future__ import annotations +from datetime import datetime, timezone import json import sys from pathlib import Path @@ -179,6 +180,21 @@ def execute(payload: Any) -> dict[str, Any]: } +def _attach_sources( + output: dict[str, Any], source_name: str, source_url: str +) -> dict[str, Any]: + """Add stable user-facing provenance without changing error payloads.""" + if output.get("ok") and "sources" not in output: + output["sources"] = [ + { + "name": source_name, + "url": source_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + return output + + def main() -> int: try: payload = json.load(sys.stdin) @@ -192,7 +208,7 
@@ def main() -> int: code = 2 else: code = 0 if output.get("ok") else 1 - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "Bgee", "https://www.bgee.org/sparql/"))) return code diff --git a/plugins/life-science-research/skills/bindingdb-skill/SKILL.md b/plugins/life-science-research/skills/bindingdb-skill/SKILL.md index 8f27873c1..5e51b1171 100644 --- a/plugins/life-science-research/skills/bindingdb-skill/SKILL.md +++ b/plugins/life-science-research/skills/bindingdb-skill/SKILL.md @@ -3,6 +3,15 @@ name: bindingdb-skill description: Submit compact BindingDB REST API requests for ligand-target binding lookups by PDB, UniProt, or similarity search. Use when a user wants concise BindingDB summaries; save raw payloads only on request. --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `bindingdb-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all BindingDB API calls. - Use `base_url=https://bindingdb.org`. 
@@ -35,4 +44,4 @@ echo '{"base_url":"https://bindingdb.org","path":"rest/getLigandsByPDBs","params ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. diff --git a/plugins/life-science-research/skills/bindingdb-skill/scripts/rest_request.py b/plugins/life-science-research/skills/bindingdb-skill/scripts/rest_request.py index 4de3cf105..a6a2f56ff 100644 --- a/plugins/life-science-research/skills/bindingdb-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/bindingdb-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "BindingDB" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or 
normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/biobankjapan-phewas-skill/SKILL.md b/plugins/life-science-research/skills/biobankjapan-phewas-skill/SKILL.md index d0bee3a4a..4bb989a3a 100644 --- a/plugins/life-science-research/skills/biobankjapan-phewas-skill/SKILL.md +++ b/plugins/life-science-research/skills/biobankjapan-phewas-skill/SKILL.md @@ -3,6 +3,15 @@ name: biobankjapan-phewas-skill description: Fetch compact BioBank Japan PheWAS summaries for single variants by accepting rsID, GRCh38, or GRCh37 input and resolving to the required GRCh37 query. Use when a user wants concise BBJ association results for one variant --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. 
+- Use the `biobankjapan-phewas-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/biobankjapan_phewas.py` for all BioBank Japan PheWAS lookups. - Accept exactly one of `rsid`, `grch37`, `grch38`, or `variant`; resolve to the canonical GRCh37 `chr:pos-ref-alt` query before calling BioBank Japan. @@ -38,4 +47,4 @@ echo '{"grch37":"10:114758349-C-T","max_results":10}' | python scripts/biobankja ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/biobankjapan_phewas.py`. +- Keep runtime imports limited to this file, `scripts/biobankjapan_phewas.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/biobankjapan-phewas-skill/scripts/biobankjapan_phewas.py b/plugins/life-science-research/skills/biobankjapan-phewas-skill/scripts/biobankjapan_phewas.py index 18e1f6519..a8f3f1052 100644 --- a/plugins/life-science-research/skills/biobankjapan-phewas-skill/scripts/biobankjapan_phewas.py +++ b/plugins/life-science-research/skills/biobankjapan-phewas-skill/scripts/biobankjapan_phewas.py @@ -12,6 +12,7 @@ from __future__ import annotations +from datetime import datetime, timezone import json import re import sys @@ -112,6 +113,21 @@ def write_raw_json(path: Path, data: Any) -> None: path.write_text(json.dumps(data), encoding="utf-8") +def _attach_sources( + output: dict[str, Any], source_name: str, source_url: str +) -> dict[str, Any]: + """Add stable user-facing provenance without changing error payloads.""" + if output.get("ok") and "sources" not in output: + output["sources"] = [ + { + "name": source_name, + "url": source_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + return output + + def main() -> int: warnings: list[str] = [] @@ -182,7 +198,7 @@ def main() -> int: "raw_output_path": None, "warnings": warnings, } - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "BioBank Japan PheWeb", "https://pheweb.jp/"))) return 0 associations = extract_associations(data) @@ -218,7 +234,7 @@ def main() -> int: "raw_output_path": saved_raw_output_path, "warnings": warnings, } - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "BioBank Japan PheWeb", "https://pheweb.jp/"))) return 0 diff --git a/plugins/life-science-research/skills/biorxiv-skill/SKILL.md b/plugins/life-science-research/skills/biorxiv-skill/SKILL.md index d8e657a5f..69fffc672 100644 --- a/plugins/life-science-research/skills/biorxiv-skill/SKILL.md +++ b/plugins/life-science-research/skills/biorxiv-skill/SKILL.md @@ -3,6 +3,15 @@ name: biorxiv-skill 
description: Submit compact bioRxiv and medRxiv API requests for details, publication-linkage, and DOI lookups. Use when a user wants concise preprint metadata summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `biorxiv-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all bioRxiv and medRxiv API calls. - Use `base_url=https://api.biorxiv.org`. @@ -37,4 +46,4 @@ echo '{"base_url":"https://api.biorxiv.org","path":"details/biorxiv/2025-03-21/2 ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/biorxiv-skill/scripts/rest_request.py b/plugins/life-science-research/skills/biorxiv-skill/scripts/rest_request.py index 4de3cf105..045821a21 100644 --- a/plugins/life-science-research/skills/biorxiv-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/biorxiv-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "bioRxiv and medRxiv" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = 
_sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/biostudies-arrayexpress-skill/SKILL.md b/plugins/life-science-research/skills/biostudies-arrayexpress-skill/SKILL.md index 2e9d0c946..cf6236316 100644 --- a/plugins/life-science-research/skills/biostudies-arrayexpress-skill/SKILL.md +++ b/plugins/life-science-research/skills/biostudies-arrayexpress-skill/SKILL.md @@ -3,6 +3,15 @@ name: biostudies-arrayexpress-skill description: Submit compact BioStudies and ArrayExpress API requests for free-text search and accession-based study retrieval. Use when a user wants concise BioStudies summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `biostudies-arrayexpress-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. 
+- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all BioStudies and ArrayExpress calls. - Use `base_url=https://www.ebi.ac.uk/biostudies/api/v1`. @@ -35,4 +44,4 @@ echo '{"base_url":"https://www.ebi.ac.uk/biostudies/api/v1","path":"search","par ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. diff --git a/plugins/life-science-research/skills/biostudies-arrayexpress-skill/scripts/rest_request.py b/plugins/life-science-research/skills/biostudies-arrayexpress-skill/scripts/rest_request.py index 4de3cf105..73db6fde3 100644 --- a/plugins/life-science-research/skills/biostudies-arrayexpress-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/biostudies-arrayexpress-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "BioStudies and ArrayExpress" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + 
"session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/cbioportal-skill/SKILL.md 
b/plugins/life-science-research/skills/cbioportal-skill/SKILL.md index ed882d89b..4b254917c 100644 --- a/plugins/life-science-research/skills/cbioportal-skill/SKILL.md +++ b/plugins/life-science-research/skills/cbioportal-skill/SKILL.md @@ -3,6 +3,15 @@ name: cbioportal-skill description: Submit compact cBioPortal API requests for studies, molecular profiles, mutations, clinical data, and samples. Use when a user wants concise cBioPortal summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `cbioportal-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all cBioPortal API calls. - Use `base_url=https://www.cbioportal.org/api`. @@ -36,4 +45,4 @@ echo '{"base_url":"https://www.cbioportal.org/api","path":"studies","params":{"k ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/cbioportal-skill/scripts/rest_request.py b/plugins/life-science-research/skills/cbioportal-skill/scripts/rest_request.py index 4de3cf105..b55adb786 100644 --- a/plugins/life-science-research/skills/cbioportal-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/cbioportal-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "cBioPortal" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = 
_sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/cellxgene-skill/SKILL.md b/plugins/life-science-research/skills/cellxgene-skill/SKILL.md index 688593cb3..f3d3f3e12 100644 --- a/plugins/life-science-research/skills/cellxgene-skill/SKILL.md +++ b/plugins/life-science-research/skills/cellxgene-skill/SKILL.md @@ -3,6 +3,15 @@ name: cellxgene-skill description: Submit compact CELLxGENE Discover API requests for public collection and dataset metadata. Use when a user wants concise single-cell collection summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `cellxgene-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. 
+- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all CELLxGENE Discover calls. - Use `base_url=https://api.cellxgene.cziscience.com/curation/v1`. @@ -34,4 +43,4 @@ echo '{"base_url":"https://api.cellxgene.cziscience.com/curation/v1","path":"col ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. diff --git a/plugins/life-science-research/skills/cellxgene-skill/scripts/rest_request.py b/plugins/life-science-research/skills/cellxgene-skill/scripts/rest_request.py index 4de3cf105..1bdcc78b6 100644 --- a/plugins/life-science-research/skills/cellxgene-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/cellxgene-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "CZ CELLxGENE Discover" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 
+89,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/chebi-skill/SKILL.md b/plugins/life-science-research/skills/chebi-skill/SKILL.md index c2b0300cd..c7163726a 100644 --- a/plugins/life-science-research/skills/chebi-skill/SKILL.md +++ b/plugins/life-science-research/skills/chebi-skill/SKILL.md @@ -3,6 +3,15 @@ name: chebi-skill description: Submit compact 
ChEBI 2.0 API requests for chemical search, compound lookup, ontology traversal, and structure metadata. Use when a user wants concise ChEBI summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `chebi-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all ChEBI calls. - Use `base_url=https://www.ebi.ac.uk`. @@ -35,4 +44,4 @@ echo '{"base_url":"https://www.ebi.ac.uk","path":"chebi/backend/api/public/es_se ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/chebi-skill/scripts/rest_request.py b/plugins/life-science-research/skills/chebi-skill/scripts/rest_request.py index 4de3cf105..f7be5dde1 100644 --- a/plugins/life-science-research/skills/chebi-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/chebi-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "ChEBI" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + return [ + 
{ + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/chembl-skill/SKILL.md b/plugins/life-science-research/skills/chembl-skill/SKILL.md index d5518e264..8562ae0da 100644 --- a/plugins/life-science-research/skills/chembl-skill/SKILL.md +++ b/plugins/life-science-research/skills/chembl-skill/SKILL.md @@ -3,6 +3,15 @@ name: chembl-skill description: Submit compact ChEMBL API requests for activity, molecule, target, mechanism, and text-search endpoints. Use when a user wants concise ChEMBL summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `chembl-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. 
+- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all ChEMBL API calls. - Use `base_url=https://www.ebi.ac.uk/chembl/api/data`. @@ -37,4 +46,4 @@ echo '{"base_url":"https://www.ebi.ac.uk/chembl/api/data","path":"activity.json" ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. diff --git a/plugins/life-science-research/skills/chembl-skill/scripts/rest_request.py b/plugins/life-science-research/skills/chembl-skill/scripts/rest_request.py index 4de3cf105..70192f90d 100644 --- a/plugins/life-science-research/skills/chembl-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/chembl-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "ChEMBL" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: 
return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/civic-skill/SKILL.md b/plugins/life-science-research/skills/civic-skill/SKILL.md index 91801c24d..53d8b3b6b 100644 --- a/plugins/life-science-research/skills/civic-skill/SKILL.md +++ b/plugins/life-science-research/skills/civic-skill/SKILL.md @@ -3,6 +3,15 @@ name: civic-skill description: Submit compact CIViC GraphQL requests for cancer variant 
interpretation schema inspection and targeted evidence retrieval. Use when a user wants concise CIViC summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `civic-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/civic_graphql.py` for all CIViC GraphQL work. - Keep selection sets narrow and start with schema or targeted entity queries. @@ -32,4 +41,4 @@ echo '{"query":"query { __typename }"}' | python scripts/civic_graphql.py ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/civic_graphql.py`. +- Keep runtime imports limited to this file, `scripts/civic_graphql.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/civic-skill/scripts/civic_graphql.py b/plugins/life-science-research/skills/civic-skill/scripts/civic_graphql.py index 5f9dea76e..1015602cc 100644 --- a/plugins/life-science-research/skills/civic-skill/scripts/civic_graphql.py +++ b/plugins/life-science-research/skills/civic-skill/scripts/civic_graphql.py @@ -3,6 +3,7 @@ from __future__ import annotations +from datetime import datetime, timezone import json import sys from pathlib import Path @@ -133,6 +134,21 @@ def execute(payload: Any) -> dict[str, Any]: } +def _attach_sources( + output: dict[str, Any], source_name: str, source_url: str +) -> dict[str, Any]: + """Add stable user-facing provenance without changing error payloads.""" + if output.get("ok") and "sources" not in output: + output["sources"] = [ + { + "name": source_name, + "url": source_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + return output + + def main() -> int: try: payload = json.load(sys.stdin) @@ -146,7 +162,7 @@ def main() -> int: code = 2 else: code = 0 if output.get("ok") else 1 - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "CIViC", "https://civicdb.org/api/graphql"))) return code diff --git a/plugins/life-science-research/skills/clinicaltrials-skill/SKILL.md b/plugins/life-science-research/skills/clinicaltrials-skill/SKILL.md index 7f1911d47..9d0791213 100644 --- a/plugins/life-science-research/skills/clinicaltrials-skill/SKILL.md +++ b/plugins/life-science-research/skills/clinicaltrials-skill/SKILL.md @@ -3,6 +3,15 @@ name: clinicaltrials-skill description: Submit compact ClinicalTrials.gov API v2 requests for study search, metadata, enums, search areas, and field statistics. Use when a user wants concise ClinicalTrials.gov summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. 
+- Use the `clinicaltrials-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/clinicaltrials_client.py` for all ClinicalTrials.gov v2 calls. - Study searches are better with `max_items=10` and `max_pages=1`; only increase pages when the user explicitly wants more than the first page. @@ -37,4 +46,4 @@ echo '{"action":"studies","params":{"query.cond":"prostate cancer","filter.overa ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/clinicaltrials_client.py`. +- Keep runtime imports limited to this file, `scripts/clinicaltrials_client.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/clinicaltrials-skill/scripts/clinicaltrials_client.py b/plugins/life-science-research/skills/clinicaltrials-skill/scripts/clinicaltrials_client.py index 5e399f966..2cd78f267 100644 --- a/plugins/life-science-research/skills/clinicaltrials-skill/scripts/clinicaltrials_client.py +++ b/plugins/life-science-research/skills/clinicaltrials-skill/scripts/clinicaltrials_client.py @@ -3,6 +3,7 @@ from __future__ import annotations +from datetime import datetime, timezone import json import sys from pathlib import Path @@ -55,6 +56,28 @@ def _compact(value: Any, max_items: int, max_depth: int) -> Any: return value +def _compact_study(study: Any) -> Any: + """Keep the stable trial identifier and core display fields linkable.""" + if not isinstance(study, dict): + return study + protocol = study.get("protocolSection") + protocol = protocol if isinstance(protocol, dict) else {} + identification = protocol.get("identificationModule") + identification = identification if isinstance(identification, dict) else {} + status = protocol.get("statusModule") + status = status if isinstance(status, dict) else {} + design = protocol.get("designModule") + design = design if isinstance(design, dict) else {} + return { + "nctId": identification.get("nctId"), + "briefTitle": identification.get("briefTitle"), + "officialTitle": identification.get("officialTitle"), + "overallStatus": status.get("overallStatus"), + "studyType": design.get("studyType"), + "hasResults": study.get("hasResults"), + } + + def _require_int(name: str, value: Any, default: int) -> int: if value is None: return default @@ -121,6 +144,7 @@ def execute(payload: Any) -> dict[str, Any]: pages: list[dict[str, Any]] = [] total_count: int | None = None pages_fetched = 0 + request_url: str | None = None for _ in range(config["max_pages"]): params = dict(config["params"]) if next_page_token: @@ -129,6 +153,8 @@ def execute(payload: Any) -> dict[str, Any]: BASE_URL + config["path"], 
params=params, timeout=config["timeout_sec"] ) response.raise_for_status() + if request_url is None: + request_url = str(response.url) page = response.json() pages.append(page) pages_fetched += 1 @@ -158,9 +184,10 @@ def execute(payload: Any) -> dict[str, Any]: "record_count_returned": min(len(studies), config["max_items"]), "record_count_available": available, "truncated": len(studies) > config["max_items"] or next_page_token is not None, - "records": _compact( - studies[: config["max_items"]], config["max_items"], config["max_depth"] - ), + "records": [ + _compact_study(study) for study in studies[: config["max_items"]] + ], + "request_url": request_url, "raw_output_path": raw_output_path, "warnings": [], } @@ -183,6 +210,7 @@ def execute(payload: Any) -> dict[str, Any]: "ok": True, "source": "clinicaltrials-v2", "action": config["action"], + "request_url": str(response.url), "raw_output_path": raw_output_path, "warnings": [], } @@ -209,6 +237,22 @@ def execute(payload: Any) -> dict[str, Any]: session.close() +def _attach_sources( + output: dict[str, Any], source_name: str, source_url: str +) -> dict[str, Any]: + """Add stable user-facing provenance without changing error payloads.""" + if output.get("ok") and "sources" not in output: + output["sources"] = [ + { + "name": source_name, + "url": source_url, + "request_url": source_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + return output + + def main() -> int: try: payload = json.load(sys.stdin) @@ -222,7 +266,8 @@ def main() -> int: code = 2 else: code = 0 if output.get("ok") else 1 - sys.stdout.write(json.dumps(output)) + source_url = str(output.get("request_url") or BASE_URL) + sys.stdout.write(json.dumps(_attach_sources(output, "ClinicalTrials.gov", source_url))) return code diff --git a/plugins/life-science-research/skills/clinvar-variation-skill/SKILL.md b/plugins/life-science-research/skills/clinvar-variation-skill/SKILL.md index e2bc21588..5dfd848c6 100644 --- 
a/plugins/life-science-research/skills/clinvar-variation-skill/SKILL.md +++ b/plugins/life-science-research/skills/clinvar-variation-skill/SKILL.md @@ -3,6 +3,15 @@ name: clinvar-variation-skill description: Submit compact ClinVar Clinical Tables and NCBI Variation requests for search, VCV, RCV, SCV, and RefSNP lookups. Use when a user wants variant-level summaries or identifier mapping --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `clinvar-variation-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/clinvar_variation.py` for all ClinVar and NCBI Variation work. - The script accepts `max_items`; for `action=search`, start around `max_items=10`. @@ -40,4 +49,4 @@ echo '{"action":"search","terms":"VCV000013080","max_items":10}' | python script ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/clinvar_variation.py`. +- Keep runtime imports limited to this file, `scripts/clinvar_variation.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/clinvar-variation-skill/scripts/clinvar_variation.py b/plugins/life-science-research/skills/clinvar-variation-skill/scripts/clinvar_variation.py index ce5d23bec..d24ccbe3d 100644 --- a/plugins/life-science-research/skills/clinvar-variation-skill/scripts/clinvar_variation.py +++ b/plugins/life-science-research/skills/clinvar-variation-skill/scripts/clinvar_variation.py @@ -3,6 +3,7 @@ from __future__ import annotations +from datetime import datetime, timezone import json import sys from pathlib import Path @@ -178,6 +179,21 @@ def execute(payload: Any) -> dict[str, Any]: return error("network_error", f"Request failed: {exc}") +def _attach_sources( + output: dict[str, Any], source_name: str, source_url: str +) -> dict[str, Any]: + """Add stable user-facing provenance without changing error payloads.""" + if output.get("ok") and "sources" not in output: + output["sources"] = [ + { + "name": source_name, + "url": source_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + return output + + def main() -> int: try: payload = json.load(sys.stdin) @@ -185,7 +201,7 @@ def main() -> int: sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}"))) return 2 output = execute(payload) - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "ClinVar and NCBI Variation", "https://www.ncbi.nlm.nih.gov/clinvar/"))) return 0 if output.get("ok") else 1 diff --git a/plugins/life-science-research/skills/efo-ontology-skill/SKILL.md b/plugins/life-science-research/skills/efo-ontology-skill/SKILL.md index bb0a15112..b6483e990 100644 --- a/plugins/life-science-research/skills/efo-ontology-skill/SKILL.md +++ b/plugins/life-science-research/skills/efo-ontology-skill/SKILL.md @@ -3,6 +3,15 @@ name: efo-ontology-skill description: Submit compact EFO OLS4 requests for search, term lookup, children, and descendants. 
Use when a user wants concise EFO resolution or ontology-expansion summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `efo-ontology-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all OLS4 and EFO API calls. - Use `base_url=https://www.ebi.ac.uk/ols4/api`. @@ -36,4 +45,4 @@ echo '{"base_url":"https://www.ebi.ac.uk/ols4/api","path":"search","params":{"q" ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/efo-ontology-skill/scripts/rest_request.py b/plugins/life-science-research/skills/efo-ontology-skill/scripts/rest_request.py index 4de3cf105..65520b5d6 100644 --- a/plugins/life-science-research/skills/efo-ontology-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/efo-ontology-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "Experimental Factor Ontology via OLS" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + 
sanitized_url = _sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/encode-skill/SKILL.md b/plugins/life-science-research/skills/encode-skill/SKILL.md index b085fcd8f..9fec53f0a 100644 --- a/plugins/life-science-research/skills/encode-skill/SKILL.md +++ b/plugins/life-science-research/skills/encode-skill/SKILL.md @@ -3,6 +3,15 @@ name: encode-skill description: Submit compact ENCODE REST API requests for object lookups, portal-style search, and metadata retrieval. Use when a user wants concise ENCODE summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `encode-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. 
+- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all ENCODE API calls. - Use `base_url=https://www.encodeproject.org`. @@ -36,4 +45,4 @@ echo '{"base_url":"https://www.encodeproject.org","path":"search/","params":{"ty ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. diff --git a/plugins/life-science-research/skills/encode-skill/scripts/rest_request.py b/plugins/life-science-research/skills/encode-skill/scripts/rest_request.py index 4de3cf105..aece3dd3c 100644 --- a/plugins/life-science-research/skills/encode-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/encode-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "ENCODE" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return 
host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/ensembl-skill/SKILL.md b/plugins/life-science-research/skills/ensembl-skill/SKILL.md index 653dc4373..c5d63f32b 100644 --- a/plugins/life-science-research/skills/ensembl-skill/SKILL.md +++ b/plugins/life-science-research/skills/ensembl-skill/SKILL.md @@ -3,6 +3,15 @@ name: ensembl-skill description: Submit compact Ensembl REST API requests for lookup, overlap, 
cross-reference, and variation endpoints. Use when a user wants concise Ensembl summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `ensembl-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all Ensembl API calls. - Use `base_url=https://rest.ensembl.org`. @@ -36,4 +45,4 @@ echo '{"base_url":"https://rest.ensembl.org","path":"lookup/id/ENSG00000141510", ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/ensembl-skill/scripts/rest_request.py b/plugins/life-science-research/skills/ensembl-skill/scripts/rest_request.py index 4de3cf105..b00e39cca 100644 --- a/plugins/life-science-research/skills/ensembl-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/ensembl-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "Ensembl" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + 
return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/epigraphdb-skill/SKILL.md b/plugins/life-science-research/skills/epigraphdb-skill/SKILL.md index ca58a9881..e38d1348a 100644 --- a/plugins/life-science-research/skills/epigraphdb-skill/SKILL.md +++ b/plugins/life-science-research/skills/epigraphdb-skill/SKILL.md @@ -3,6 +3,15 @@ name: epigraphdb-skill description: Submit compact EpiGraphDB API requests for ontology, literature, MR, gene-drug, and support-path evidence. Use when a user wants concise EpiGraphDB summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `epigraphdb-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. 
+- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all EpiGraphDB API calls. - Use `base_url=https://api.epigraphdb.org`. @@ -36,4 +45,4 @@ echo '{"base_url":"https://api.epigraphdb.org","path":"ontology/gwas-efo","param ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. diff --git a/plugins/life-science-research/skills/epigraphdb-skill/scripts/rest_request.py b/plugins/life-science-research/skills/epigraphdb-skill/scripts/rest_request.py index 4de3cf105..2288856b4 100644 --- a/plugins/life-science-research/skills/epigraphdb-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/epigraphdb-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "EpiGraphDB" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: 
str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/eqtl-catalogue-skill/SKILL.md b/plugins/life-science-research/skills/eqtl-catalogue-skill/SKILL.md index f0a151c6e..8ec349102 100644 --- a/plugins/life-science-research/skills/eqtl-catalogue-skill/SKILL.md +++ b/plugins/life-science-research/skills/eqtl-catalogue-skill/SKILL.md @@ -3,6 +3,15 @@ name: eqtl-catalogue-skill description: Submit 
compact eQTL Catalogue API requests for association retrieval and documented metadata endpoints. Use when a user wants concise public eQTL Catalogue summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `eqtl-catalogue-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all eQTL Catalogue calls. - Use `base_url=https://www.ebi.ac.uk/eqtl/api`. @@ -37,4 +46,4 @@ echo '{"base_url":"https://www.ebi.ac.uk/eqtl/api","path":"v3/studies","max_item ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/eqtl-catalogue-skill/scripts/rest_request.py b/plugins/life-science-research/skills/eqtl-catalogue-skill/scripts/rest_request.py index 7a8c1eafc..eda5a48fb 100644 --- a/plugins/life-science-research/skills/eqtl-catalogue-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/eqtl-catalogue-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -30,6 +32,31 @@ "molecular_trait_id": "", "qtl_group": "", } +SOURCE_NAME = "eQTL Catalogue" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: @@ -75,6 +102,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": 
sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _drop_none_values(value: dict[str, Any]) -> dict[str, Any]: return {key: item for key, item in value.items() if item is not None} @@ -297,6 +350,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -333,6 +389,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/eva-skill/SKILL.md b/plugins/life-science-research/skills/eva-skill/SKILL.md index 76e357fe7..c71cc28f3 100644 --- a/plugins/life-science-research/skills/eva-skill/SKILL.md +++ b/plugins/life-science-research/skills/eva-skill/SKILL.md @@ -3,6 +3,15 @@ name: eva-skill description: Submit compact EVA REST requests for species metadata and archived variant lookups. Use when a user wants concise European Variation Archive summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `eva-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. 
+- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all EVA calls. - Use `base_url=https://www.ebi.ac.uk/eva/webservices/rest/v1`. @@ -34,4 +43,4 @@ echo '{"base_url":"https://www.ebi.ac.uk/eva/webservices/rest/v1","path":"meta/s ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. diff --git a/plugins/life-science-research/skills/eva-skill/scripts/rest_request.py b/plugins/life-science-research/skills/eva-skill/scripts/rest_request.py index 4de3cf105..0111968fb 100644 --- a/plugins/life-science-research/skills/eva-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/eva-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "European Variation Archive" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) 
-> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/finngen-phewas-skill/SKILL.md b/plugins/life-science-research/skills/finngen-phewas-skill/SKILL.md index 14e02252d..5fd137c0e 100644 --- a/plugins/life-science-research/skills/finngen-phewas-skill/SKILL.md +++ b/plugins/life-science-research/skills/finngen-phewas-skill/SKILL.md @@ -3,6 +3,15 @@ name: finngen-phewas-skill description: Fetch compact 
FinnGen PheWAS summaries for single variants by accepting rsID, GRCh37, or GRCh38 input and resolving to the required GRCh38 query. Use when a user wants concise FinnGen association results for one variant --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `finngen-phewas-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/finngen_phewas.py` for all FinnGen PheWAS lookups. - Accept exactly one of `rsid`, `grch37`, `grch38`, or `variant`; resolve to the canonical GRCh38 `chr:pos-ref-alt` query before calling FinnGen. @@ -38,4 +47,4 @@ echo '{"grch38":"10:112998590-C-T","max_results":10}' | python scripts/finngen_p ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/finngen_phewas.py`. +- Keep runtime imports limited to this file, `scripts/finngen_phewas.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/finngen-phewas-skill/scripts/finngen_phewas.py b/plugins/life-science-research/skills/finngen-phewas-skill/scripts/finngen_phewas.py index 1cfcee457..87401b291 100644 --- a/plugins/life-science-research/skills/finngen-phewas-skill/scripts/finngen_phewas.py +++ b/plugins/life-science-research/skills/finngen-phewas-skill/scripts/finngen_phewas.py @@ -12,6 +12,7 @@ from __future__ import annotations +from datetime import datetime, timezone import json import re import sys @@ -112,6 +113,21 @@ def write_raw_json(path: Path, data: Any) -> None: path.write_text(json.dumps(data), encoding="utf-8") +def _attach_sources( + output: dict[str, Any], source_name: str, source_url: str +) -> dict[str, Any]: + """Add stable user-facing provenance without changing error payloads.""" + if output.get("ok") and "sources" not in output: + output["sources"] = [ + { + "name": source_name, + "url": source_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + return output + + def main() -> int: warnings: list[str] = [] @@ -183,7 +199,7 @@ def main() -> int: "raw_output_path": None, "warnings": warnings, } - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "FinnGen PheWeb", "https://r12.finngen.fi/"))) return 0 associations = extract_associations(data) @@ -215,7 +231,7 @@ def main() -> int: "raw_output_path": saved_raw_output_path, "warnings": warnings, } - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "FinnGen PheWeb", "https://r12.finngen.fi/"))) return 0 diff --git a/plugins/life-science-research/skills/genebass-gene-burden-skill/SKILL.md b/plugins/life-science-research/skills/genebass-gene-burden-skill/SKILL.md index a997c7a31..204d3042a 100644 --- a/plugins/life-science-research/skills/genebass-gene-burden-skill/SKILL.md +++ b/plugins/life-science-research/skills/genebass-gene-burden-skill/SKILL.md @@ -3,6 +3,15 @@ name: 
genebass-gene-burden-skill description: Submit compact Genebass gene burden requests for one Ensembl gene ID and one burden set. Use when a user wants concise Genebass PheWAS summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `genebass-gene-burden-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/genebass_gene_burden.py` for all Genebass calls. - This skill accepts one Ensembl gene ID per invocation. @@ -33,4 +42,4 @@ echo '{"ensembl_gene_id":"ENSG00000173531","burden_set":"pLoF","max_results":25} ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/genebass_gene_burden.py`. +- Keep runtime imports limited to this file, `scripts/genebass_gene_burden.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/genebass-gene-burden-skill/scripts/genebass_gene_burden.py b/plugins/life-science-research/skills/genebass-gene-burden-skill/scripts/genebass_gene_burden.py index c4c74959a..c8de28a03 100644 --- a/plugins/life-science-research/skills/genebass-gene-burden-skill/scripts/genebass_gene_burden.py +++ b/plugins/life-science-research/skills/genebass-gene-burden-skill/scripts/genebass_gene_burden.py @@ -11,6 +11,7 @@ from __future__ import annotations +from datetime import datetime, timezone import json import re import sys @@ -191,6 +192,21 @@ def transform_rows( return out +def _attach_sources( + output: dict[str, Any], source_name: str, source_url: str +) -> dict[str, Any]: + """Add stable user-facing provenance without changing error payloads.""" + if output.get("ok") and "sources" not in output: + output["sources"] = [ + { + "name": source_name, + "url": source_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + return output + + def main() -> int: warnings: list[str] = [] @@ -239,7 +255,7 @@ def main() -> int: "associations": [], "warnings": warnings, } - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "Genebass", "https://main.genebass.org/api"))) return 0 description_map: dict[str, str] = {} @@ -279,7 +295,7 @@ def main() -> int: "associations": associations, "warnings": warnings, } - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "Genebass", "https://main.genebass.org/api"))) return 0 diff --git a/plugins/life-science-research/skills/gnomad-graphql-skill/SKILL.md b/plugins/life-science-research/skills/gnomad-graphql-skill/SKILL.md index f38ed6b47..fd85d5ee2 100644 --- a/plugins/life-science-research/skills/gnomad-graphql-skill/SKILL.md +++ b/plugins/life-science-research/skills/gnomad-graphql-skill/SKILL.md @@ -3,6 +3,15 @@ name: gnomad-graphql-skill description: Submit compact gnomAD GraphQL requests for 
frequency, gene constraint, and variant context queries. Use when a user wants concise gnomAD summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `gnomad-graphql-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/gnomad_graphql.py` for all gnomAD GraphQL work. - For nested GraphQL results, start with `max_items=3` to `5`. @@ -35,4 +44,4 @@ echo '{"query":"query { meta { clinvar_release_date } }"}' | python scripts/gnom ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/gnomad_graphql.py`. +- Keep runtime imports limited to this file, `scripts/gnomad_graphql.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/gnomad-graphql-skill/scripts/gnomad_graphql.py b/plugins/life-science-research/skills/gnomad-graphql-skill/scripts/gnomad_graphql.py index 7057e25aa..b582b9b27 100644 --- a/plugins/life-science-research/skills/gnomad-graphql-skill/scripts/gnomad_graphql.py +++ b/plugins/life-science-research/skills/gnomad-graphql-skill/scripts/gnomad_graphql.py @@ -3,6 +3,7 @@ from __future__ import annotations +from datetime import datetime, timezone import json import sys from pathlib import Path @@ -133,6 +134,21 @@ def execute(payload: Any) -> dict[str, Any]: } +def _attach_sources( + output: dict[str, Any], source_name: str, source_url: str +) -> dict[str, Any]: + """Add stable user-facing provenance without changing error payloads.""" + if output.get("ok") and "sources" not in output: + output["sources"] = [ + { + "name": source_name, + "url": source_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + return output + + def main() -> int: try: payload = json.load(sys.stdin) @@ -146,7 +162,7 @@ def main() -> int: code = 2 else: code = 0 if output.get("ok") else 1 - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "gnomAD", "https://gnomad.broadinstitute.org/api"))) return code diff --git a/plugins/life-science-research/skills/gtex-eqtl-skill/SKILL.md b/plugins/life-science-research/skills/gtex-eqtl-skill/SKILL.md index 2cbe3c3a3..1dbbd9918 100644 --- a/plugins/life-science-research/skills/gtex-eqtl-skill/SKILL.md +++ b/plugins/life-science-research/skills/gtex-eqtl-skill/SKILL.md @@ -3,6 +3,15 @@ name: gtex-eqtl-skill description: Fetch GTEx single-tissue eQTL associations from one variant input by accepting rsID, GRCh37, or GRCh38 input and resolving to the required GRCh38 query for the GTEx v2 API. Use when a user wants eQTL associations returned as JSON. 
--- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `gtex-eqtl-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + # Operating rules - Use Python `requests` for all network calls. diff --git a/plugins/life-science-research/skills/gtex-eqtl-skill/scripts/gtex_eqtl.py b/plugins/life-science-research/skills/gtex-eqtl-skill/scripts/gtex_eqtl.py index 1b3dd82b1..662b8f34c 100644 --- a/plugins/life-science-research/skills/gtex-eqtl-skill/scripts/gtex_eqtl.py +++ b/plugins/life-science-research/skills/gtex-eqtl-skill/scripts/gtex_eqtl.py @@ -12,6 +12,7 @@ from __future__ import annotations +from datetime import datetime, timezone import json import sys from typing import Any @@ -58,16 +59,20 @@ def build_variant_id(parsed: dict[str, Any]) -> str: return f"chr{parsed['chr']}_{parsed['pos']}_{parsed['ref']}_{parsed['alt']}_b38" -def fetch_eqtls(variant_id: str) -> Any: +def build_request_url(variant_id: str) -> str: encoded = requests.utils.quote(variant_id, safe="") - url = f"{GTEX_API}/association/singleTissueEqtl?variantId={encoded}" + return f"{GTEX_API}/association/singleTissueEqtl?variantId={encoded}" + + +def fetch_eqtls(variant_id: str) -> tuple[Any, str]: + url = build_request_url(variant_id) headers = { "Accept": 
"application/json", "User-Agent": USER_AGENT, } resp = requests.get(url, headers=headers, timeout=DEFAULT_TIMEOUT_S) resp.raise_for_status() - return resp.json() + return resp.json(), str(resp.url) def extract_rows(data: Any) -> list[Any]: @@ -78,6 +83,22 @@ def extract_rows(data: Any) -> list[Any]: return [] +def _attach_sources( + output: dict[str, Any], source_name: str, source_url: str +) -> dict[str, Any]: + """Add stable user-facing provenance without changing error payloads.""" + if output.get("ok") and "sources" not in output: + output["sources"] = [ + { + "name": source_name, + "url": source_url, + "request_url": source_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + return output + + def main() -> int: warnings: list[str] = [] @@ -110,7 +131,7 @@ def main() -> int: return 1 try: - data = fetch_eqtls(parsed["variant_id"]) + data, request_url = fetch_eqtls(parsed["variant_id"]) except requests.RequestException as exc: sys.stdout.write(json.dumps(error("network_error", f"GTEx request failed: {exc}"))) return 1 @@ -138,7 +159,7 @@ def main() -> int: "paging_info": paging_info, "warnings": warnings, } - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "GTEx Portal", request_url))) return 0 diff --git a/plugins/life-science-research/skills/gwas-catalog-skill/SKILL.md b/plugins/life-science-research/skills/gwas-catalog-skill/SKILL.md index 4d90a8e6f..77eb69006 100644 --- a/plugins/life-science-research/skills/gwas-catalog-skill/SKILL.md +++ b/plugins/life-science-research/skills/gwas-catalog-skill/SKILL.md @@ -3,6 +3,15 @@ name: gwas-catalog-skill description: Submit compact GWAS Catalog REST API v2 requests for studies, associations, SNPs, EFO traits, genes, publications, loci, and metadata. Use when a user wants concise GWAS Catalog summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. 
+- Use the `gwas-catalog-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all GWAS Catalog API calls. - Use `base_url=https://www.ebi.ac.uk/gwas/rest/api/v2`. @@ -38,4 +47,4 @@ echo '{"base_url":"https://www.ebi.ac.uk/gwas/rest/api/v2","path":"studies","par ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/gwas-catalog-skill/scripts/rest_request.py b/plugins/life-science-research/skills/gwas-catalog-skill/scripts/rest_request.py index 4de3cf105..e3c72303f 100644 --- a/plugins/life-science-research/skills/gwas-catalog-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/gwas-catalog-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "NHGRI-EBI GWAS Catalog" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = 
_sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/hmdb-skill/SKILL.md b/plugins/life-science-research/skills/hmdb-skill/SKILL.md index bee98f03f..988628063 100644 --- a/plugins/life-science-research/skills/hmdb-skill/SKILL.md +++ b/plugins/life-science-research/skills/hmdb-skill/SKILL.md @@ -3,6 +3,15 @@ name: hmdb-skill description: Submit compact HMDB search requests for metabolites, proteins, diseases, and pathways. Use when a user wants concise HMDB summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `hmdb-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. 
+- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all HMDB calls. - Use `base_url=https://hmdb.ca`. @@ -35,4 +44,4 @@ echo '{"base_url":"https://hmdb.ca","path":"unearth/q","params":{"query":"seroto ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. diff --git a/plugins/life-science-research/skills/hmdb-skill/scripts/rest_request.py b/plugins/life-science-research/skills/hmdb-skill/scripts/rest_request.py index 4de3cf105..f3db6e2cf 100644 --- a/plugins/life-science-research/skills/hmdb-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/hmdb-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "Human Metabolome Database" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return 
host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/human-protein-atlas-skill/SKILL.md b/plugins/life-science-research/skills/human-protein-atlas-skill/SKILL.md index f00102316..b8ee35c9b 100644 --- a/plugins/life-science-research/skills/human-protein-atlas-skill/SKILL.md +++ b/plugins/life-science-research/skills/human-protein-atlas-skill/SKILL.md @@ -3,6 +3,15 @@ name: human-protein-atlas-skill description: 
Submit compact Human Protein Atlas requests for gene JSON, search downloads, and page-level tissue or cell-line lookups. Use when a user wants concise Human Protein Atlas summaries; save raw JSON or HTML only on request. --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `human-protein-atlas-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all Human Protein Atlas calls. - Use `base_url=https://www.proteinatlas.org`. @@ -37,4 +46,4 @@ echo '{"base_url":"https://www.proteinatlas.org","path":"ENSG00000141510.json"}' ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/human-protein-atlas-skill/scripts/rest_request.py b/plugins/life-science-research/skills/human-protein-atlas-skill/scripts/rest_request.py index 4de3cf105..d39bed73d 100644 --- a/plugins/life-science-research/skills/human-protein-atlas-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/human-protein-atlas-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "Human Protein Atlas" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, 
str]]: + sanitized_url = _sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/ipd-skill/SKILL.md b/plugins/life-science-research/skills/ipd-skill/SKILL.md index e7b2e0e9c..665e959fb 100644 --- a/plugins/life-science-research/skills/ipd-skill/SKILL.md +++ b/plugins/life-science-research/skills/ipd-skill/SKILL.md @@ -3,6 +3,15 @@ name: ipd-skill description: Submit compact IPD REST requests for HLA allele and cell-level metadata using the public IPD query API. Use when a user wants concise IPD summaries; save raw JSON or text only on request. --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `ipd-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. 
+- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all IPD calls. - Use `base_url=https://www.ebi.ac.uk/cgi-bin/ipd/api`. @@ -35,4 +44,4 @@ echo '{"base_url":"https://www.ebi.ac.uk/cgi-bin/ipd/api","path":"allele","param ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. diff --git a/plugins/life-science-research/skills/ipd-skill/scripts/rest_request.py b/plugins/life-science-research/skills/ipd-skill/scripts/rest_request.py index 4de3cf105..49dd97ee0 100644 --- a/plugins/life-science-research/skills/ipd-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/ipd-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "IPD-IMGT/HLA" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return 
host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/locus-to-gene-mapper-skill/SKILL.md b/plugins/life-science-research/skills/locus-to-gene-mapper-skill/SKILL.md index 70d94fbd8..c2da00695 100644 --- a/plugins/life-science-research/skills/locus-to-gene-mapper-skill/SKILL.md +++ b/plugins/life-science-research/skills/locus-to-gene-mapper-skill/SKILL.md @@ -3,6 +3,15 @@ name: locus-to-gene-mapper-skill 
description: Map GWAS loci to ranked candidate genes using a deterministic multi-skill chain (EFO -> GWAS -> coordinates -> Open Targets L2G/coloc -> eQTL -> burden/coding context), with reproducible tables and optional figures. Use when a user provides a trait/EFO term and/or lead variants and needs locus-to-gene prioritization for downstream biology decisions. --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `locus-to-gene-mapper-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Locus-to-Gene Mapper Generate a reproducible locus-to-gene mapping for one trait (or a seed set of lead variants), with explicit evidence attribution and conservative confidence labels. @@ -345,6 +354,9 @@ Return: - Never silently impute missing evidence as positive support. - When evidence is missing, record it as a limitation and reduce confidence. - Keep evidence provenance explicit (`source skill` + endpoint family) in rationale lines. +- Distinguish evidence-contributing sources from queried sources that returned no + mapping evidence. Attach links only to claims supported by the contributing + inputs; keep queried-but-empty sources in methods, provenance, or limitations. 
def record_provenance(
    statuses: dict[str, str], source_key: str, *, contributed: bool
) -> None:
    """Record the outcome of one source lane in ``statuses``.

    A lane marked ``"contributed"`` is never downgraded: a later call with
    ``contributed=False`` leaves an existing entry untouched, while a call
    with ``contributed=True`` always sets ``"contributed"``.

    Args:
        statuses: Mutable mapping of source key to status; updated in place.
        source_key: Must be a key of ``PROVENANCE_SOURCE_CATALOG``.
        contributed: Whether the source supplied evidence.

    Raises:
        ValueError: If ``source_key`` is not in the catalog.
    """
    if source_key not in PROVENANCE_SOURCE_CATALOG:
        raise ValueError(f"Unknown provenance source: {source_key}")
    if contributed:
        statuses[source_key] = "contributed"
    else:
        # Only fills the slot when nothing has been recorded for this lane.
        statuses.setdefault(source_key, "queried_no_evidence")


def build_provenance_sources(
    statuses: dict[str, str], retrieved_at: str
) -> list[dict[str, str]]:
    """Turn recorded lane statuses into ``sources`` entries.

    Entries follow the iteration order of ``PROVENANCE_SOURCE_CATALOG``, not
    the order in which lanes were recorded. Catalog keys with no recorded
    status are omitted.

    Args:
        statuses: Mapping of source key to status.
        retrieved_at: Timestamp string copied into every entry.

    Returns:
        One dict per recorded lane with ``name``, ``url``, ``status`` and
        ``retrieved_at``.
    """
    return [
        {
            "name": display_name,
            "url": homepage,
            "status": statuses[lane_key],
            "retrieved_at": retrieved_at,
        }
        for lane_key, (display_name, homepage) in PROVENANCE_SOURCE_CATALOG.items()
        if statuses.get(lane_key) is not None
    ]
) + gtex_queried = any( + coerce_dict(anchor.get("grch38")).get("chr") is not None + and coerce_dict(anchor.get("grch38")).get("pos") is not None + and coerce_dict(anchor.get("grch38")).get("ref") + and coerce_dict(anchor.get("grch38")).get("alt") + for anchor in anchors + ) + if gtex_queried: + record_provenance( + provenance_statuses, + "gtex-eqtl-skill", + contributed=any(bool(gene_support) for gene_support in gtex_support.values()), + ) refsnp_annotations = ( fetch_refsnp_annotations(anchor_rsids, limitations) if include_clinvar else {} ) + if include_clinvar and anchor_rsids: + record_provenance( + provenance_statuses, + "ncbi-refsnp", + contributed=bool(refsnp_annotations), + ) grouped_loci = group_anchors_by_locus(anchors) @@ -1809,6 +1893,20 @@ def map_locus_to_gene(input_json: dict[str, Any]) -> dict[str, Any]: unique_symbols = dedupe_keep_order(all_candidate_symbols) symbol_to_ensembl = resolve_ensembl_ids_for_symbols(unique_symbols, limitations) + ot_queried = bool((anchor_rsids and trait_terms) or unique_symbols) + if ot_queried: + ot_evidence = bool(symbol_to_ensembl) or bool(ot_support.get("matched_study_loci")) + if not ot_evidence: + for support in coerce_dict(ot_support.get("per_anchor")).values(): + support_dict = coerce_dict(support) + if support_dict.get("l2g") or support_dict.get("coloc"): + ot_evidence = True + break + record_provenance( + provenance_statuses, + "opentargets-skill", + contributed=ot_evidence, + ) genebass_support = fetch_genebass_support( symbol_to_ensembl=symbol_to_ensembl, burden_sets=burden_sets, @@ -1816,10 +1914,27 @@ def map_locus_to_gene(input_json: dict[str, Any]) -> dict[str, Any]: max_results=int(normalized_input.get("genebass_max_results") or 300), limitations=limitations, ) + if symbol_to_ensembl: + genebass_evidence = any( + safe_float(support.get("best_p")) is not None + or int(support.get("supporting_rows") or 0) > 0 + for support in genebass_support.values() + ) + record_provenance( + provenance_statuses, + 
"genebass-gene-burden-skill", + contributed=genebass_evidence, + ) gnomad_constraints = ( fetch_gnomad_gene_constraints(unique_symbols, limitations) if include_gnomad_context else {} ) + if include_gnomad_context and unique_symbols: + record_provenance( + provenance_statuses, + "gnomad-graphql-skill", + contributed=bool(gnomad_constraints), + ) hpa_support: dict[str, list[str]] = {} if include_hpa_tissue_context: @@ -2044,32 +2159,28 @@ def map_locus_to_gene(input_json: dict[str, Any]) -> dict[str, Any]: f"overall_score outside [0,1] for gene {gene.get('symbol')} in locus {locus.get('locus_id')}" ) + retrieved_at = now_iso() + provenance_sources = build_provenance_sources(provenance_statuses, retrieved_at) + mapping_payload: dict[str, Any] = { "meta": { "trait_query": trait_query, "efo_id": efo_payload.get("efo_id"), "generated_at": now_iso(), - "sources_queried": [ - "efo-ontology-skill", - "gwas-catalog-skill", - "ncbi-refsnp-coordinate-resolution", - "opentargets-skill", - "gtex-eqtl-skill", - "genebass-gene-burden-skill", - "clinvar-variation-skill" - if include_clinvar - else "clinvar-variation-skill(skipped)", - "gnomad-graphql-skill" - if include_gnomad_context - else "gnomad-graphql-skill(skipped)", - "human-protein-atlas-skill" - if include_hpa_tissue_context - else "human-protein-atlas-skill(skipped)", + "sources_queried": list(provenance_statuses), + "sources_contributing": [ + key for key, status in provenance_statuses.items() if status == "contributed" + ], + "sources_without_evidence": [ + key + for key, status in provenance_statuses.items() + if status == "queried_no_evidence" ], }, "anchors": anchors, "loci": loci_output, "cross_locus_ranked_genes": cross_locus_ranked_genes, + "sources": provenance_sources, "warnings": dedupe_keep_order(warnings), "limitations": dedupe_keep_order(limitations), } @@ -2118,6 +2229,7 @@ def map_locus_to_gene(input_json: dict[str, Any]) -> dict[str, Any]: "Paste `inline_image_markdown` lines directly in the chat as 
class ProvenanceTests(unittest.TestCase):
    """Tests for ``record_provenance`` and ``build_provenance_sources``."""

    def test_provenance_only_contains_recorded_lanes(self) -> None:
        """Only lanes that were recorded appear, in catalog order."""
        lanes: dict[str, str] = {}
        recorded = (("ncbi-refsnp", True), ("gtex-eqtl-skill", False))
        for lane_key, did_contribute in recorded:
            map_locus_to_gene.record_provenance(
                lanes, lane_key, contributed=did_contribute
            )

        entries = map_locus_to_gene.build_provenance_sources(
            lanes, "2026-06-23T00:00:00+00:00"
        )
        names = [entry["name"] for entry in entries]
        outcomes = [entry["status"] for entry in entries]

        self.assertEqual(names, ["NCBI RefSNP", "GTEx Portal"])
        self.assertEqual(outcomes, ["contributed", "queried_no_evidence"])
        # Lanes that were never recorded must not be reported.
        reported = set(names)
        self.assertNotIn("Experimental Factor Ontology via OLS", reported)
        self.assertNotIn("Human Protein Atlas", reported)

    def test_contribution_promotes_a_queried_lane(self) -> None:
        """A later contributing call upgrades a queried-only lane."""
        lanes: dict[str, str] = {}
        for did_contribute in (False, True):
            map_locus_to_gene.record_provenance(
                lanes, "opentargets-skill", contributed=did_contribute
            )

        self.assertEqual(lanes["opentargets-skill"], "contributed")
--git a/plugins/life-science-research/skills/metabolights-skill/SKILL.md b/plugins/life-science-research/skills/metabolights-skill/SKILL.md index 16d4e754f..5d1866c0b 100644 --- a/plugins/life-science-research/skills/metabolights-skill/SKILL.md +++ b/plugins/life-science-research/skills/metabolights-skill/SKILL.md @@ -3,6 +3,15 @@ name: metabolights-skill description: Submit compact MetaboLights requests for study discovery and study-level metabolomics metadata. Use when a user wants concise MetaboLights summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `metabolights-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all MetaboLights calls. - Use `base_url=https://www.ebi.ac.uk/metabolights/ws`. @@ -34,4 +43,4 @@ echo '{"base_url":"https://www.ebi.ac.uk/metabolights/ws","path":"studies","reco ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/metabolights-skill/scripts/rest_request.py b/plugins/life-science-research/skills/metabolights-skill/scripts/rest_request.py index 4de3cf105..f10e8a3c2 100644 --- a/plugins/life-science-research/skills/metabolights-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/metabolights-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "MetaboLights" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = 
_sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/mgnify-skill/SKILL.md b/plugins/life-science-research/skills/mgnify-skill/SKILL.md index 564ea0c8d..d6c2a0a99 100644 --- a/plugins/life-science-research/skills/mgnify-skill/SKILL.md +++ b/plugins/life-science-research/skills/mgnify-skill/SKILL.md @@ -3,6 +3,15 @@ name: mgnify-skill description: Submit compact MGnify API requests for microbiome studies, samples, and biome metadata. Use when a user wants concise MGnify summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `mgnify-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. 
+- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all MGnify calls. - Use `base_url=https://www.ebi.ac.uk/metagenomics/api/v1`. @@ -34,4 +43,4 @@ echo '{"base_url":"https://www.ebi.ac.uk/metagenomics/api/v1","path":"studies"," ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. diff --git a/plugins/life-science-research/skills/mgnify-skill/scripts/rest_request.py b/plugins/life-science-research/skills/mgnify-skill/scripts/rest_request.py index 4de3cf105..f96991e07 100644 --- a/plugins/life-science-research/skills/mgnify-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/mgnify-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "MGnify" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: 
return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/ncbi-blast-skill/SKILL.md b/plugins/life-science-research/skills/ncbi-blast-skill/SKILL.md index 1bde04924..c9f6e9d8e 100644 --- a/plugins/life-science-research/skills/ncbi-blast-skill/SKILL.md +++ b/plugins/life-science-research/skills/ncbi-blast-skill/SKILL.md @@ -3,6 +3,15 @@ name: ncbi-blast-skill description: Submit, poll, and summarize NCBI BLAST 
Common URL API jobs (Blast.cgi) for nucleotide or protein sequences. Use when a user wants RID status, BLAST results, or compact top-hit summaries; fetch raw Text/JSON2 only on request. --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `ncbi-blast-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/ncbi_blast.py` for all concrete BLAST work. 
diff --git a/plugins/life-science-research/skills/ncbi-blast-skill/scripts/ncbi_blast.py b/plugins/life-science-research/skills/ncbi-blast-skill/scripts/ncbi_blast.py index 65ced8e7c..3a4441175 100644 --- a/plugins/life-science-research/skills/ncbi-blast-skill/scripts/ncbi_blast.py +++ b/plugins/life-science-research/skills/ncbi-blast-skill/scripts/ncbi_blast.py @@ -8,6 +8,7 @@ from __future__ import annotations +from datetime import datetime, timezone import io import json import os @@ -737,6 +738,21 @@ def execute( local_session.close() +def _attach_sources( + output: dict[str, Any], source_name: str, source_url: str +) -> dict[str, Any]: + """Add stable user-facing provenance without changing error payloads.""" + if output.get("ok") and "sources" not in output: + output["sources"] = [ + { + "name": source_name, + "url": source_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + return output + + def main() -> int: try: payload = json.load(sys.stdin) @@ -757,7 +773,7 @@ def main() -> int: else: exit_code = 1 - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "NCBI BLAST", "https://blast.ncbi.nlm.nih.gov/Blast.cgi"))) return exit_code diff --git a/plugins/life-science-research/skills/ncbi-clinicaltables-skill/SKILL.md b/plugins/life-science-research/skills/ncbi-clinicaltables-skill/SKILL.md index 982cc26b3..8a7c9e978 100644 --- a/plugins/life-science-research/skills/ncbi-clinicaltables-skill/SKILL.md +++ b/plugins/life-science-research/skills/ncbi-clinicaltables-skill/SKILL.md @@ -3,6 +3,15 @@ name: ncbi-clinicaltables-skill description: Submit compact Clinical Tables NCBI Gene requests for human gene lookup, pagination, and field selection. Use when a user wants concise autocomplete-style human gene search results --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. 
+- Use the `ncbi-clinicaltables-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/ncbi_gene_clinicaltables.py` for all Clinical Tables gene searches. - The script accepts `max_items`; for search pages, start with `count=10` and `max_items=10`. @@ -39,4 +48,4 @@ echo '{"terms":"TP53","params":{"count":10,"df":"GeneID,Symbol,description"},"ma ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/ncbi_gene_clinicaltables.py`. +- Keep runtime imports limited to this file, `scripts/ncbi_gene_clinicaltables.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/ncbi-clinicaltables-skill/scripts/ncbi_gene_clinicaltables.py b/plugins/life-science-research/skills/ncbi-clinicaltables-skill/scripts/ncbi_gene_clinicaltables.py index a1118b8f3..6f09f366c 100644 --- a/plugins/life-science-research/skills/ncbi-clinicaltables-skill/scripts/ncbi_gene_clinicaltables.py +++ b/plugins/life-science-research/skills/ncbi-clinicaltables-skill/scripts/ncbi_gene_clinicaltables.py @@ -3,6 +3,7 @@ from __future__ import annotations +from datetime import datetime, timezone import json import sys from pathlib import Path @@ -131,6 +132,21 @@ def execute(payload: Any) -> dict[str, Any]: return error("network_error", f"Request failed: {exc}") +def _attach_sources( + output: dict[str, Any], source_name: str, source_url: str +) -> dict[str, Any]: + """Add stable user-facing provenance without changing error payloads.""" + if output.get("ok") and "sources" not in output: + output["sources"] = [ + { + "name": source_name, + "url": source_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + return output + + def main() -> int: try: payload = json.load(sys.stdin) @@ -138,7 +154,7 @@ def main() -> int: sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}"))) return 2 output = execute(payload) - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "NCBI Gene via Clinical Tables", "https://clinicaltables.nlm.nih.gov/api/ncbi_genes/v3/search"))) return 0 if output.get("ok") else 1 diff --git a/plugins/life-science-research/skills/ncbi-datasets-skill/SKILL.md b/plugins/life-science-research/skills/ncbi-datasets-skill/SKILL.md index aa80feb43..2c746473e 100644 --- a/plugins/life-science-research/skills/ncbi-datasets-skill/SKILL.md +++ b/plugins/life-science-research/skills/ncbi-datasets-skill/SKILL.md @@ -3,6 +3,15 @@ name: ncbi-datasets-skill description: Submit compact NCBI Datasets v2 requests for assembly, genome, 
taxonomy, and related metadata endpoints. Use when a user wants concise NCBI Datasets summaries; save raw JSON or text only on request. --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `ncbi-datasets-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/ncbi_datasets.py` for all Datasets v2 calls in this package. - Use explicit REST `path` values relative to `https://api.ncbi.nlm.nih.gov/datasets/v2`. @@ -36,4 +45,4 @@ echo '{"path":"genome/taxon/9606/dataset_report","params":{"page_size":10},"reco ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/ncbi_datasets.py`. +- Keep runtime imports limited to this file, `scripts/ncbi_datasets.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/ncbi-datasets-skill/scripts/ncbi_datasets.py b/plugins/life-science-research/skills/ncbi-datasets-skill/scripts/ncbi_datasets.py index bf3300010..393c4a257 100644 --- a/plugins/life-science-research/skills/ncbi-datasets-skill/scripts/ncbi_datasets.py +++ b/plugins/life-science-research/skills/ncbi-datasets-skill/scripts/ncbi_datasets.py @@ -3,6 +3,7 @@ from __future__ import annotations +from datetime import datetime, timezone import json import sys from pathlib import Path @@ -212,6 +213,21 @@ def execute(payload: Any) -> dict[str, Any]: return error("network_error", f"Request failed: {exc}") +def _attach_sources( + output: dict[str, Any], source_name: str, source_url: str +) -> dict[str, Any]: + """Add stable user-facing provenance without changing error payloads.""" + if output.get("ok") and "sources" not in output: + output["sources"] = [ + { + "name": source_name, + "url": source_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + return output + + def main() -> int: try: payload = json.load(sys.stdin) @@ -225,7 +241,7 @@ def main() -> int: code = 2 else: code = 0 if output.get("ok") else 1 - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "NCBI Datasets", "https://api.ncbi.nlm.nih.gov/datasets/v2"))) return code diff --git a/plugins/life-science-research/skills/ncbi-entrez-skill/SKILL.md b/plugins/life-science-research/skills/ncbi-entrez-skill/SKILL.md index 7f99e45ec..e7284f764 100644 --- a/plugins/life-science-research/skills/ncbi-entrez-skill/SKILL.md +++ b/plugins/life-science-research/skills/ncbi-entrez-skill/SKILL.md @@ -3,6 +3,15 @@ name: ncbi-entrez-skill description: Submit compact NCBI Entrez E-Utilities requests for PubMed, Gene, Protein, Nucleotide, PMC metadata, and GEO metadata workflows. Use when a user wants concise Entrez search, fetch, summary, or link results; save raw JSON or XML only on request. 
--- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `ncbi-entrez-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/ncbi_entrez.py` for all Entrez calls in this package. - Use explicit `endpoint` values such as `esearch`, `esummary`, `efetch`, `elink`, or `einfo`. @@ -14,7 +23,7 @@ description: Submit compact NCBI Entrez E-Utilities requests for PubMed, Gene, P ## Execution behavior - Return concise markdown summaries from the script output by default. -- In final user-facing summaries, never display a bare PMID or DOI. Render every PMID as a Markdown link in the form `[PMID ](https://pubmed.ncbi.nlm.nih.gov//)` and every DOI as `[](https://doi.org/)`, including in tables, bullets, parentheticals, and source lists. +- Apply the source registry templates to every PMID, DOI, NCBI Gene ID, or GEO accession that appears in the summary, including in tables, bullets, parentheticals, and source lists. - Return raw JSON or XML only if the user explicitly asks for machine-readable output. - Prefer targeted endpoint calls instead of broad unfiltered dumps. - If the user needs the full raw response, set `save_raw=true` and report the saved file path. 
@@ -41,4 +50,4 @@ echo '{"endpoint":"esearch","params":{"db":"gene","term":"TP53[gene] AND human[o ## References - Load `references/geo.md` only when the user specifically needs GEO query patterns. -- Keep the import package limited to this file, `references/geo.md`, and `scripts/ncbi_entrez.py`. +- Keep runtime imports limited to this file, `references/geo.md`, `scripts/ncbi_entrez.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. diff --git a/plugins/life-science-research/skills/ncbi-entrez-skill/scripts/ncbi_entrez.py b/plugins/life-science-research/skills/ncbi-entrez-skill/scripts/ncbi_entrez.py index 6ae059930..50fade362 100644 --- a/plugins/life-science-research/skills/ncbi-entrez-skill/scripts/ncbi_entrez.py +++ b/plugins/life-science-research/skills/ncbi-entrez-skill/scripts/ncbi_entrez.py @@ -3,6 +3,7 @@ from __future__ import annotations +from datetime import datetime, timezone import json import os import sys @@ -279,6 +280,21 @@ def execute(payload: Any) -> dict[str, Any]: return error("network_error", f"Request failed: {exc}") +def _attach_sources( + output: dict[str, Any], source_name: str, source_url: str +) -> dict[str, Any]: + """Add stable user-facing provenance without changing error payloads.""" + if output.get("ok") and "sources" not in output: + output["sources"] = [ + { + "name": source_name, + "url": source_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + return output + + def main() -> int: try: payload = json.load(sys.stdin) @@ -292,7 +308,7 @@ def main() -> int: code = 2 else: code = 0 if output.get("ok") else 1 - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "NCBI Entrez", "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"))) return code diff --git a/plugins/life-science-research/skills/ncbi-pmc-skill/SKILL.md b/plugins/life-science-research/skills/ncbi-pmc-skill/SKILL.md index aa16f57ed..829f1b58b 100644 --- 
a/plugins/life-science-research/skills/ncbi-pmc-skill/SKILL.md +++ b/plugins/life-science-research/skills/ncbi-pmc-skill/SKILL.md @@ -3,6 +3,15 @@ name: ncbi-pmc-skill description: Submit compact NCBI PMC Open Access requests for article/file availability metadata. Use when a user wants concise PMC Open Access summaries; save raw XML only on request. --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `ncbi-pmc-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/ncbi_pmc.py` for all PMC Open Access calls in this package. - This skill is intentionally narrow: it currently covers the PMC Open Access service rather than the full PMC API surface. @@ -34,4 +43,4 @@ echo '{"params":{"id":"PMC3257301"},"max_items":10}' | python scripts/ncbi_pmc.p ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/ncbi_pmc.py`. +- Keep runtime imports limited to this file, `scripts/ncbi_pmc.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/ncbi-pmc-skill/scripts/ncbi_pmc.py b/plugins/life-science-research/skills/ncbi-pmc-skill/scripts/ncbi_pmc.py index b4fa12075..31275a5d2 100644 --- a/plugins/life-science-research/skills/ncbi-pmc-skill/scripts/ncbi_pmc.py +++ b/plugins/life-science-research/skills/ncbi-pmc-skill/scripts/ncbi_pmc.py @@ -3,6 +3,7 @@ from __future__ import annotations +from datetime import datetime, timezone import json import os import sys @@ -235,6 +236,21 @@ def execute(payload: Any) -> dict[str, Any]: return error("network_error", f"Request failed: {exc}") +def _attach_sources( + output: dict[str, Any], source_name: str, source_url: str +) -> dict[str, Any]: + """Add stable user-facing provenance without changing error payloads.""" + if output.get("ok") and "sources" not in output: + output["sources"] = [ + { + "name": source_name, + "url": source_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + return output + + def main() -> int: try: payload = json.load(sys.stdin) @@ -248,7 +264,7 @@ def main() -> int: code = 2 else: code = 0 if output.get("ok") else 1 - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "PubMed Central", "https://pmc.ncbi.nlm.nih.gov/"))) return code diff --git a/plugins/life-science-research/skills/opentargets-skill/SKILL.md b/plugins/life-science-research/skills/opentargets-skill/SKILL.md index 2ae06278a..b50d2bebb 100644 --- a/plugins/life-science-research/skills/opentargets-skill/SKILL.md +++ b/plugins/life-science-research/skills/opentargets-skill/SKILL.md @@ -3,6 +3,15 @@ name: opentargets-skill description: Submit compact Open Targets Platform GraphQL requests for target, disease, drug, variant, study, and search data, including associated-disease datasource heatmap matrices. 
Use when a user wants concise Open Targets summaries or per-datasource evidence context --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `opentargets-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/opentargets_graphql.py` for all Open Targets GraphQL work. - Use `scripts/opentargets_disease_heatmap.py` when the user wants the associated-disease bubble grid or a disease-by-datasource evidence matrix. @@ -56,4 +65,4 @@ The helper paginates `associatedDiseases`, collects `datasourceScores`, and retu Use the disease-name filter as a client-side substring filter similar to the UI. If you later need the overall association score column, inspect the GraphQL row type first before adding candidate fields such as `score` or `associationScore`. ## References -- No additional runtime references are required; keep the import package limited to this file and the bundled scripts in `scripts/`. +- Keep runtime imports limited to this file, the bundled scripts in `scripts/`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/opentargets-skill/scripts/opentargets_disease_heatmap.py b/plugins/life-science-research/skills/opentargets-skill/scripts/opentargets_disease_heatmap.py index def0c3fd0..9e6a175eb 100644 --- a/plugins/life-science-research/skills/opentargets-skill/scripts/opentargets_disease_heatmap.py +++ b/plugins/life-science-research/skills/opentargets-skill/scripts/opentargets_disease_heatmap.py @@ -3,6 +3,7 @@ from __future__ import annotations +from datetime import datetime, timezone import json import sys from pathlib import Path @@ -323,6 +324,21 @@ def execute(payload: Any) -> dict[str, Any]: } +def _attach_sources( + output: dict[str, Any], source_name: str, source_url: str +) -> dict[str, Any]: + """Add stable user-facing provenance without changing error payloads.""" + if output.get("ok") and "sources" not in output: + output["sources"] = [ + { + "name": source_name, + "url": source_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + return output + + def main() -> int: try: payload = json.load(sys.stdin) @@ -336,7 +352,7 @@ def main() -> int: code = 2 else: code = 0 if output.get("ok") else 1 - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "Open Targets Platform", "https://api.platform.opentargets.org/api/v4/graphql"))) return code diff --git a/plugins/life-science-research/skills/opentargets-skill/scripts/opentargets_graphql.py b/plugins/life-science-research/skills/opentargets-skill/scripts/opentargets_graphql.py index f7d624f2f..ee5775bed 100644 --- a/plugins/life-science-research/skills/opentargets-skill/scripts/opentargets_graphql.py +++ b/plugins/life-science-research/skills/opentargets-skill/scripts/opentargets_graphql.py @@ -3,6 +3,7 @@ from __future__ import annotations +from datetime import datetime, timezone import json import sys from pathlib import Path @@ -133,6 +134,21 @@ def execute(payload: Any) -> dict[str, Any]: } +def _attach_sources( 
+ output: dict[str, Any], source_name: str, source_url: str +) -> dict[str, Any]: + """Add stable user-facing provenance without changing error payloads.""" + if output.get("ok") and "sources" not in output: + output["sources"] = [ + { + "name": source_name, + "url": source_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + return output + + def main() -> int: try: payload = json.load(sys.stdin) @@ -146,7 +162,7 @@ def main() -> int: code = 2 else: code = 0 if output.get("ok") else 1 - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "Open Targets Platform", "https://api.platform.opentargets.org/api/v4/graphql"))) return code diff --git a/plugins/life-science-research/skills/pharmgkb-skill/SKILL.md b/plugins/life-science-research/skills/pharmgkb-skill/SKILL.md index 8a72c644f..136d32668 100644 --- a/plugins/life-science-research/skills/pharmgkb-skill/SKILL.md +++ b/plugins/life-science-research/skills/pharmgkb-skill/SKILL.md @@ -3,6 +3,15 @@ name: pharmgkb-skill description: Submit compact PharmGKB API requests for genes, variants, clinical annotations, dosing guidelines, and search. Use when a user wants concise PharmGKB summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `pharmgkb-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. 
+- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all PharmGKB API calls. - Use `base_url=https://api.pharmgkb.org/v1/data`. @@ -35,4 +44,4 @@ echo '{"base_url":"https://api.pharmgkb.org/v1/data","path":"gene/PA36679"}' | p ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. diff --git a/plugins/life-science-research/skills/pharmgkb-skill/scripts/rest_request.py b/plugins/life-science-research/skills/pharmgkb-skill/scripts/rest_request.py index 4de3cf105..bef8c63fe 100644 --- a/plugins/life-science-research/skills/pharmgkb-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/pharmgkb-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "PharmGKB" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts 
= urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/pride-skill/SKILL.md b/plugins/life-science-research/skills/pride-skill/SKILL.md index d17253480..b1008e2da 100644 --- a/plugins/life-science-research/skills/pride-skill/SKILL.md +++ b/plugins/life-science-research/skills/pride-skill/SKILL.md @@ -3,6 +3,15 @@ name: pride-skill description: Submit compact PRIDE Archive API requests for proteomics project discovery and project-level metadata. 
Use when a user wants concise PRIDE summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `pride-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all PRIDE Archive calls. - Use `base_url=https://www.ebi.ac.uk/pride/ws/archive/v2`. @@ -34,4 +43,4 @@ echo '{"base_url":"https://www.ebi.ac.uk/pride/ws/archive/v2","path":"projects", ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/pride-skill/scripts/rest_request.py b/plugins/life-science-research/skills/pride-skill/scripts/rest_request.py index 4de3cf105..1ed344f2f 100644 --- a/plugins/life-science-research/skills/pride-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/pride-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "PRIDE Archive" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + 
return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/proteomexchange-skill/SKILL.md b/plugins/life-science-research/skills/proteomexchange-skill/SKILL.md index f67cf3628..1151f2e0e 100644 --- a/plugins/life-science-research/skills/proteomexchange-skill/SKILL.md +++ b/plugins/life-science-research/skills/proteomexchange-skill/SKILL.md @@ -3,6 +3,15 @@ name: proteomexchange-skill description: Submit compact ProteomeXchange PROXI requests for datasets, libraries, peptidoforms, proteins, PSMs, spectra, and USI examples. Use when a user wants concise PROXI summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `proteomexchange-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. 
+- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all ProteomeXchange PROXI calls. - Use `base_url=https://proteomecentral.proteomexchange.org/api/proxi/v0.1`. @@ -36,4 +45,4 @@ echo '{"base_url":"https://proteomecentral.proteomexchange.org/api/proxi/v0.1"," ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. diff --git a/plugins/life-science-research/skills/proteomexchange-skill/scripts/rest_request.py b/plugins/life-science-research/skills/proteomexchange-skill/scripts/rest_request.py index 4de3cf105..863c09f31 100644 --- a/plugins/life-science-research/skills/proteomexchange-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/proteomexchange-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "ProteomeXchange" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": 
warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/pubchem-pug-skill/SKILL.md b/plugins/life-science-research/skills/pubchem-pug-skill/SKILL.md index 84b1e1d47..14510ace8 100644 --- a/plugins/life-science-research/skills/pubchem-pug-skill/SKILL.md +++ b/plugins/life-science-research/skills/pubchem-pug-skill/SKILL.md @@ -3,6 +3,15 
@@ name: pubchem-pug-skill description: Submit compact PubChem PUG REST requests for compound properties, descriptions, assay summaries, and substance metadata. Use when a user wants concise PubChem summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `pubchem-pug-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all PubChem PUG calls. - Use `base_url=https://pubchem.ncbi.nlm.nih.gov/rest/pug`. @@ -35,4 +44,4 @@ echo '{"base_url":"https://pubchem.ncbi.nlm.nih.gov/rest/pug","path":"compound/n ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/pubchem-pug-skill/scripts/rest_request.py b/plugins/life-science-research/skills/pubchem-pug-skill/scripts/rest_request.py index 4de3cf105..6851ca722 100644 --- a/plugins/life-science-research/skills/pubchem-pug-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/pubchem-pug-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "PubChem" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = 
_sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/quickgo-skill/SKILL.md b/plugins/life-science-research/skills/quickgo-skill/SKILL.md index 3252c8096..0b92732a4 100644 --- a/plugins/life-science-research/skills/quickgo-skill/SKILL.md +++ b/plugins/life-science-research/skills/quickgo-skill/SKILL.md @@ -3,6 +3,15 @@ name: quickgo-skill description: Submit compact QuickGO requests for GO terms, annotations, and ontology traversal. Use when a user wants concise QuickGO summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `quickgo-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. 
+- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all QuickGO API calls. - Use `base_url=https://www.ebi.ac.uk/QuickGO/services`. @@ -36,4 +45,4 @@ echo '{"base_url":"https://www.ebi.ac.uk/QuickGO/services","path":"ontology/go/t ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. diff --git a/plugins/life-science-research/skills/quickgo-skill/scripts/rest_request.py b/plugins/life-science-research/skills/quickgo-skill/scripts/rest_request.py index 4de3cf105..535a46d1f 100644 --- a/plugins/life-science-research/skills/quickgo-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/quickgo-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "QuickGO" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) 
-> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/rcsb-pdb-skill/SKILL.md b/plugins/life-science-research/skills/rcsb-pdb-skill/SKILL.md index 2bb3e626a..6a01fafdf 100644 --- a/plugins/life-science-research/skills/rcsb-pdb-skill/SKILL.md +++ b/plugins/life-science-research/skills/rcsb-pdb-skill/SKILL.md @@ -3,6 +3,15 @@ name: rcsb-pdb-skill description: Submit compact RCSB PDB requests for core 
metadata, Search API queries, and FASTA downloads. Use when a user wants concise RCSB summaries; save raw JSON or FASTA only on request. --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `rcsb-pdb-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all RCSB PDB and Search API calls. - Use `base_url=https://data.rcsb.org/rest/v1` for core metadata, `https://search.rcsb.org/rcsbsearch/v2` for Search API, and `https://www.rcsb.org` for FASTA downloads. @@ -35,4 +44,4 @@ echo '{"base_url":"https://data.rcsb.org/rest/v1","path":"core/entry/4hhb"}' | p ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/rcsb-pdb-skill/scripts/rest_request.py b/plugins/life-science-research/skills/rcsb-pdb-skill/scripts/rest_request.py index 4de3cf105..516de6c08 100644 --- a/plugins/life-science-research/skills/rcsb-pdb-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/rcsb-pdb-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "RCSB Protein Data Bank" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = 
_sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/reactome-skill/SKILL.md b/plugins/life-science-research/skills/reactome-skill/SKILL.md index 407f5ff3b..3140dac19 100644 --- a/plugins/life-science-research/skills/reactome-skill/SKILL.md +++ b/plugins/life-science-research/skills/reactome-skill/SKILL.md @@ -3,6 +3,15 @@ name: reactome-skill description: Submit compact Reactome ContentService requests for pathway, event, participant, search, and diagram-related data. Use when a user wants concise Reactome summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `reactome-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. 
+- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all Reactome ContentService calls. - Use `base_url=https://reactome.org/ContentService`. @@ -36,4 +45,4 @@ echo '{"base_url":"https://reactome.org/ContentService","path":"data/query/R-HSA ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. diff --git a/plugins/life-science-research/skills/reactome-skill/scripts/rest_request.py b/plugins/life-science-research/skills/reactome-skill/scripts/rest_request.py index 4de3cf105..1a0d2ed63 100644 --- a/plugins/life-science-research/skills/reactome-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/reactome-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "Reactome" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def 
_service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/research-router-skill/SKILL.md b/plugins/life-science-research/skills/research-router-skill/SKILL.md index 9aff213ef..2b6671890 100644 --- a/plugins/life-science-research/skills/research-router-skill/SKILL.md +++ b/plugins/life-science-research/skills/research-router-skill/SKILL.md @@ -3,6 +3,15 @@ name: 
research-router-skill description: Route broad or ambiguous life-sciences research requests to the right skills, normalize core entities, optionally parallelize independent evidence gathering with subagents when available, and synthesize a concise evidence-backed answer. Use when a user asks a general life-sciences question that could span multiple sources or analysis types. --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `research-router-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Research Router Use this skill as the default orchestration layer for broad life-sciences research requests. @@ -115,6 +124,7 @@ When delegating, give each subagent a bounded read-only objective such as one ev - the key findings - the main caveats - which skills or sources it used +- the structured `sources` entries returned by those skills - any artifact paths it produced The coordinating agent is responsible for reconciling overlaps, contradictions, and evidence gaps. @@ -130,6 +140,17 @@ Unless the user asks for a different format, include: 3. main caveats or unresolved questions 4. 
recommended next analyses or follow-up lookups +For every lane that returns substantive evidence, retain the downstream skill's +`sources` entries and place the most specific available link next to the claim +it supports. Do not seed the router with example citations or cite the routing +decision itself. When a lane only performs a connectivity or schema check, +returns no evidence, or fails, keep its attempted source in an optional methods, +limitations, or `Sources checked` note without presenting it as supporting +evidence. When several claims use the same record, reuse the same link; when +sources disagree, keep each link attached to its corresponding finding. A +deduplicated final `Sources` list is optional and supplements rather than +replaces applicable claim-adjacent links. + If the task is exploratory, explicitly distinguish: - evidence that supports a conclusion diff --git a/plugins/life-science-research/skills/rhea-skill/SKILL.md b/plugins/life-science-research/skills/rhea-skill/SKILL.md index d330a6c47..03c3fde90 100644 --- a/plugins/life-science-research/skills/rhea-skill/SKILL.md +++ b/plugins/life-science-research/skills/rhea-skill/SKILL.md @@ -3,6 +3,15 @@ name: rhea-skill description: Submit compact Rhea reaction search requests for biochemical reactions and reaction IDs. Use when a user wants concise Rhea summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `rhea-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. 
+- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all Rhea calls. - Use `base_url=https://www.rhea-db.org`. @@ -34,4 +43,4 @@ echo '{"base_url":"https://www.rhea-db.org","path":"rhea","params":{"query":"caf ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. diff --git a/plugins/life-science-research/skills/rhea-skill/scripts/rest_request.py b/plugins/life-science-research/skills/rhea-skill/scripts/rest_request.py index 4de3cf105..dde29e6b0 100644 --- a/plugins/life-science-research/skills/rhea-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/rhea-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "Rhea" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") 
+def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/rnacentral-skill/SKILL.md b/plugins/life-science-research/skills/rnacentral-skill/SKILL.md index d399558af..f4d17c163 100644 --- a/plugins/life-science-research/skills/rnacentral-skill/SKILL.md +++ b/plugins/life-science-research/skills/rnacentral-skill/SKILL.md @@ -3,6 +3,15 @@ name: rnacentral-skill description: Submit compact RNAcentral API requests for RNA entry browsing, 
single-entry lookup, and cross-reference retrieval. Use when a user wants concise RNAcentral summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `rnacentral-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all RNAcentral calls. - Use `base_url=https://rnacentral.org/api/v1`. @@ -36,4 +45,4 @@ echo '{"base_url":"https://rnacentral.org/api/v1","path":"rna/URS000075C808/9606 ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/rnacentral-skill/scripts/rest_request.py b/plugins/life-science-research/skills/rnacentral-skill/scripts/rest_request.py index 4de3cf105..1750ae059 100644 --- a/plugins/life-science-research/skills/rnacentral-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/rnacentral-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "RNAcentral" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = 
_sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/string-skill/SKILL.md b/plugins/life-science-research/skills/string-skill/SKILL.md index 3132a9286..649421d01 100644 --- a/plugins/life-science-research/skills/string-skill/SKILL.md +++ b/plugins/life-science-research/skills/string-skill/SKILL.md @@ -3,6 +3,15 @@ name: string-skill description: Submit compact STRING API requests for network, interaction partner, and enrichment endpoints. Use when a user wants concise STRING summaries --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `string-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. 
+- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all STRING API calls. - Use `base_url=https://string-db.org/api/json`. @@ -38,4 +47,4 @@ echo '{"base_url":"https://string-db.org/api/json","path":"network","method":"PO ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. diff --git a/plugins/life-science-research/skills/string-skill/scripts/rest_request.py b/plugins/life-science-research/skills/string-skill/scripts/rest_request.py index 4de3cf105..390f41a92 100644 --- a/plugins/life-science-research/skills/string-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/string-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "STRING" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return 
host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/skills/tpmi-phewas-skill/SKILL.md b/plugins/life-science-research/skills/tpmi-phewas-skill/SKILL.md index fc74a118b..591d87b32 100644 --- a/plugins/life-science-research/skills/tpmi-phewas-skill/SKILL.md +++ b/plugins/life-science-research/skills/tpmi-phewas-skill/SKILL.md @@ -3,6 +3,15 @@ name: tpmi-phewas-skill description: Fetch compact TPMI PheWAS summaries for 
single variants by accepting rsID, GRCh37, or GRCh38 input and resolving to the required GRCh38 query. Use when a user wants concise TPMI association results for one variant --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `tpmi-phewas-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/tpmi_phewas.py` for all TPMI PheWAS lookups. - Accept exactly one of `rsid`, `grch37`, `grch38`, or `variant`; resolve to the canonical GRCh38 `chr:pos-ref-alt` query before calling TPMI. @@ -38,4 +47,4 @@ echo '{"grch38":"6:160540105-T-C","max_results":10}' | python scripts/tpmi_phewa ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/tpmi_phewas.py`. +- Keep runtime imports limited to this file, `scripts/tpmi_phewas.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/tpmi-phewas-skill/scripts/tpmi_phewas.py b/plugins/life-science-research/skills/tpmi-phewas-skill/scripts/tpmi_phewas.py index 1a62a78ed..45ecdc6b8 100644 --- a/plugins/life-science-research/skills/tpmi-phewas-skill/scripts/tpmi_phewas.py +++ b/plugins/life-science-research/skills/tpmi-phewas-skill/scripts/tpmi_phewas.py @@ -12,6 +12,7 @@ from __future__ import annotations +from datetime import datetime, timezone import json import re import sys @@ -114,6 +115,21 @@ def write_raw_json(path: Path, data: Any) -> None: path.write_text(json.dumps(data), encoding="utf-8") +def _attach_sources( + output: dict[str, Any], source_name: str, source_url: str +) -> dict[str, Any]: + """Add stable user-facing provenance without changing error payloads.""" + if output.get("ok") and "sources" not in output: + output["sources"] = [ + { + "name": source_name, + "url": source_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + return output + + def main() -> int: warnings: list[str] = [] @@ -184,7 +200,7 @@ def main() -> int: "raw_output_path": None, "warnings": warnings, } - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "Taiwan Precision Medicine Initiative PheWeb", "https://pheweb.ibms.sinica.edu.tw/"))) return 0 associations = extract_associations(data) @@ -220,7 +236,7 @@ def main() -> int: "raw_output_path": saved_raw_output_path, "warnings": warnings, } - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "Taiwan Precision Medicine Initiative PheWeb", "https://pheweb.ibms.sinica.edu.tw/"))) return 0 diff --git a/plugins/life-science-research/skills/ukb-topmed-phewas-skill/SKILL.md b/plugins/life-science-research/skills/ukb-topmed-phewas-skill/SKILL.md index 9ebbc5de1..8a66ecc14 100644 --- a/plugins/life-science-research/skills/ukb-topmed-phewas-skill/SKILL.md +++ 
b/plugins/life-science-research/skills/ukb-topmed-phewas-skill/SKILL.md @@ -3,6 +3,15 @@ name: ukb-topmed-phewas-skill description: Fetch compact UKB-TOPMed PheWAS summaries for single variants by accepting rsID, GRCh37, or GRCh38 input and resolving to the required GRCh38 query. Use when a user wants concise UKB-TOPMed association results for one variant --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `ukb-topmed-phewas-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/ukb_topmed_phewas.py` for all UKB-TOPMed PheWAS lookups. - Accept exactly one of `rsid`, `grch37`, `grch38`, or `variant`; resolve to the canonical GRCh38 `chr:pos-ref-alt` query before calling UKB-TOPMed. @@ -38,4 +47,4 @@ echo '{"grch38":"10:112998590-C-T","max_results":10}' | python scripts/ukb_topme ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/ukb_topmed_phewas.py`. +- Keep runtime imports limited to this file, `scripts/ukb_topmed_phewas.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/ukb-topmed-phewas-skill/scripts/ukb_topmed_phewas.py b/plugins/life-science-research/skills/ukb-topmed-phewas-skill/scripts/ukb_topmed_phewas.py index c6a1d9f72..7817915e4 100644 --- a/plugins/life-science-research/skills/ukb-topmed-phewas-skill/scripts/ukb_topmed_phewas.py +++ b/plugins/life-science-research/skills/ukb-topmed-phewas-skill/scripts/ukb_topmed_phewas.py @@ -12,6 +12,7 @@ from __future__ import annotations +from datetime import datetime, timezone import json import re import sys @@ -112,6 +113,21 @@ def write_raw_json(path: Path, data: Any) -> None: path.write_text(json.dumps(data), encoding="utf-8") +def _attach_sources( + output: dict[str, Any], source_name: str, source_url: str +) -> dict[str, Any]: + """Add stable user-facing provenance without changing error payloads.""" + if output.get("ok") and "sources" not in output: + output["sources"] = [ + { + "name": source_name, + "url": source_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + return output + + def main() -> int: warnings: list[str] = [] @@ -184,7 +200,7 @@ def main() -> int: "raw_output_path": None, "warnings": warnings, } - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "UKB-TOPMed PheWeb", "https://pheweb.org/UKB-TOPMed/"))) return 0 associations = extract_associations(data) @@ -220,7 +236,7 @@ def main() -> int: "raw_output_path": saved_raw_output_path, "warnings": warnings, } - sys.stdout.write(json.dumps(output)) + sys.stdout.write(json.dumps(_attach_sources(output, "UKB-TOPMed PheWeb", "https://pheweb.org/UKB-TOPMed/"))) return 0 diff --git a/plugins/life-science-research/skills/uniprot-skill/SKILL.md b/plugins/life-science-research/skills/uniprot-skill/SKILL.md index 24a5de11e..8a901b145 100644 --- a/plugins/life-science-research/skills/uniprot-skill/SKILL.md +++ b/plugins/life-science-research/skills/uniprot-skill/SKILL.md @@ -3,6 +3,15 @@ name: uniprot-skill 
description: Submit compact UniProt REST API requests for UniProtKB, UniRef, UniParc, and FASTA stream endpoints. Use when a user wants concise UniProt summaries; save raw JSON or FASTA only on request. --- +## Source presentation + +- Follow `../../references/source-presentation.md` for every final user-facing answer. +- Use the `uniprot-skill` entry in `../../references/source-links.json` for authoritative source names and canonical record URL templates. +- Preserve structured `sources` metadata for provenance, but add claim-adjacent Markdown links only for substantive external claims supported by the response. +- Do not force evidence links for connectivity or schema checks, source metadata, empty results, failures, routing-only answers, or sources that returned no supporting evidence. +- Prefer canonical record pages, fall back to sanitized `sources[].request_url` or authoritative `sources[].url` values, and never invent unsupported deep links. +- Preserve explicitly requested raw or machine-readable output without injecting Markdown links. + ## Operating rules - Use `scripts/rest_request.py` for all UniProt API calls. - Use `base_url=https://rest.uniprot.org`. @@ -38,4 +47,4 @@ echo '{"base_url":"https://rest.uniprot.org","path":"uniprotkb/search","params": ``` ## References -- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. +- Keep runtime imports limited to this file, `scripts/rest_request.py`, `../../references/source-presentation.md`, and `../../references/source-links.json`. 
diff --git a/plugins/life-science-research/skills/uniprot-skill/scripts/rest_request.py b/plugins/life-science-research/skills/uniprot-skill/scripts/rest_request.py index 4de3cf105..84dd7d23f 100644 --- a/plugins/life-science-research/skills/uniprot-skill/scripts/rest_request.py +++ b/plugins/life-science-research/skills/uniprot-skill/scripts/rest_request.py @@ -5,8 +5,10 @@ import json import sys +from datetime import datetime, timezone from pathlib import Path from typing import Any +from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit try: import requests @@ -17,6 +19,33 @@ REQUESTS_IMPORT_ERROR = None +SOURCE_NAME = "UniProt" + + +SENSITIVE_QUERY_KEYS = { + "api_key", + "apikey", + "access_token", + "auth", + "authorization", + "bearer", + "client_secret", + "code", + "credential", + "credentials", + "jwt", + "key", + "private_key", + "password", + "refresh_token", + "secret", + "session", + "sig", + "signature", + "token", +} + + def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} @@ -60,6 +89,32 @@ def _service_name(base_url: str) -> str: return host.replace(".", "-") +def _sanitize_request_url(url: str) -> str: + """Return a shareable request URL without secret-like query values.""" + parts = urlsplit(url) + query = [] + for key, value in parse_qsl(parts.query, keep_blank_values=True): + normalized_key = key.casefold().replace("-", "_") + is_sensitive = normalized_key in SENSITIVE_QUERY_KEYS or normalized_key.endswith( + ("_credential", "_jwt", "_key", "_password", "_secret", "_signature", "_token") + ) + query.append((key, "REDACTED" if is_sensitive else value)) + netloc = parts.netloc.rsplit("@", 1)[-1] + return urlunsplit((parts.scheme, netloc, parts.path, urlencode(query), "")) + + +def _sources(source_name: str, request_url: str) -> list[dict[str, str]]: + sanitized_url = _sanitize_request_url(request_url) + 
return [ + { + "name": source_name, + "url": sanitized_url, + "request_url": sanitized_url, + "retrieved_at": datetime.now(timezone.utc).isoformat(), + } + ] + + def _build_url(base_url: str, path: str) -> str: if path.startswith(("http://", "https://")): return path @@ -224,6 +279,9 @@ def execute(payload: Any) -> dict[str, Any]: "status_code": response.status_code, "record_path": path_used, "raw_output_path": raw_output_path, + "sources": _sources( + SOURCE_NAME, str(response.url) + ), "warnings": [], } if isinstance(target, list): @@ -260,6 +318,7 @@ def execute(payload: Any) -> dict[str, Any]: if raw_output_path else len(text_head) < len(response.text), "raw_output_path": raw_output_path, + "sources": _sources(SOURCE_NAME, str(response.url)), "warnings": [], } except ValueError as exc: diff --git a/plugins/life-science-research/tests/test_source_presentation.py b/plugins/life-science-research/tests/test_source_presentation.py new file mode 100644 index 000000000..bd5e2e2ac --- /dev/null +++ b/plugins/life-science-research/tests/test_source_presentation.py @@ -0,0 +1,229 @@ +from __future__ import annotations + +import importlib.util +import json +import sys +import unittest +from pathlib import Path + +PLUGIN_ROOT = Path(__file__).resolve().parents[1] +VALIDATOR_PATH = PLUGIN_ROOT / "scripts" / "validate_source_presentation.py" +REGISTRY_PATH = PLUGIN_ROOT / "references" / "source-links.json" +CONTRACT_PATH = PLUGIN_ROOT / "references" / "source-presentation.md" +SKILLS_DIR = PLUGIN_ROOT / "skills" + + +def _load_module(name: str, path: Path): + spec = importlib.util.spec_from_file_location(name, path) + if spec is None or spec.loader is None: + raise RuntimeError(f"Could not load {path}") + module = importlib.util.module_from_spec(spec) + sys.modules[name] = module + spec.loader.exec_module(module) + return module + + +class SourcePresentationTests(unittest.TestCase): + def test_full_contract_validation(self) -> None: + validator = 
_load_module("source_presentation_validator", VALIDATOR_PATH) + self.assertEqual([], validator.validate()) + + def test_contract_defines_conditional_output_modes(self) -> None: + contract = CONTRACT_PATH.read_text(encoding="utf-8") + normalized_contract = " ".join(contract.split()) + for output_mode in ( + "Record or evidence lookup", + "Search or result list", + "Connectivity or schema check", + "Source metadata or service status", + "Empty result or failed request", + "Router or planner", + "Local synthesis or derived analysis", + "Raw machine-readable output", + ): + self.assertIn(output_mode, contract) + self.assertIn( + "every substantive externally sourced claim should remain traceable", + normalized_contract, + ) + self.assertIn( + "not every skill invocation needs a clickable evidence link", + normalized_contract, + ) + + def test_all_skill_headers_use_v2_conditional_contract(self) -> None: + skill_paths = sorted(SKILLS_DIR.glob("*/SKILL.md")) + self.assertEqual(50, len(skill_paths)) + for skill_path in skill_paths: + text = skill_path.read_text(encoding="utf-8") + self.assertIn("", text) + self.assertIn( + "only for substantive external claims supported by the response", + text, + ) + self.assertIn( + "Do not force evidence links for connectivity or schema checks", + text, + ) + self.assertNotIn("", text) + + def test_router_and_mapper_handle_non_evidence_sources_explicitly(self) -> None: + router = (SKILLS_DIR / "research-router-skill" / "SKILL.md").read_text( + encoding="utf-8" + ) + self.assertIn("Do not seed the router with example citations", router) + self.assertIn("returns no evidence, or fails", router) + self.assertNotIn("UniProt P01116", router) + self.assertNotIn("R-HSA-6802949", router) + + mapper = (SKILLS_DIR / "locus-to-gene-mapper-skill" / "SKILL.md").read_text( + encoding="utf-8" + ) + self.assertIn("Distinguish evidence-contributing sources", mapper) + self.assertIn("queried sources that returned no", mapper) + 
self.assertIn("queried-but-empty sources", mapper) + + def test_representative_canonical_links(self) -> None: + registry = json.loads(REGISTRY_PATH.read_text(encoding="utf-8"))["skills"] + cases = { + ("uniprot-skill", "UniProtKB accession"): ( + "P01116", + "https://www.uniprot.org/uniprotkb/P01116/entry", + ), + ("reactome-skill", "Reactome stable ID"): ( + "R-HSA-6802949", + "https://reactome.org/content/detail/R-HSA-6802949", + ), + ("clinicaltrials-skill", "NCT ID"): ( + "NCT01234567", + "https://clinicaltrials.gov/study/NCT01234567", + ), + ("rcsb-pdb-skill", "PDB ID"): ( + "4OBE", + "https://www.rcsb.org/structure/4OBE", + ), + ("clinvar-variation-skill", "numeric ClinVar Variation ID"): ( + "13080", + "https://www.ncbi.nlm.nih.gov/clinvar/variation/13080/", + ), + ("pride-skill", "PRIDE project accession"): ( + "PXD000001", + "https://www.ebi.ac.uk/pride/archive/projects/PXD000001", + ), + ("ncbi-entrez-skill", "PMID"): ( + "12345678", + "https://pubmed.ncbi.nlm.nih.gov/12345678/", + ), + } + for (skill_name, identifier_type), (identifier, expected) in cases.items(): + templates = registry[skill_name]["record_url_templates"] + template = next( + item["template"] + for item in templates + if item["identifier_type"] == identifier_type + ) + self.assertEqual(expected, template.format(id=identifier)) + + def test_generic_client_redacts_secret_query_values(self) -> None: + client_path = ( + PLUGIN_ROOT / "skills" / "uniprot-skill" / "scripts" / "rest_request.py" + ) + client = _load_module("uniprot_rest_request", client_path) + sanitized = client._sanitize_request_url( # noqa: SLF001 + "https://alice:password@example.org/record?" 
+ "id=P01116&api_key=secret&token=hidden&sig=signed&code=oauth" + "#access_token=fragment-secret" + ) + self.assertTrue(sanitized.startswith("https://example.org/record?")) + self.assertIn("id=P01116", sanitized) + self.assertNotIn("alice", sanitized) + self.assertNotIn("password", sanitized) + self.assertNotIn("secret", sanitized) + self.assertNotIn("hidden", sanitized) + self.assertNotIn("signed", sanitized) + self.assertNotIn("oauth", sanitized) + self.assertNotIn("#", sanitized) + self.assertEqual(4, sanitized.count("REDACTED")) + + def test_generic_clients_use_registry_display_names(self) -> None: + registry = json.loads(REGISTRY_PATH.read_text(encoding="utf-8"))["skills"] + client_paths = sorted(SKILLS_DIR.glob("*/scripts/rest_request.py")) + self.assertEqual(31, len(client_paths)) + + for index, client_path in enumerate(client_paths): + skill_name = client_path.parents[1].name + client = _load_module(f"generic_source_client_{index}", client_path) + self.assertEqual( + registry[skill_name]["source_name"], + client.SOURCE_NAME, + ) + + def test_gtex_source_url_reproduces_the_variant_query(self) -> None: + script_dir = PLUGIN_ROOT / "skills" / "gtex-eqtl-skill" / "scripts" + sys.path.insert(0, str(script_dir)) + try: + client = _load_module( + "gtex_eqtl_source_client", script_dir / "gtex_eqtl.py" + ) + finally: + sys.path.remove(str(script_dir)) + + self.assertEqual( + client.build_request_url("chr10_112998590_C_T_b38"), + "https://gtexportal.org/api/v2/association/singleTissueEqtl?" 
+ "variantId=chr10_112998590_C_T_b38", + ) + + def test_clinicaltrials_summary_keeps_record_link_fields(self) -> None: + client_path = ( + PLUGIN_ROOT + / "skills" + / "clinicaltrials-skill" + / "scripts" + / "clinicaltrials_client.py" + ) + client = _load_module("clinicaltrials_source_client", client_path) + summary = client._compact_study( # noqa: SLF001 + { + "protocolSection": { + "identificationModule": { + "nctId": "NCT01234567", + "briefTitle": "Representative study", + "officialTitle": "Representative official study title", + }, + "statusModule": {"overallStatus": "RECRUITING"}, + "designModule": {"studyType": "INTERVENTIONAL"}, + }, + "hasResults": False, + } + ) + + self.assertEqual(summary["nctId"], "NCT01234567") + self.assertEqual(summary["briefTitle"], "Representative study") + self.assertEqual(summary["overallStatus"], "RECRUITING") + + def test_provenance_helper_leaves_errors_unchanged(self) -> None: + client_path = ( + PLUGIN_ROOT / "skills" / "civic-skill" / "scripts" / "civic_graphql.py" + ) + client = _load_module("civic_graphql_client", client_path) + payload = {"ok": False, "error": {"code": "example", "message": "failed"}} + self.assertIs( + payload, client._attach_sources(payload, "CIViC", "https://civicdb.org/") + ) + self.assertNotIn("sources", payload) + + def test_provenance_helper_adds_authoritative_source_url(self) -> None: + client_path = ( + PLUGIN_ROOT / "skills" / "civic-skill" / "scripts" / "civic_graphql.py" + ) + client = _load_module("civic_graphql_success_client", client_path) + payload = {"ok": True, "summary": {"variant": "KRAS G12C"}} + output = client._attach_sources(payload, "CIViC", "https://civicdb.org/") + self.assertEqual("CIViC", output["sources"][0]["name"]) + self.assertEqual("https://civicdb.org/", output["sources"][0]["url"]) + self.assertNotIn("request_url", output["sources"][0]) + + +if __name__ == "__main__": + unittest.main()