From 574d446edca76bc7383ad58f355dbb98fd696371 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Sun, 23 Nov 2025 13:52:52 +0000 Subject: [PATCH 1/4] Fixup CLI docs --- docs/cli.md | 85 +++++++++++++++++++++++++++++++++++++++++++++------- sc2ts/cli.py | 2 +- 2 files changed, 76 insertions(+), 11 deletions(-) diff --git a/docs/cli.md b/docs/cli.md index 7fe88be..890a706 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -33,19 +33,84 @@ following order: ## CLI reference - - - - +% A note on cross references... There's some weird long-standing problem with +% cross referencing program values in Sphinx, which means that we can't use +% the built-in labels generated by sphinx-click. We can make our own explicit +% targets, but these have to have slightly weird names to avoid conflicting +% with what sphinx-click is doing. So, hence the cmd- prefix. +% Based on: https://github.com/skypilot-org/skypilot/pull/2834 -:::{todo} -Add the sphinx-click output here somehow. -::: +### Data import + +```{eval-rst} +.. _cmd-sc2ts-import-alignments: +.. click:: sc2ts.cli:import_alignments + :prog: sc2ts import-alignments +``` + +```{eval-rst} +.. _cmd-sc2ts-import-metadata: +.. click:: sc2ts.cli:import_metadata + :prog: sc2ts import-metadata +``` + +### Inference + +```{eval-rst} +.. _cmd-sc2ts-infer: +.. click:: sc2ts.cli:infer + :prog: sc2ts infer +``` + +### Inspection + +```{eval-rst} +.. _cmd-sc2ts-info-dataset: +.. click:: sc2ts.cli:info_dataset + :prog: sc2ts info-dataset +``` + +```{eval-rst} +.. _cmd-sc2ts-info-matches: +.. click:: sc2ts.cli:info_matches + :prog: sc2ts info-matches +``` + +### Postprocessing + +```{eval-rst} +.. _cmd-sc2ts-postprocess: +.. click:: sc2ts.cli:postprocess + :prog: sc2ts postprocess +``` + +```{eval-rst} +.. _cmd-sc2ts-map-parsimony: +.. click:: sc2ts.cli:map_parsimony + :prog: sc2ts map-parsimony +``` + +```{eval-rst} +.. _cmd-sc2ts-minimise-metadata: +.. 
click:: sc2ts.cli:minimise_metadata + :prog: sc2ts minimise-metadata +``` + +### Miscellaneous + +% For some reason this one isn't working. Not worth worrying about. - - - + + + +```{eval-rst} +.. _cmd-sc2ts-run-hmm: +.. click:: sc2ts.cli:run_hmm + :prog: sc2ts run-hmm +``` + + diff --git a/sc2ts/cli.py b/sc2ts/cli.py index 57c97e9..cb6183e 100644 --- a/sc2ts/cli.py +++ b/sc2ts/cli.py @@ -130,7 +130,7 @@ def setup_logging(verbosity, log_file=None, date=None): is_flag=True, flag_value=True, help=( - "If true, initialise a new dataset. WARNING! This will erase and existing " + "If true, initialise a new dataset. WARNING! This will erase an existing " "store" ), ) From 648debec71996fe0fa4a72beede39db2ec395451 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Sun, 23 Nov 2025 14:23:16 +0000 Subject: [PATCH 2/4] Document toml format --- docs/example_config.toml | 39 ++++++++++++++++++++++++++++++++++++--- docs/inference.md | 14 +++++++++++--- 2 files changed, 47 insertions(+), 6 deletions(-) diff --git a/docs/example_config.toml b/docs/example_config.toml index f3879fb..daf187b 100644 --- a/docs/example_config.toml +++ b/docs/example_config.toml @@ -1,6 +1,13 @@ + +# This is a path to the dataset, in VCZ format. dataset="viridian_mafft_2024-10-14_v1.vcz.zip" +# The metadata field used for dates. For the Viridian dataset, this is +# "Date_tree" (which means, "date used to partition samples when building +# the Viridian tree") date_field="Date_tree" +# The run_id is a prefix added to all output files. This is useful when +# running lots of different parameter combinations. run_id="ex1" # Configure where the result files are stored. For simplicity # we put them all in the "example_inference" directory. @@ -13,8 +20,9 @@ matches_dir= "example_inference/" # This is full debug output, which is verbose (but useful!) log_level = 2 -# Dates to exclude from inference. 
This one is a large outlier in terms of the -# numbers of samples, and enriched for incorrectly assigned dates. +# Dates to exclude from inference. This one is a large outlier in the +# Viridian data in terms of the numbers of samples, and enriched for +# incorrectly assigned dates. exclude_dates = ["2020-12-31"] # The set of site positions to mask during inference (list of integers). @@ -23,24 +31,49 @@ exclude_dates = ["2020-12-31"] exclude_sites = [] [extend_parameters] +# The recombination penalty "k" parameter num_mismatches=4 +# Any samples with a HMM cost <= this value are included in the ARG hmm_cost_threshold=7 +# The maximum number of missing sites for a sample to be considered max_missing_sites=500 +# Do we mask deletions as missing data? deletions_as_missing=true +# The maximum number of samples to consider, per day # max_daily_samples=1000 -# Knobs for tuning retro group insertion +## Various knobs for tuning retro group insertion: + +# The minimum number of samples in a retro group min_group_size=10 +# The minimum number of mutations shared by all samples min_root_mutations=2 +# The maximum number of recurrent mutations in the group tree max_recurrent_mutations=2 +# The maximum number of mutations per sample, overall max_mutations_per_sample=5 +# The size of the window in which to consider samples for retrospective +# inclusion, in days. retrospective_window=7 +## Performance parameters. + +# The number of matching threads to use. -1 means use all available cores. +# Note that this will likely not make much difference until large numbers +# of samples per day are involved. num_threads=-1 +# An approximate ceiling on the total amount of memory used (in GiB) by HMM +# matching. Once the memory used goes above this value, new HMM match jobs are +# held back until it goes under it again. If many memory intensive match jobs +# are run at once however, this will not prevent them from exceeding this +# limit. 
memory_limit=32 +# A list of sample IDs (strings) for unconditional inclusion (e.g., to +# help seed major saltation events). include_samples=[] +# Override specific parameter values over a time period. [[override]] start = "2020-01-01" stop = "2020-03-01" diff --git a/docs/inference.md b/docs/inference.md index 53bf95b..58728da 100644 --- a/docs/inference.md +++ b/docs/inference.md @@ -94,9 +94,17 @@ debugging metadata included (see the section on the Debug utilities below) Primary inference can be stopped and picked up again at any point using the ``--start`` option. -:::{todo} -Add documentation for the toml config file -::: + + + +### Config file format + +All parameters for primary inference are specified using the [toml](https://toml.io/en/) +config file. These are documented in the example config file used here: + +```{literalinclude} example_config.toml +:language: toml +``` ## Postprocessing From 0f181b7a0fa1053c1adb9c41f8de3139815ece72 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Sun, 23 Nov 2025 14:30:03 +0000 Subject: [PATCH 3/4] Fixup signposts --- docs/inference.md | 2 +- docs/intro.md | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/docs/inference.md b/docs/inference.md index 58728da..1109253 100644 --- a/docs/inference.md +++ b/docs/inference.md @@ -8,7 +8,7 @@ on a local machine using an example config file, using the Viridian data downloa from Zenodo. Inference is performed using the CLI, which is composed of number of subcommands. -See {ref}`sc2ts_sec_cli` section for more information +See the {ref}`sc2ts_sec_cli` section for more information ## Prerequisites diff --git a/docs/intro.md b/docs/intro.md index 8f2fd08..5a169d3 100644 --- a/docs/intro.md +++ b/docs/intro.md @@ -11,12 +11,20 @@ It consists of: 3. A lightweight wrapper around [Zarr](https://zarr.dev) for convenient access to the Viridian dataset (alignments and metadata) in VCF Zarr format. 
-The underlying methods are described in the sc2ts [preprint]( +The methods are described in the sc2ts [preprint]( ). -Most users will use the {ref}`sec_python_api` to perform {ref}`sec_arg_analysis` -on the sc2ts inferred ARG or {ref}`sec_alignments_analysis` on the -Zarr-formatted Viridian dataset distributed on Zenodo. -Uses who wish to perform {ref}`sec_inference` use the -{ref}`sc2ts_sec_cli`. +## Quickstart + +- See the {ref}`sec_inference` section for an example of running +primary inference using the {ref}`sc2ts_sec_cli`. + +- See the {ref}`sec_arg_analysis` section for examples of using the +{ref}`sec_python_api` to analyse the sc2ts Viridian ARG. + +- See the {ref}`sec_alignments_analysis` section for examples +of using the {ref}`sec_python_api` to analyse the Viridian +alignments and metadata in +[VCF Zarr format](https://doi.org/10.1093/gigascience/giaf049). + From 3c17ff21400e5dc83aea2c70956f673529479288 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Sun, 23 Nov 2025 14:39:05 +0000 Subject: [PATCH 4/4] Update docs requirements --- pyproject.toml | 7 +++++++ sc2ts/cli.py | 1 - 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 39951c8..4b3c2b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,6 +59,13 @@ docs = [ "sphinx-argparse==0.5.2", "sphinx-issues==5.0.1", "IPython", + # docs requires running the CLI, which means we need the full inference + # requirements also + "scipy", + "biotite", + "tsinfer>=0.5", + "pyfaidx", + "numba", ] [build-system] diff --git a/sc2ts/cli.py b/sc2ts/cli.py index cb6183e..8df17e6 100644 --- a/sc2ts/cli.py +++ b/sc2ts/cli.py @@ -16,7 +16,6 @@ import tqdm import tskit import tszip -import tsinfer import click import humanize import pandas as pd