diff --git a/examples/README.md b/examples/README.md index 4368e33..fd6b7c3 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1,7 +1,5 @@ ## GenomicsDB simple query tool -Note that there is `run.sh` bash script for ease of use and if you do not want to invoke the genomicsdb_query CLI directly. - Simple GenomicsDB query tool `genomicsdb_query`, given a workspace and genomic intervals of the form `:-`. The intervals at a minimum need to have the contig specified, start and end are optional. e.g chr1:100-1000, chr1:100 and chr1 are all valid. Start defaults to 1 if not specified and end defaults to the length of the contig if not specified. Assumption : The workspace should have been created with the `vcf2genomicsdb` tool or with `gatk GenomicsDBImport` and should exist. @@ -10,7 +8,7 @@ Assumption : The workspace should have been created with the `vcf2genomicsdb` to ~/GenomicsDB-Python/examples: ./genomicsdb_query --help usage: query [options] -GenomicsDB simple query with samples/intervals/filter as inputs +GenomicsDB simple query with samples/intervals/attributes/filter as inputs options: -h, --help show this help message and exit @@ -25,8 +23,9 @@ options: -l LOADER, --loader LOADER Optional - URL to loader file. Defaults to loader.json in workspace --list-samples List samples ingested into the workspace and exit - --list-contigs List contigs for the ingested samples in the workspace and exit - --list-partitions List interval partitions for the ingested samples in the workspace and exit + --list-contigs List contigs configured in vid mapping for the workspace and exit + --list-fields List genomic fields configured in vid mapping for the workspace and exit + --list-partitions List interval partitions(genomicsdb arrays in the workspace) for the given intervals(-i/--interval or -I/--interval-list) or all the intervals for the workspace and exit -i INTERVAL, --interval INTERVAL genomic intervals over which to operate. 
The intervals should be specified in the :- format with START and END optional. This argument may be specified 0 or more times e.g -i chr1:1-10000 -i chr2 -i chr3:1000. @@ -49,8 +48,14 @@ options: Note: 1. -s/--sample and -S/--sample-list are mutually exclusive 2. either samples and/or intervals using -i/-I/-s/-S options has to be specified + -a ATTRIBUTES, --attributes ATTRIBUTES + Optional - comma separated list of genomic attributes or fields described in the vid mapping for the query, eg. GT,AC,PL,DP... Defaults to GT -f FILTER, --filter FILTER Optional - genomic filter expression for the query, e.g. 'ISHOMREF' or 'ISHET' or 'REF == "G" && resolve(GT, REF, ALT) &= "T/T" && ALT |= "T"' + -n NPROC, --nproc NPROC + Optional - number of processing units for multiprocessing(default: 8). Run nproc from command line to print the number of processing units available to a process for the user + --chunk-size CHUNK_SIZE + Optional - hint to split number of samples for multiprocessing used in conjunction with -n/--nproc and when -s/-S/--sample/--sample-list is not specified (default: 10240) -t {csv,json,arrow}, --output-type {csv,json,arrow} Optional - specify type of output for the query (default: csv) -j {all,all-by-calls,samples-with-num-calls,samples,num-calls}, --json-output-type {all,all-by-calls,samples-with-num-calls,samples,num-calls} @@ -58,7 +63,10 @@ options: -z MAX_ARROW_BYTE_SIZE, --max-arrow-byte-size MAX_ARROW_BYTE_SIZE Optional - used in conjunction with -t/--output-type arrow as hint for buffering parquet files(default: 64MB) -o OUTPUT, --output OUTPUT - a prefix filename to csv outputs from the tool. The filenames will be suffixed with the interval and .csv/.json (default: query_output) + a prefix filename to outputs from the tool. The filenames will be suffixed with the interval and .csv/.json/... 
(default: query_output) + -d, --dryrun displays the query that will be run without actually executing the query (default: False) + -b, --bypass-intersecting-intervals-phase + iterate only once bypassing the intersecting intervals phase (default: False) ``` Run `genomicsdb_query` with the -w and --list-samples/--list-contigs to figure out legitimate samples and contigs over which the query can operate. These can be used with the --samples/--intervals options later to run the actual query. @@ -97,5 +105,40 @@ query_output_1-100-100000.csv query_output_1-100001.csv query_output_2.csv ``` +### Caching for enhanced performance + +Locally caching artifacts from cloud URLs is optional for GenomicsDB metadata and helps with performance for metadata/artifacts which can be accessed multiple times. There is a separate caching tool `genomicsdb_cache` which takes as inputs the workspace, optionally the callset/vidmap/loader json files, and also optionally the intervals, given with the -i/--interval or -I/--interval-list option. Note that the json files are downloaded to the current working directory whereas other metadata are persisted in `$TMPDIR` or in `/tmp`. Caching is envisioned to be done once, before the first query over those intervals is started. Set the env variable `TILEDB_CACHE` to `1` and explicitly use `-c callset.json -v vidmap.json -l loader.json` with the `genomicsdb_query` command to access locally cached GenomicsDB metadata and json artifacts. + +``` +~/GenomicsDB-Python/examples: ./genomicsdb_cache -h +usage: cache [options] + +Cache GenomicsDB metadata and generated callset/vidmap/loader json artifacts for workspace cloud URLs + +options: + -h, --help show this help message and exit + --version print GenomicsDB native library version and exit + -w WORKSPACE, --workspace WORKSPACE + URL to GenomicsDB workspace + e.g. 
-w my_workspace or -w az://my_container/my_workspace or -w s3://my_bucket/my_workspace or -w gs://my_bucket/my_workspace + -v VIDMAP, --vidmap VIDMAP + Optional - URL to vid mapping file. Defaults to vidmap.json in workspace + -c CALLSET, --callset CALLSET + Optional - URL to callset mapping file. Defaults to callset.json in workspace + -l LOADER, --loader LOADER + Optional - URL to loader file. Defaults to loader.json in workspace + -i INTERVAL, --interval INTERVAL + Optional - genomic intervals over which to operate. The intervals should be specified in the :- format with START and END optional. + This argument may be specified 0 or more times e.g -i chr1:1-10000 -i chr2 -i chr3:1000. + Note: + 1. -i/--interval and -I/--interval-list are mutually exclusive + 2. either samples and/or intervals using -i/-I/-s/-S options has to be specified + -I INTERVAL_LIST, --interval-list INTERVAL_LIST + Optional - genomic intervals listed in a file over which to operate. + The intervals should be specified in the :- format, with START and END optional one interval per line. + Note: + 1. -i/--interval and -I/--interval-list are mutually exclusive + 2. either samples and/or intervals using -i/-I/-s/-S options has to be specified +``` + -For ease of use, open run.sh and change the `WORKSPACE`, `INTERVALS` and other commented out variables to what is desired before invoking it. Variables `VIDMAP_FILE` and `LOADER_FILE` need to be set only if they are not in the workspace. run.sh calls genomicsdb_query, the tool does the querying of the workspace for the intervals specified and outputs one csv file per input interval. 
diff --git a/examples/genomicsdb_cache b/examples/genomicsdb_cache index 1454f63..2f57f86 100755 --- a/examples/genomicsdb_cache +++ b/examples/genomicsdb_cache @@ -49,7 +49,7 @@ def get_arrays(interval, contigs_map, partitions): def main(): parser = argparse.ArgumentParser( prog="cache", - description="Cache GenomicsDB generated json artifacts for workspace cloud URLs", + description="Cache GenomicsDB metadata and generated callset/vidmap/loader json artifacts for workspace cloud URLs", # noqa formatter_class=argparse.RawTextHelpFormatter, usage="%(prog)s [options]", ) @@ -86,13 +86,13 @@ def main(): "--interval", action="append", required=False, - help="genomic intervals over which to operate. The intervals should be specified in the :- format with START and END optional.\nThis argument may be specified 0 or more times e.g -i chr1:1-10000 -i chr2 -i chr3:1000. \nNote: \n\t1. -i/--interval and -I/--interval-list are mutually exclusive \n\t2. either samples and/or intervals using -i/-I/-s/-S options has to be specified", # noqa + help="Optional - genomic intervals over which to operate. The intervals should be specified in the :- format with START and END optional.\nThis argument may be specified 0 or more times e.g -i chr1:1-10000 -i chr2 -i chr3:1000. \nNote: \n\t1. -i/--interval and -I/--interval-list are mutually exclusive \n\t2. either samples and/or intervals using -i/-I/-s/-S options has to be specified", # noqa ) parser.add_argument( "-I", "--interval-list", required=False, - help="genomic intervals listed in a file over which to operate.\nThe intervals should be specified in the :- format, with START and END optional one interval per line. \nNote: \n\t1. -i/--interval and -I/--interval-list are mutually exclusive \n\t2. 
either samples and/or intervals using -i/-I/-s/-S options has to be specified", # noqa + help="Optional - genomic intervals listed in a file over which to operate.\nThe intervals should be specified in the :- format, with START and END optional one interval per line. \nNote: \n\t1. -i/--interval and -I/--interval-list are mutually exclusive \n\t2. either samples and/or intervals using -i/-I/-s/-S options has to be specified", # noqa ) args = parser.parse_args() diff --git a/examples/run.sh b/examples/run.sh deleted file mode 100755 index da38252..0000000 --- a/examples/run.sh +++ /dev/null @@ -1,126 +0,0 @@ -#!/bin/bash -# -# query script -# -# The MIT License -# -# Copyright (c) 2024 dātma, inc™ -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. 
-# -# - -set -e - -export WORKSPACE=${WORKSPACE:-my_workspace} -export CALLSET_FILE=${CALLSET_FILE:-$WORKSPACE/callset.json} -export VIDMAP_FILE=${VIDMAP_FILE:-$WORKSPACE/vidmap.json} -export LOADER_FILE=${LOADER_FILE:-$WORKSPACE/loader.json} - -declare -a INTERVALS -INTERVALS=("1:1-1000000") -#INTERVALS=("1:1-1000000" "1:1000001-2000000" "1:2000001-3137454") -#INTERVALS=("chr1:1-200000") -#INTERVALS=("1:1-3137454") -#INTERVALS=("1:1-40000000") -#INTERVALS=("1:1-4000000" "1:8000001-16000000" "1:16000001-24000000" "1:24000001-32000000" "1:32000001-40000000" "2:3000" "3") -#INTERVALS=("1:1-12549816") -#INTERVALS=("1:1-3137454" "1:3137455-6274908" "1:6274909-9412362" "1:9412363-12549816") - -#declare -a SAMPLES -#SAMPLES=("HG00096" "HG00097" "HG00099") -#SAMPLES_LIST=samples.list - -#FILTER='resolve(GT, REF, ALT) &= "T/T"' -FILTER='!ISHOMREF' - -export OUTPUT_FILE=${OUTPUT_FILE:-my_output} -export OUTPUT_FILE_TYPE=${OUTPUT_FILE_TYPE:-json} - -export TILEDB_CACHE=1 -NTHREADS=${NTHREADS:-8} - -VENV=${VENV:-env} - -########################################### -# Should not have to change anything below -########################################### - -if [[ ! -d $VENV ]]; then - python3 -m venv $VENV - source env/bin/activate - pip install genomicsdb -else - source $VENV/bin/activate -fi - -PATH=$(dirname $0):$PATH - -if [[ ! -z ${SAMPLES} ]]; then - for SAMPLE in "${SAMPLES[@]}" - do - SAMPLE_ARGS="$SAMPLE_ARGS -s $SAMPLE" - done -fi - -if [[ ! -z $SAMPLES_LIST ]]; then - export SAMPLE_ARGS="-S $SAMPLE_LIST" -fi - -if [[ ! 
-z ${FILTER} ]]; then - export FILTER_EXPR="-f $FILTER" -fi - -echo $LOADER_FILE $CALLSET_FILE $VIDMAP_FILE - -rm -f loader.json callset.json vidmap.json -for INTERVAL in "${INTERVALS[@]}" -do - INTERVAL_LIST="$INTERVAL_LIST -i $INTERVAL" -done - -genomicsdb_cache -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE $INTERVAL_LIST - -if [[ -f loader.json ]]; then - export LOADER_FILE="loader.json" -fi -if [[ -f callset.json ]]; then - export CALLSET_FILE="callset.json" -fi -if [[ -f vidmap.json ]]; then - export VIDMAP_FILE="vidmap.json" -fi - -if [[ $(uname) == "Darwin" ]]; then - export MEASURE_PERFORMANCE="/usr/bin/time -l" -else - export MEASURE_PERFORMANCE="/usr/bin/time -v" -fi - -run_query() { - INTERVAL=$1 - OUTPUT_FILE=$2 - echo genomicsdb_query -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE -i $INTERVAL $SAMPLE_ARGS $FILTER_EXPR -o $OUTPUT_FILE -t $OUTPUT_FILE_TYPE - $MEASURE_PERFORMANCE genomicsdb_query -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE -i $INTERVAL $SAMPLE_ARGS $FILTER_EXPR -o $OUTPUT_FILE -t $OUTPUT_FILE_TYPE -} - -export -f run_query -parallel -j${NTHREADS} run_query {} $OUTPUT_FILE ::: ${INTERVALS[@]} - -deactivate