From 361ce9ddf27b1dc44bd512a98cf5c4acd2df8caa Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Mon, 27 Jan 2025 08:40:00 -0800 Subject: [PATCH 1/8] Support listing descriptions with the --list-fields option if the template header files are present in the workspace --- genomicsdb/scripts/genomicsdb_query.py | 62 ++++++++++++++++++++++---- test/scripts/test.sh | 2 + 2 files changed, 56 insertions(+), 8 deletions(-) diff --git a/genomicsdb/scripts/genomicsdb_query.py b/genomicsdb/scripts/genomicsdb_query.py index ef59c46..66131b3 100644 --- a/genomicsdb/scripts/genomicsdb_query.py +++ b/genomicsdb/scripts/genomicsdb_query.py @@ -116,7 +116,7 @@ def parse_callset_json_for_split_row_ranges(callset_file, chunk_size): return split_row_ranges -def print_fields(key, val): +def print_fields(key, val, descriptions): if "vcf_field_class" not in val: val["vcf_field_class"] = ["FILTER"] if "length" not in val: @@ -148,19 +148,64 @@ def print_fields(key, val): field_type = "String" else: field_type = "Char" - print(f"{key:<20} {field_class:10} {field_type:10} {field_length}") + tuple_index = (key, field_class) + if descriptions and tuple_index in descriptions: + print(f"{key:<20} {field_class:10} {field_type:10} {field_length:10} {descriptions[tuple_index]}") + else: + print(f"{key:<20} {field_class:10} {field_type:10} {field_length}") -def parse_vidmap_json_and_print_fields(vidmap_file): +def parse_template_header_file(template_header_file): + if not genomicsdb.is_file(template_header_file): + return None + header_contents = genomicsdb.read_entire_file(template_header_file) + start = 0 + template_header_fields = {} + while True: + end = header_contents.find("\n", start) + if end == -1: + break + line = header_contents[start:end] + if line.startswith("##INFO") or line.startswith("##FORMAT") or line.startswith("##FILTER"): + try: + line = line[2:] + field_start = line.find("=<") + field_end = line.find(">") + field_type = line[0:field_start] + fields=line[field_start+2:field_end].split(",") + field_id = None + field_description = None + for field in fields: + val = field.find("ID=") + if val == 0: + field_id = field[val+len("ID="):] + else: + val = field.find("Description=") + if val == 0: + field_description = field[val+len("Description="):] + if field_id and field_description: + template_header_fields[(field_id, field_type)] = field_description + except Exception as e: + logging.error(f"Exception={e} when processing {template_header_file} for line={line}") + start = end + 1 + return template_header_fields + + +def parse_and_print_fields(vidmap_file, template_header_file): + descriptions = parse_template_header_file(template_header_file) vidmap = json.loads(genomicsdb.read_entire_file(vidmap_file)) fields = vidmap["fields"] - print(f"{'Field':20} {'Class':10} {'Type':10} {'Length'}") - print(f"{'-----':20} {'-----':10} {'----':10} {'------'}") + if descriptions: + print(f"{'Field':20} {'Class':10} {'Type':10} {'Length':10} {'Description'}") + print(f"{'-----':20} {'-----':10} {'----':10} {'------':10} {'-----------'}") + else: + print(f"{'Field':20} {'Class':10} {'Type':10} {'Length'}") + print(f"{'-----':20} {'-----':10} {'----':10} {'------'}") if isinstance(fields, list): - {print_fields(field["name"], field) for field in fields} + {print_fields(field["name"], field, descriptions) for field in fields} else: # Old style vidmap json for key, val in fields.items(): - print_fields(key, val) + print_fields(key, val, descriptions) # See https://github.com/GenomicsDB/GenomicsDB/wiki/Importing-VCF-data-into-GenomicsDB # for description of lengths in vid mapping files abbreviations = { @@ -379,7 +424,8 @@ def setup(): # List fields if args.list_fields: - parse_vidmap_json_and_print_fields(vidmap_file) + template_header_file = workspace + "/vcfheader.vcf" + parse_and_print_fields(vidmap_file, template_header_file) sys.exit(0) intervals = args.interval diff --git a/test/scripts/test.sh b/test/scripts/test.sh index 905da13..26d7b03 100755 --- a/test/scripts/test.sh +++ b/test/scripts/test.sh @@ -244,5 +244,7 @@ run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS --lis run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS -S $TEMP_DIR/samples.list -a GT -o $OUTPUT" +rm $WORKSPACE/vcfheader.vcf +run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-fields" cleanup From 77d0efbbce61a779bac0263b60557b41e5be8889 Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Mon, 27 Jan 2025 08:42:34 -0800 Subject: [PATCH 2/8] Lint --- genomicsdb/scripts/genomicsdb_query.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/genomicsdb/scripts/genomicsdb_query.py b/genomicsdb/scripts/genomicsdb_query.py index 66131b3..62e82aa 100644 --- a/genomicsdb/scripts/genomicsdb_query.py +++ b/genomicsdb/scripts/genomicsdb_query.py @@ -172,17 +172,17 @@ def parse_template_header_file(template_header_file): field_start = line.find("=<") field_end = line.find(">") field_type = line[0:field_start] - fields=line[field_start+2:field_end].split(",") + fields = line[field_start + 2 : field_end].split(",") field_id = None field_description = None for field in fields: val = field.find("ID=") if val == 0: - field_id = field[val+len("ID="):] + field_id = field[val + len("ID=") :] else: val = field.find("Description=") if val == 0: - field_description = field[val+len("Description="):] + field_description = field[val + len("Description=") :] if field_id and field_description: template_header_fields[(field_id, field_type)] = field_description except Exception as e: From ac7f4b5a27aa520358d1ce620d63e8e559f44f07 Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Wed, 29 Jan 2025 15:04:15 -0800 Subject: [PATCH 3/8] Make default attributes for consideration as REF,GT and add --no-cache an option to the script --- genomicsdb/scripts/genomicsdb_common.py | 10 +++- genomicsdb/scripts/genomicsdb_query.py | 61 +++++++++++++++++-------- 2 files changed, 52 insertions(+), 19 deletions(-) diff --git a/genomicsdb/scripts/genomicsdb_common.py b/genomicsdb/scripts/genomicsdb_common.py index 9d2e540..b54820d 100644 --- a/genomicsdb/scripts/genomicsdb_common.py +++ b/genomicsdb/scripts/genomicsdb_common.py @@ -34,10 +34,18 @@ def normalize_path(path): if "://" in path: - return path + if path.endswith("/"): + return path[0:len(path)-2] + else: + return path else: + # os.path.abspath removes trailing slash return os.path.abspath(path) +def join_paths(path1, path2): + return path1+"/"+path2 + + def parse_vidmap_json(vidmap_file, intervals=None): if isinstance(intervals, str): diff --git a/genomicsdb/scripts/genomicsdb_query.py b/genomicsdb/scripts/genomicsdb_query.py index 62e82aa..b3584ed 100644 --- a/genomicsdb/scripts/genomicsdb_query.py +++ b/genomicsdb/scripts/genomicsdb_query.py @@ -222,19 +222,23 @@ def parse_and_print_fields(vidmap_file, template_header_file): def parse_vidmap_json_for_attributes(vidmap_file, attributes=None): if attributes is None or len(attributes) == 0: - return ["GT"] - else: - vidmap = json.loads(genomicsdb.read_entire_file(vidmap_file)) - fields = vidmap["fields"] - if isinstance(fields, list): - fields = [field["name"] for field in fields] - else: # Old style vidmap json - fields = fields.keys() - attributes = attributes.replace(" ", "").split(",") - not_found = [attribute for attribute in attributes if attribute not in fields] - if len(not_found) > 0: - raise RuntimeError(f"Attributes({not_found}) not found in vid mapping({vidmap_file})") - return attributes + # Default + return ["REF", "GT"] + + vidmap = json.loads(genomicsdb.read_entire_file(vidmap_file)) + fields = vidmap["fields"] + if isinstance(fields, list): + fields = [field["name"] for field in fields] + else: # Old style vidmap json + fields = fields.keys() + fields = set(fields) + fields.add("REF") + fields.add("ALT") + attributes = attributes.replace(" ", "").split(",") + not_found = [attribute for attribute in attributes if attribute not in fields] + if len(not_found) > 0: + raise RuntimeError(f"Attributes({not_found}) not found in vid mapping({vidmap_file})") + return attributes def parse_loader_json(loader_file, interval_form=True): @@ -306,6 +310,11 @@ def setup(): action="store_true", help="List interval partitions(genomicsdb arrays in the workspace) for the given intervals(-i/--interval or -I/--interval-list) or all the intervals for the workspace and exit", # noqa ) + parser.add_argument( + "--no-cache", + action="store_true", + help="Do not use cached metadata and files with the genomicsdb query", + ) parser.add_argument( "-i", "--interval", @@ -336,7 +345,7 @@ def setup(): "-a", "--attributes", required=False, - help="Optional - comma separated list of genomic attributes or fields described in the vid mapping for the query, eg. GT,AC,PL,DP... Defaults to GT", # noqa + help="Optional - comma separated list of genomic attributes(REF, ALT) and fields described in the vid mapping for the query, eg. GT,AC,PL,DP... Defaults to REF,GT", # noqa ) parser.add_argument( "-f", @@ -402,13 +411,22 @@ def setup(): raise RuntimeError(f"workspace({workspace}) not found") callset_file = args.callset if not callset_file: - callset_file = workspace + "/callset.json" + if not args.no_cache and genomicsdb.is_file("callset.json"): + callset_file = "callset.json" + else: + callset_file = genomicsdb_common.join_paths(workspace, "callset.json") vidmap_file = args.vidmap if not vidmap_file: - vidmap_file = workspace + "/vidmap.json" + if not args.no_cache and genomicsdb.is_file("vidmap.json"): + vidmap_file = "vidmap.json" + else: + vidmap_file = genomicsdb_common.join_paths(workspace, "vidmap.json") loader_file = args.loader if not loader_file: - loader_file = workspace + "/loader.json" + if not args.no_cache and genomicsdb.is_file("loader.json"): + loader_file = "loader.json" + else: + loader_file = genomicsdb_common.join_paths(workspace, "loader.json") if ( not genomicsdb.is_file(callset_file) or not genomicsdb.is_file(vidmap_file) @@ -466,6 +484,11 @@ def setup(): row_tuples = parse_callset_json_for_row_ranges(callset_file, samples or sample_list) attributes = parse_vidmap_json_for_attributes(vidmap_file, args.attributes) + if args.no_cache: + os.environ.pop("TILEDB_CACHE", None) + else: + os.environ["TILEDB_CACHE"] = "1" + return workspace, callset_file, vidmap_file, partitions, contigs_map, intervals, row_tuples, attributes, args @@ -588,6 +611,8 @@ def process(config): query_config = config.query_config output_config = config.output_config msg = f"array({query_config.array_name}) for interval({query_config.interval})" + if query_config.row_tuples: + msg += f" and rows({query_config.row_tuples})" if not genomicsdb.array_exists(export_config.workspace, query_config.array_name): logging.error(msg + f" not imported into workspace({export_config.workspace})") return -1 @@ -636,7 +661,7 @@ def process(config): gdb = None return -1 - logging.info(f"Processed array {query_config.array_name} for interval {query_config.interval}") + logging.info(f"Processed {msg}") return 0 From e039001a5a193d838bb4168b06893fae9ea8aa1d Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Wed, 29 Jan 2025 15:04:55 -0800 Subject: [PATCH 4/8] Lint --- genomicsdb/scripts/genomicsdb_common.py | 6 +++--- genomicsdb/scripts/genomicsdb_query.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/genomicsdb/scripts/genomicsdb_common.py b/genomicsdb/scripts/genomicsdb_common.py index b54820d..fef832d 100644 --- a/genomicsdb/scripts/genomicsdb_common.py +++ b/genomicsdb/scripts/genomicsdb_common.py @@ -35,16 +35,16 @@ def normalize_path(path): if "://" in path: if path.endswith("/"): - return path[0:len(path)-2] + return path[0 : len(path) - 2] else: return path else: # os.path.abspath removes trailing slash return os.path.abspath(path) -def join_paths(path1, path2): - return path1+"/"+path2 +def join_paths(path1, path2): + return path1 + "/" + path2 def parse_vidmap_json(vidmap_file, intervals=None): diff --git a/genomicsdb/scripts/genomicsdb_query.py b/genomicsdb/scripts/genomicsdb_query.py index b3584ed..e0bd571 100644 --- a/genomicsdb/scripts/genomicsdb_query.py +++ b/genomicsdb/scripts/genomicsdb_query.py @@ -314,7 +314,7 @@ def setup(): "--no-cache", action="store_true", help="Do not use cached metadata and files with the genomicsdb query", - ) + ) parser.add_argument( "-i", "--interval", From 186224948daf57564ce50b0dc46298d3b0347caf Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Wed, 29 Jan 2025 17:34:31 -0800 Subject: [PATCH 5/8] Check if workspace is in the cloud for locating cached json files --- genomicsdb/scripts/genomicsdb_cache.py | 2 +- genomicsdb/scripts/genomicsdb_query.py | 5 +++-- test/scripts/test.sh | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/genomicsdb/scripts/genomicsdb_cache.py b/genomicsdb/scripts/genomicsdb_cache.py index 03f4cd4..5e86fab 100644 --- a/genomicsdb/scripts/genomicsdb_cache.py +++ b/genomicsdb/scripts/genomicsdb_cache.py @@ -48,7 +48,7 @@ def get_arrays(interval, contigs_map, partitions): def main(): parser = argparse.ArgumentParser( prog="cache", - description="Cache GenomicsDB metadata and generated callset/vidmap/loader json artifacts for workspace cloud URLs", # noqa + description="Cache GenomicsDB metadata and generated callset/vidmap/loader json artifacts for workspace cloud URLs. The metadata is copied to TMPDIR and the json files to the current working directory", # noqa formatter_class=argparse.RawTextHelpFormatter, usage="%(prog)s [options]", ) diff --git a/genomicsdb/scripts/genomicsdb_query.py b/genomicsdb/scripts/genomicsdb_query.py index e0bd571..e12988d 100644 --- a/genomicsdb/scripts/genomicsdb_query.py +++ b/genomicsdb/scripts/genomicsdb_query.py @@ -407,17 +407,18 @@ def setup(): args = parser.parse_args() workspace = genomicsdb_common.normalize_path(args.workspace) + is_cloud_workspace = True if "://" in workspace else False if not genomicsdb.workspace_exists(workspace): raise RuntimeError(f"workspace({workspace}) not found") callset_file = args.callset if not callset_file: - if not args.no_cache and genomicsdb.is_file("callset.json"): + if is_cloud_workspace and not args.no_cache and genomicsdb.is_file("callset.json"): callset_file = "callset.json" else: callset_file = genomicsdb_common.join_paths(workspace, "callset.json") vidmap_file = args.vidmap if not vidmap_file: - if not args.no_cache and genomicsdb.is_file("vidmap.json"): + if is_cloud_workspace and not args.no_cache and genomicsdb.is_file("vidmap.json"): vidmap_file = "vidmap.json" else: vidmap_file = genomicsdb_common.join_paths(workspace, "vidmap.json") diff --git a/test/scripts/test.sh b/test/scripts/test.sh index 26d7b03..aeb7b19 100755 --- a/test/scripts/test.sh +++ b/test/scripts/test.sh @@ -232,7 +232,7 @@ run_command "genomicsdb_query -w $WORKSPACE -i 4 --chunk-size=4 -b -o $OUTPUT -d # Duplicates check_command_with_duplicates "genomicsdb_query -w $WORKSPACE -i 1 -i 1 --chunk-size=2 -o $OUTPUT" 2 "1 1_1" check_command_with_duplicates "genomicsdb_query -w $WORKSPACE -i 1 -i 1 --chunk-size=2 -s HG00141 -s HG00141 -o $OUTPUT" 1 "1" - +run_command "genomicsdb_query -w $WORKSPACE -i 4 --chunk-size=4 -b -o $OUTPUT --no-cache" OLDSTYLE_JSONS="-l $OLDSTYLE_DIR/loader.json -c $OLDSTYLE_DIR/callset_t0_1_2.json -v $OLDSTYLE_DIR/vid.json" run_command "genomicsdb_cache -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS" @@ -247,4 +247,5 @@ run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS -S $T rm $WORKSPACE/vcfheader.vcf run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-fields" + cleanup From dfa49b5813e550356b7c8025a64022d8614e5ff1 Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Wed, 29 Jan 2025 22:24:27 -0800 Subject: [PATCH 6/8] Paths --- genomicsdb/scripts/genomicsdb_common.py | 11 +++++------ test/scripts/test.sh | 2 ++ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/genomicsdb/scripts/genomicsdb_common.py b/genomicsdb/scripts/genomicsdb_common.py index fef832d..9b330ee 100644 --- a/genomicsdb/scripts/genomicsdb_common.py +++ b/genomicsdb/scripts/genomicsdb_common.py @@ -34,17 +34,16 @@ def normalize_path(path): if "://" in path: - if path.endswith("/"): - return path[0 : len(path) - 2] - else: - return path + return path else: - # os.path.abspath removes trailing slash return os.path.abspath(path) def join_paths(path1, path2): - return path1 + "/" + path2 + if path1.endswith("/"): + return path1 + path2 + else: + return path1 + "/" + path2 def parse_vidmap_json(vidmap_file, intervals=None): diff --git a/test/scripts/test.sh b/test/scripts/test.sh index ca4718b..f7fed15 100755 --- a/test/scripts/test.sh +++ b/test/scripts/test.sh @@ -173,6 +173,8 @@ do done run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00096 -o $OUTPUT" +run_command "genomicsdb_query -w ${WORKSPACE}/ -I $TEMP_DIR/contigs.list -s HG00096 -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT" run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT" run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00096 -s NON_EXISTENT_SAMPLE -o $OUTPUT" run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s NON_EXISTENT_SAMPLE -o $OUTPUT" From b516cb6372a36d4aaea48e722e0f643e362ec345 Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Thu, 30 Jan 2025 09:23:16 -0800 Subject: [PATCH 7/8] Address PR comment --- genomicsdb/scripts/genomicsdb_common.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/genomicsdb/scripts/genomicsdb_common.py b/genomicsdb/scripts/genomicsdb_common.py index 9b330ee..981d13a 100644 --- a/genomicsdb/scripts/genomicsdb_common.py +++ b/genomicsdb/scripts/genomicsdb_common.py @@ -28,6 +28,7 @@ import logging import os import sys +from urllib.parse import urljoin import genomicsdb @@ -40,10 +41,10 @@ def normalize_path(path): def join_paths(path1, path2): - if path1.endswith("/"): - return path1 + path2 + if "://" in path1: + return urljoin(path1, path2) else: - return path1 + "/" + path2 + return os.path.join(path1, path2) def parse_vidmap_json(vidmap_file, intervals=None): From c0e1a77e3ad694d3e542c88ed2b17d05f6cefad1 Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Thu, 30 Jan 2025 10:16:02 -0800 Subject: [PATCH 8/8] urllib.parse works only with http URLs, not any URI --- genomicsdb/scripts/genomicsdb_common.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/genomicsdb/scripts/genomicsdb_common.py b/genomicsdb/scripts/genomicsdb_common.py index 981d13a..3357df1 100644 --- a/genomicsdb/scripts/genomicsdb_common.py +++ b/genomicsdb/scripts/genomicsdb_common.py @@ -28,7 +28,6 @@ import logging import os import sys -from urllib.parse import urljoin import genomicsdb @@ -42,7 +41,10 @@ def normalize_path(path): def join_paths(path1, path2): if "://" in path1: - return urljoin(path1, path2) + if path1.endswith("/"): + return path1 + path2 + else: + return path1 + "/" + path2 else: return os.path.join(path1, path2)