genomicsdb/scripts/genomicsdb_cache.py (2 changes: 1 addition & 1 deletion)

@@ -48,7 +48,7 @@ def get_arrays(interval, contigs_map, partitions):
 def main():
     parser = argparse.ArgumentParser(
         prog="cache",
-        description="Cache GenomicsDB metadata and generated callset/vidmap/loader json artifacts for workspace cloud URLs",  # noqa
+        description="Cache GenomicsDB metadata and generated callset/vidmap/loader json artifacts for workspace cloud URLs. The metadata is copied to TMPDIR and the json files to the current working directory",  # noqa
         formatter_class=argparse.RawTextHelpFormatter,
         usage="%(prog)s [options]",
     )

genomicsdb/scripts/genomicsdb_common.py (10 changes: 10 additions & 0 deletions)

@@ -39,6 +39,16 @@ def normalize_path(path):
     return os.path.abspath(path)
 
 
+def join_paths(path1, path2):
+    if "://" in path1:
+        if path1.endswith("/"):
+            return path1 + path2
+        else:
+            return path1 + "/" + path2
+    else:
+        return os.path.join(path1, path2)
+
+
 def parse_vidmap_json(vidmap_file, intervals=None):
     if isinstance(intervals, str):
         is_file = True

Review comment (Member), on join_paths: "You can just use os.path.join instead of this function I think"
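For context on that suggestion, a minimal standalone sketch (a copy of the join_paths logic above; the printed results assume a POSIX host). os.path.join happens to join with "/" on POSIX, but it uses the platform separator, so on Windows it would splice "\" into a cloud URL, which this helper avoids:

    import os

    def join_paths(path1, path2):  # same logic as the helper added above
        if "://" in path1:
            return path1 + path2 if path1.endswith("/") else path1 + "/" + path2
        return os.path.join(path1, path2)

    print(join_paths("s3://bucket/ws", "callset.json"))   # s3://bucket/ws/callset.json on any OS
    print(join_paths("s3://bucket/ws/", "callset.json"))  # s3://bucket/ws/callset.json (no double slash)
    print(join_paths("/opt/ws", "callset.json"))          # /opt/ws/callset.json via os.path.join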

genomicsdb/scripts/genomicsdb_query.py (62 changes: 44 additions & 18 deletions)

@@ -222,19 +222,23 @@ def parse_and_print_fields(vidmap_file, template_header_file):

 def parse_vidmap_json_for_attributes(vidmap_file, attributes=None):
     if attributes is None or len(attributes) == 0:
-        return ["GT"]
-    else:
-        vidmap = json.loads(genomicsdb.read_entire_file(vidmap_file))
-        fields = vidmap["fields"]
-        if isinstance(fields, list):
-            fields = [field["name"] for field in fields]
-        else:  # Old style vidmap json
-            fields = fields.keys()
-        attributes = attributes.replace(" ", "").split(",")
-        not_found = [attribute for attribute in attributes if attribute not in fields]
-        if len(not_found) > 0:
-            raise RuntimeError(f"Attributes({not_found}) not found in vid mapping({vidmap_file})")
-        return attributes
+        # Default
+        return ["REF", "GT"]
+
+    vidmap = json.loads(genomicsdb.read_entire_file(vidmap_file))
+    fields = vidmap["fields"]
+    if isinstance(fields, list):
+        fields = [field["name"] for field in fields]
+    else:  # Old style vidmap json
+        fields = fields.keys()
+    fields = set(fields)
+    fields.add("REF")
+    fields.add("ALT")
+    attributes = attributes.replace(" ", "").split(",")
+    not_found = [attribute for attribute in attributes if attribute not in fields]
+    if len(not_found) > 0:
+        raise RuntimeError(f"Attributes({not_found}) not found in vid mapping({vidmap_file})")
+    return attributes
 
 
 def parse_loader_json(loader_file, interval_form=True):
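To see the effect of the new default and the REF/ALT special-casing, a standalone sketch of the validation logic (validate is a hypothetical name; it mirrors parse_vidmap_json_for_attributes above but takes a parsed new-style vidmap dict instead of a file path):

    def validate(vidmap, attributes=None):
        if attributes is None or len(attributes) == 0:
            return ["REF", "GT"]  # new default: REF alongside GT
        fields = vidmap["fields"]
        if isinstance(fields, list):  # new-style vidmap; set(dict) would cover the old style
            fields = [field["name"] for field in fields]
        fields = set(fields)
        fields.update(("REF", "ALT"))  # REF/ALT accepted even though they are not vid fields
        requested = attributes.replace(" ", "").split(",")
        not_found = [a for a in requested if a not in fields]
        if not_found:
            raise RuntimeError(f"Attributes({not_found}) not found in vid mapping")
        return requested

    vidmap = {"fields": [{"name": "GT"}, {"name": "DP"}]}
    print(validate(vidmap))                 # ['REF', 'GT']
    print(validate(vidmap, "REF, ALT,DP"))  # ['REF', 'ALT', 'DP']
    validate(vidmap, "GT,BOGUS")            # raises RuntimeError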
@@ -306,6 +310,11 @@ def setup():
action="store_true",
help="List interval partitions(genomicsdb arrays in the workspace) for the given intervals(-i/--interval or -I/--interval-list) or all the intervals for the workspace and exit", # noqa
)
parser.add_argument(
"--no-cache",
action="store_true",
help="Do not use cached metadata and files with the genomicsdb query",
)
parser.add_argument(
"-i",
"--interval",
@@ -336,7 +345,7 @@ def setup():
"-a",
"--attributes",
required=False,
help="Optional - comma separated list of genomic attributes or fields described in the vid mapping for the query, eg. GT,AC,PL,DP... Defaults to GT", # noqa
help="Optional - comma separated list of genomic attributes(REF, ALT) and fields described in the vid mapping for the query, eg. GT,AC,PL,DP... Defaults to REF,GT", # noqa
)
parser.add_argument(
"-f",
@@ -398,17 +407,27 @@ def setup():
     args = parser.parse_args()
 
     workspace = genomicsdb_common.normalize_path(args.workspace)
+    is_cloud_workspace = True if "://" in workspace else False
     if not genomicsdb.workspace_exists(workspace):
         raise RuntimeError(f"workspace({workspace}) not found")
     callset_file = args.callset
     if not callset_file:
-        callset_file = workspace + "/callset.json"
+        if is_cloud_workspace and not args.no_cache and genomicsdb.is_file("callset.json"):
+            callset_file = "callset.json"
+        else:
+            callset_file = genomicsdb_common.join_paths(workspace, "callset.json")
     vidmap_file = args.vidmap
     if not vidmap_file:
-        vidmap_file = workspace + "/vidmap.json"
+        if is_cloud_workspace and not args.no_cache and genomicsdb.is_file("vidmap.json"):
+            vidmap_file = "vidmap.json"
+        else:
+            vidmap_file = genomicsdb_common.join_paths(workspace, "vidmap.json")
     loader_file = args.loader
     if not loader_file:
-        loader_file = workspace + "/loader.json"
+        if not args.no_cache and genomicsdb.is_file("loader.json"):
+            loader_file = "loader.json"
+        else:
+            loader_file = genomicsdb_common.join_paths(workspace, "loader.json")
     if (
         not genomicsdb.is_file(callset_file)
         or not genomicsdb.is_file(vidmap_file)
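Putting those branches together, a hedged standalone restatement of the resolution order (resolve_metadata is a hypothetical name, and os.path.isfile stands in for genomicsdb.is_file; note that the loader.json branch above applies the cached-copy check even for non-cloud workspaces):

    import os

    def join_paths(p1, p2):  # as in genomicsdb_common above
        if "://" in p1:
            return p1 + p2 if p1.endswith("/") else p1 + "/" + p2
        return os.path.join(p1, p2)

    def resolve_metadata(workspace, name, no_cache=False):
        # Prefer a cached copy in the current working directory, typically
        # left there by genomicsdb_cache, unless --no-cache was given.
        if "://" in workspace and not no_cache and os.path.isfile(name):
            return name  # e.g. "callset.json" in $PWD
        return join_paths(workspace, name)

    # resolve_metadata("s3://bucket/ws", "callset.json")                -> "callset.json" if cached locally
    # resolve_metadata("s3://bucket/ws", "callset.json", no_cache=True) -> "s3://bucket/ws/callset.json"
    # resolve_metadata("/local/ws", "callset.json")                     -> "/local/ws/callset.json"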
@@ -466,6 +485,11 @@ def setup():
     row_tuples = parse_callset_json_for_row_ranges(callset_file, samples or sample_list)
     attributes = parse_vidmap_json_for_attributes(vidmap_file, args.attributes)
 
+    if args.no_cache:
+        os.environ.pop("TILEDB_CACHE", None)
+    else:
+        os.environ["TILEDB_CACHE"] = "1"
+
     return workspace, callset_file, vidmap_file, partitions, contigs_map, intervals, row_tuples, attributes, args
 
 
@@ -588,6 +612,8 @@ def process(config):
     query_config = config.query_config
     output_config = config.output_config
     msg = f"array({query_config.array_name}) for interval({query_config.interval})"
+    if query_config.row_tuples:
+        msg += f" and rows({query_config.row_tuples})"
     if not genomicsdb.array_exists(export_config.workspace, query_config.array_name):
         logging.error(msg + f" not imported into workspace({export_config.workspace})")
         return -1
@@ -636,7 +662,7 @@ def process(config):
         gdb = None
         return -1
 
-    logging.info(f"Processed array {query_config.array_name} for interval {query_config.interval}")
+    logging.info(f"Processed {msg}")
     return 0
 
 
test/scripts/test.sh (4 changes: 3 additions & 1 deletion)

@@ -173,6 +173,8 @@ do
 done
 
 run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00096 -o $OUTPUT"
+run_command "genomicsdb_query -w ${WORKSPACE}/ -I $TEMP_DIR/contigs.list -s HG00096 -o $OUTPUT"
 run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT"
+run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT"
 run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00096 -s NON_EXISTENT_SAMPLE -o $OUTPUT"
 run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s NON_EXISTENT_SAMPLE -o $OUTPUT"
@@ -232,7 +234,7 @@ run_command "genomicsdb_query -w $WORKSPACE -i 4 --chunk-size=4 -b -o $OUTPUT -d
 # Duplicates
 check_command_with_duplicates "genomicsdb_query -w $WORKSPACE -i 1 -i 1 --chunk-size=2 -o $OUTPUT" 2 "1 1_1"
 check_command_with_duplicates "genomicsdb_query -w $WORKSPACE -i 1 -i 1 --chunk-size=2 -s HG00141 -s HG00141 -o $OUTPUT" 1 "1"
-
+run_command "genomicsdb_query -w $WORKSPACE -i 4 --chunk-size=4 -b -o $OUTPUT --no-cache"
 
 OLDSTYLE_JSONS="-l $OLDSTYLE_DIR/loader.json -c $OLDSTYLE_DIR/callset_t0_1_2.json -v $OLDSTYLE_DIR/vid.json"
 run_command "genomicsdb_cache -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS"