From a3c577adf9b4c2c9bdde421071f8b8b43dd31816 Mon Sep 17 00:00:00 2001
From: Nalini Ganapati
Date: Fri, 17 Jan 2025 15:54:12 -0800
Subject: [PATCH] Filter out duplicate intervals/samples specified in the command line

---
Review notes (text between "---" and the first "diff --git" is not part of
the commit):
- check_command_with_duplicates: the failure message was assigned to "errmg"
  but read as "errmsg", and "$cmd" is never set inside the function. Both now
  use "errmsg" and "$1", and "$errmsg" is quoted when handed to die so it is
  passed as a single argument.
- parse_vidmap_json: duplicates are removed with list(dict.fromkeys(...))
  instead of set(...), so the second return value stays a list in first-seen
  order.
- Leading whitespace of context lines is reconstructed (2-space shell and
  4-space Python indentation assumed); confirm against the tree before
  applying. The "index" blob ids still describe the original patch.

 examples/genomicsdb_common.py |  2 +-
 examples/test.sh              | 38 +++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/examples/genomicsdb_common.py b/examples/genomicsdb_common.py
index cdc5ea8..9d2e540 100644
--- a/examples/genomicsdb_common.py
+++ b/examples/genomicsdb_common.py
@@ -74,7 +74,7 @@ def parse_vidmap_json(vidmap_file, intervals=None):
         column_offset += contig_elem["length"]
         if all_intervals:
             intervals.append(contig_name)
-    return contigs_map, intervals
+    return contigs_map, list(dict.fromkeys(intervals))
 
 
 def parse_interval(interval: str):
diff --git a/examples/test.sh b/examples/test.sh
index 2c7e8dc..278de15 100755
--- a/examples/test.sh
+++ b/examples/test.sh
@@ -72,6 +72,7 @@ die() {
 # any other value if the command should return a failure
 run_command() {
   echo $EMPTY > $TEMP_DIR/output
+  rm -fr ${OUTPUT}*
   declare -i EXPECT_NZ
   declare -i GOT_NZ
   EXPECT_NZ=0
@@ -93,6 +94,39 @@ run_command() {
   fi
 }
 
+check_command_with_duplicates() {
+  # Args: $1 command, $2 expected count of ${OUTPUT}*.csv files, $3 space-separated file suffixes
+  run_command "$1"
+  nFiles=$(ls -l ${OUTPUT}*.csv | wc -l)
+  # Duplicated arguments must not produce additional output files
+  if [[ $nFiles -ne $2 ]]; then
+    die "Could not find the required number of files for '$1'"
+  fi
+  IFS=' ' read -r -a FILES <<< "$3"
+  for FILE in "${FILES[@]}"
+  do
+    if [[ ! -f ${OUTPUT}_${FILE}.csv ]]; then
+      echo "Could not find file=${OUTPUT}_${FILE}.csv"
+      exit 1
+    fi
+    nlines=$(wc -l < ${OUTPUT}_${FILE}.csv)
+    errmsg="'$1' does not return with the required number of lines for ${OUTPUT}_${FILE}.csv"
+    if [[ $FILE == "1" ]]; then
+      if [[ $1 == *"-s HG00141"* ]]; then
+        if [[ $nlines -ne 3 ]]; then
+          die "$errmsg"
+        fi
+      elif [[ $nlines -ne 5 ]]; then
+        die "$errmsg"
+      fi
+    else
+      if [[ $nlines -ne 2 ]]; then
+        die "$errmsg"
+      fi
+    fi
+  done
+}
+
 if [[ -z $WORKSPACE ]]; then
   tar xzf $(dirname $0)/examples_ws.tgz -C $TEMP_DIR
   WORKSPACE=$TEMP_DIR/ws
@@ -195,6 +229,10 @@ run_command "genomicsdb_query -w $WORKSPACE -i 1 --chunk-size=2 -o $OUTPUT"
 run_command "genomicsdb_query -w $WORKSPACE -i 1 --chunk-size=2 -b -o $OUTPUT -d"
 run_command "genomicsdb_query -w $WORKSPACE -i 1 --chunk-size=2 -b -o $OUTPUT"
 run_command "genomicsdb_query -w $WORKSPACE -i 4 --chunk-size=4 -b -o $OUTPUT -d"
 
+# Duplicates
+check_command_with_duplicates "genomicsdb_query -w $WORKSPACE -i 1 -i 1 --chunk-size=2 -o $OUTPUT" 2 "1 1_1"
+check_command_with_duplicates "genomicsdb_query -w $WORKSPACE -i 1 -i 1 --chunk-size=2 -s HG00141 -s HG00141 -o $OUTPUT" 1 "1"
+
 OLDSTYLE_JSONS="-l $OLDSTYLE_DIR/loader.json -c $OLDSTYLE_DIR/callset_t0_1_2.json -v $OLDSTYLE_DIR/vid.json"
 run_command "genomicsdb_cache -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS"