Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions SINGER/SINGER/convert_long_ARG.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,36 @@ def write_output_ts(ts, output_prefix, MCMC_iteration):
print(f"Save to {output_ts_filename}")
ts.dump(output_ts_filename)

def read_vcf_sample_names(vcf_file):
for path in (vcf_file, vcf_file + ".vcf"):
if not os.path.exists(path):
continue
with open(path) as f:
for line in f:
if line.startswith("#CHROM"):
return line.strip().split("\t")[9:]
raise FileNotFoundError(f"Could not find VCF file: {vcf_file}")

def add_individuals_from_vcf(ts, vcf_file):
sample_names = read_vcf_sample_names(vcf_file)
if len(sample_names) * 2 != ts.num_samples:
raise ValueError(
f"Expected {len(sample_names) * 2} sample nodes for {len(sample_names)} "
f"diploid VCF samples, got {ts.num_samples}."
)
tables = ts.dump_tables()
node_individual = tables.nodes.individual.copy()
node_metadata = [b""] * tables.nodes.num_rows
for ind_id, name in enumerate(sample_names):
tables.individuals.add_row(metadata=name.encode())
for hap in (0, 1):
nid = ind_id * 2 + hap
node_individual[nid] = ind_id
node_metadata[nid] = f"{name}_{hap}".encode()
tables.nodes.individual = node_individual
tables.nodes.packset_metadata(node_metadata)
return tables.tree_sequence()

def main():
# Argument parsing
parser = argparse.ArgumentParser(description="Generate tskit format for a long ARG.")
Expand Down
62 changes: 54 additions & 8 deletions SINGER/SINGER/merge_ARG.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,10 @@ def read_long_ARG(node_files, branch_files, mutation_files, block_coordinates):
tables.mutations.add_row(site=site_id, node=int(mutations[i, 1]) + node_num, derived_state=str(int(mutations[i, 3])))

tables.sort()
tables.build_index()
tables.compute_mutation_parents()
ts = tables.tree_sequence()

return ts

def load_file_lists(file_list_path):
Expand Down Expand Up @@ -121,35 +123,79 @@ def sort_nodes_by_time(ts):
child=node_map[edges.child].astype(np.int32),
)

muts = tables.mutations
muts.set_columns(
site=muts.site,
node=node_map[muts.node].astype(np.int32),
derived_state=muts.derived_state,
derived_state_offset=muts.derived_state_offset,
parent=muts.parent,
time=muts.time,
metadata=muts.metadata,
metadata_offset=muts.metadata_offset,
)

tables.sort()
tables.build_index()
tables.compute_mutation_parents()
tables.compute_mutation_times()
ts = tables.tree_sequence()
return tables.tree_sequence()

def write_output_ts(ts, output):
print(f"Save to {output}")
ts.dump(output)

def read_vcf_sample_names(vcf_file):
for path in (vcf_file, vcf_file + ".vcf"):
if not os.path.exists(path):
continue
with open(path) as f:
for line in f:
if line.startswith("#CHROM"):
return line.strip().split("\t")[9:]
raise FileNotFoundError(f"Could not find VCF file: {vcf_file}")

def add_individuals_from_vcf(ts, vcf_file):
sample_names = read_vcf_sample_names(vcf_file)
if len(sample_names) * 2 != ts.num_samples:
raise ValueError(
f"Expected {len(sample_names) * 2} sample nodes for {len(sample_names)} "
f"diploid VCF samples, got {ts.num_samples}."
)
tables = ts.dump_tables()
node_individual = tables.nodes.individual.copy()
node_metadata = [b""] * tables.nodes.num_rows
for ind_id, name in enumerate(sample_names):
tables.individuals.add_row(metadata=name.encode())
for hap in (0, 1):
nid = ind_id * 2 + hap
node_individual[nid] = ind_id
node_metadata[nid] = f"{name}_{hap}".encode()
tables.nodes.individual = node_individual
tables.nodes.packset_metadata(node_metadata)
return tables.tree_sequence()

def main():
# Argument parsing
parser = argparse.ArgumentParser(description="Generate tskit format for a long ARG.")

# Add arguments with prefixes
parser = argparse.ArgumentParser(description="Generate tskit format for a long ARG using file list.")
parser.add_argument("--file_table", required=True, help="Sub file table")
parser.add_argument("--output", required=True, help="Output file name")

parser.add_argument("--output", required=True, help="Output file name")
parser.add_argument("--vcf", required=False, default=None,
help="VCF (or prefix without .vcf) used as SINGER input. Sample "
"names from the header are attached as individuals, and tips "
"are named <sample>_0 / <sample>_1.")

args = parser.parse_args()

# Generate file lists
node_files, branch_files, mutation_files, block_coordinates = load_file_lists(args.file_table)
# Apply the function
output_ts_filename = args.output
ts = read_long_ARG(node_files, branch_files, mutation_files, block_coordinates)
sorted_ts = sort_nodes_by_time(ts)
write_output_ts(sorted_ts, args.output)
if args.vcf is not None:
sorted_ts = add_individuals_from_vcf(sorted_ts, args.vcf)
write_output_ts(sorted_ts, args.output)

if __name__ == "__main__":
main()