mutation_analysis/call_mutations.py at main · markhilt/mutation_analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
call_mutations.py
Author: Markus Hiltunen
E-mail: markus.hiltunen@su.se

This script is used to filter out background variants and retain potentially
new mutations in samples originating from different tissues of the same individual.
It takes as input a variant table produced by GATK variantsToTable.
The principle is to look for variants that are present
in a subset of tissues. Variants that conform to this principle can be optionally
further filtered, as there might be many false positives left, depending on
how the user filtered the inital vcf file.

Copyright (c) 2023, Johannesson lab
Licensed under the MIT license. See LICENSE file.
"""

from operator import truediv # To be able to divide lists
from scipy import stats as ss
import argparse

__version__ = "0.6"

parser = argparse.ArgumentParser(description='Filter variants to find potential \
                                            mutations. Input: variant table from \
                                            GATK VariantsToTable. Should be of the \
                                            following format: \
                                            "gatk VariantsToTable -V $VCF -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -GF AD -O $OUTPUT" \
                                            data for each sample start at the 6th. \
                                            The AD field in the table should be comma separated \
                                            and contain the number of reads with reference \
                                            allele and number of reads with alternate allele, \
                                            e.g. "23,21".')
parser.add_argument("input", \
                    help="Input table.", \
                    type = str)
parser.add_argument("-o","--output", \
                    help="Output prefix.", \
                    type = str)
parser.add_argument("-c","--coverage_file", \
                    help="Tab separated file containing each sample in the \
                        input table in the first column, and the median coverage \
                        for that sample in the second.", \
                    type = str)
parser.add_argument("-n","--minimum_coverage", \
                    help="Minimum coverage to keep a variant [20].", \
                    type = int, \
                    default = 20)
parser.add_argument("-x","--maximum_coverage", \
                    help="Maximum coverage to keep a variant [60].", \
                    type = int, \
                    default = 60)
parser.add_argument("-a","--coverage_factor", \
                    help="Allow up to this much deviation from the median \
                        coverage defined in the file given by -c. Higher => less \
                        conservative [0.2].", \
                    type = float, \
                    default = 0.2)
parser.add_argument("-r","--minimum_reads", \
                    help="Minimum number of reads supporting a variant to keep it. At least \
                    one sample needs to have at least this many reads supporting the variant [2].", \
                    type = int, \
                    default = 2)
parser.add_argument("-f","--frequency_cutoff", \
                    help="Minimum variant read frequency to keep a variant [0.2].", \
                    type = float, \
                    default = 0.2)
parser.add_argument("-q","--qual", \
                    help="Minimum variant quality [1].", \
                    type = float, \
                    default = 1)
parser.add_argument("-p","--frequency_product", \
                    help="Minimum variant read frequency product. Calculated as \
                    the product of the variant read frequencies for all samples with \
                    at least one read supporting the variant. If a site has many samples \
                    with very few reads supporting the variant, this value will be very small [1E-50].", \
                    type = float, \
                    default = 1E-50)
parser.add_argument("-l","--hom_alt", \
                    help="Add to report sites where the majority of samples are homozygous \
                    for the alternate allele. Such sites are assumed to be triallelic and \
                    can in most cases be ignored.", \
                    action='store_true')
parser.add_argument("-s","--skip_indels", \
                    help="Add to skip indels (useful for pileup data).", \
                    action='store_true')
parser.add_argument("-b","--background", \
                    help="Add to quit after filtering out background variants.", \
                    action='store_true')
parser.add_argument("-t","--test_binomial", \
                    help="Add to test the reads for a binomial distribution. \
                    Only reads from the sample(s) containing the variant are considered.", \
                    action='store_true')
parser.add_argument("-v","--version", \
                    help="Print version and quit.", \
                    action = "version", \
                    version = "call_mutations v.{}".format(__version__))
args = parser.parse_args()

class Genotype:
    ''' Describe the genotype of each sample on the reference and variant read coverage
    '''
    def __init__(self, sample, ref_reads, var_reads, cov, var_freq):
        self.sample = sample
        self.ref_reads = ref_reads
        self.var_reads = var_reads
        self.cov = cov
        self.var_freq = var_freq

def getOut():
    '''
    To create a prefix for output files.
    '''
    if args.output:
        outprefix = args.output
    else:
        outprefix = "potential_mutations"
    return outprefix

def readCov():
    ''' Read the median coverage for each sample in the dataset from a separate
        file.
    '''
    median_covs = {}
    with open(args.coverage_file, "r", encoding="utf-8") as cov:
        for line in cov:
            line = line.strip()
            if line.startswith("#"):
                continue
            sample = line.split("\t")[0]
            coverage = float(line.split("\t")[1])
            median_covs[sample] = int(coverage)
    return median_covs

def test_signif(var,cov):
    '''
    Tests for significant skew of reads towards one variant
    Allows one error per 100 000 variants
    True means site passed check (= is not significantly different
    from a 0.5 frequency)
    '''
    alpha = 0.00001
    p = ss.binom_test(var,cov,1.0/2,alternative="two-sided")
    if p < alpha:
        return False
    else:
        return True

def main():
    outfile = getOut() + ".table"
    outlines = []

    # If user gives a file with median coverage values (-c),
    # read this file.
    if args.coverage_file:
        median_covs = readCov()

    with open(args.input, "r", encoding="utf-8") as input:
        for line in input:
            line = line.strip()
            fields = line.split("\t")
            fields = list(filter(None, fields))

            # If the first field starts with CHROM we're at the beginning of the file
            if fields[0] == "CHROM": # Define fields where we have samples.
                samples = fields[6:]
                genotype_positions = [i for i in range(6, 6+len(samples))]
                outlines.append("\t".join(fields[0:6])+"\t"+"\t".join(samples))
                continue

            # If skipping indels
            if args.skip_indels == True and len(fields[3]) != len(fields[4]):
                continue

            # Skip lines with missing data.
            if "." in "".join(fields[6:]) or "NA" in "".join(fields[6:]):
                continue

            qual = float(fields[5])
            pos = int(fields[1])
            tig = fields[0]

            # Skip lines where args.qual is too low.
            if qual < args.qual:
                continue

            # Collect data into a list of Genotypes
            reads_per_sample = fields[min(genotype_positions):max(genotype_positions) + 1] # It's still a list of strings
            reads_per_sample = [ (int(i.split(",")[0]), int(i.split(",")[1])) \
                                for i in reads_per_sample ] # Convert to list of tuples
            genotypes = []
            variant_frequencies = []
            for sample,gt in zip(samples,reads_per_sample):
                ref_reads,variant_reads = gt[0],gt[1]
                cov = variant_reads+ref_reads
                freq = variant_reads/cov if cov > 0 else 0
                genotypes.append(Genotype(sample, ref_reads,variant_reads, cov, freq))
                variant_frequencies.append(freq)

            # If all frequencies are the same, continue. This is the case if
            # all samples are homozygous for either allele.
            if all(x == variant_frequencies[0] for x in variant_frequencies):
                continue

            # Generally, we are interested in sites where most
            # samples are homozygous for the reference allele: freq = 0.0
            # Unless the user gave --hom_alt, report only such sites.
            if args.hom_alt == False and 1.0 in variant_frequencies:
                continue

            # Look for homozygous sites.  Require also more than args.minimum_reads
            # of the minor allele in at least one sample.
            if (1.0 in variant_frequencies or 0.0 in variant_frequencies) and \
            max([x.ref_reads for x in genotypes]) >= args.minimum_reads and \
            max([x.var_reads for x in genotypes]) >= args.minimum_reads:

                # If only filtering out background variants, skip remaining filters
                # and write out this variant.
                if args.background == True:
                    outlines.append(line)
                    continue

                # Otherwise, continue filtering
                homoz_cov_checker, heteroz_cov_checker = False, False
                indv_cov_checker, freq_checker = True, False
                tot_ref, tot_alt, totcov = 0,0,0
                freq_prod = 1

                for gt in genotypes:

                    # Look for heterozygous sites
                    if gt.var_freq != 1.0 and gt.var_freq != 0.0:

                        # Add number of variant reads to total variant
                        # allele counter
                        totcov += gt.cov
                        tot_alt += gt.var_reads
                        tot_ref += gt.ref_reads

                        # Keep a product of the frequency of the
                        # variant allele. This value will be very
                        # low if there is a bias for many samples to
                        # contain a low number of reads supporting
                        # the variant.
                        freq_prod *= gt.var_freq

                        # Check that at least one heterozygous site has
                        # allele frequency within the boundaries
                        if gt.var_freq > args.frequency_cutoff \
                        and gt.var_freq < 1 - args.frequency_cutoff:
                            freq_checker = True

                        # Check that at least one heterozygous site has
                        # sufficient coverage to consider
                        if int(gt.cov) >= args.minimum_coverage \
                        and int(gt.cov) <= args.maximum_coverage:
                            heteroz_cov_checker = True

                        # If user gives a file with median coverage values (-c),
                        # we check that the coverage for all heterozygous
                        # samples is within the defined boundaries (-a)
                        if args.coverage_file:
                            median = median_covs[gt.sample]
                            # Check if coverage is outside the boundaries
                            # if so we skip this site by turning the
                            # checker to false
                            if gt.cov > median + median * args.coverage_factor \
                            or gt.cov < median - median * args.coverage_factor:
                                indv_cov_checker = False
                                break

                    # Check that at least one homozygous site has
                    # sufficient coverage to consider
                    elif gt.var_freq == 0 or gt.var_freq == 1:
                        if int(gt.cov) >= args.minimum_coverage \
                        and int(gt.cov) <= args.maximum_coverage:
                            homoz_cov_checker = True

                # If checkers are true, continue to check minor_allele
                # read frequency
                if heteroz_cov_checker and freq_checker \
                and indv_cov_checker and homoz_cov_checker:

                    # False positives often have several samples with a
                    # low frequency of the variant in the reads.
                    # Real variants, not caused by sporadic read mapping,
                    # should follow a binomial distribution of frequency
                    # in the reads, assuming an equal nuclear ratio in the sample
                    # This is solved by calculating a p-value for the
                    # read frequency across all samples where the
                    # variant is found. Samples with a single read
                    # containing the variant are ignored as this could be
                    # the result of sequencing error or index hopping
                    tot_minor_allele = tot_alt if tot_alt < tot_ref else tot_ref

                    # Here we also check for the frequency product.
                    if tot_minor_allele > 1 and freq_prod > args.frequency_product:
                        # import ipdb; ipdb.set_trace()
                        # If we expect a 50/50 ratio we can do a binomial test
                        if args.test_binomial:
                            if test_signif(tot_minor_allele, totcov):
                                outlines.append(line)
                        else:
                            outlines.append(line)

        with open(outfile, "w") as out:
            out.write("\n".join(outlines))

if __name__ == "__main__":
    main()