SMDB-scripts/pileup.py at master · AbateLab/SMDB-scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 17 12:18:45 2014

pileup reader - for reading pileups

@author: Freeman
"""

data = {"*":{}, "A":{}, "B":{}, "C":{}, "D":{}, "E":{}, "F":{}, "G":{}, "H":{}, "I":{}, "J":{}, "L":{} }
#template is key, dictionary of {position:[bases]}

def input_pileup (path, limit = 0):
    infile = open(path, "r")
    for line in infile:
        lines = line.split()    #lines[3] is the number for coverage
        data[lines[0]][lines[1]] = list(lines[4])        #list of covering bases append to the correct position
        limit -= 1
        if limit == 1:
            break
    return

def input_ref(infile):
    reference = {}
    for line in infile:
        if line[0] == ">":
            name = line[1:].strip()
            seq = infile.next().strip()
            reference[name] = seq
    return reference

def output_coverages_ends (outpath):
    for key in data:
        with open("%s%scov.txt"%(outpath,key), "w") as outfile:
            for p in data[key]:
                outfile.write("%s\t%d\n"%(p,len(data[key][p])))
    return

def percent_cov(): #prints out the of bases that the assemblies covered of the reference
    lengths = {"A":0, "B":0, "C":5227, "D":5001, "E":5002, "F":0, "G":3185, "H":4774, "I":4876, "J":3920, "L":4361}
    cov = 0
    total = 0
    for key in data:
        if key == "*":
            continue
        for pos in data[key]:
            cov += 1
    for key in lengths:
        total += lengths[key]
    print "%f fraction of references covered by these contigs is"%(1.*cov/total)
    return

def GC_pos(ref, binsize, outfile): #prints out the GC percentage as function of positions
        n = 0
        bases = []
        for i in range(len(ref)+1):
            n += 1
            try:
                bases.append(ref[i-1])
            except:
                print "error with reference file!"
            if n >= binsize:  #once bin size has reached, write it all out and restart
                outfile.write("%d\t%f\n"% (i+n/2, GCcontent(bases)))
                bases = []
                n = 0
        return

def GCcontent (seq): #returns the GC content of the input sequence - internal function
    GC = 0
    for c in seq:
        if c == "G" or c == "C" or c == "g" or c == "c":
            GC += 1
    return 1.*GC/len(seq)

def consensus(): #outputs a dictionary of {pos: consensus} and an associated
#confidence score which is fraction consensus base/ total # bases
    consensus = {} #consensus = {pos:(base, score)}
    for key in data:
        for pos in data[key]:
            SNPs = {"a":0, "c":0,"g":0,"t":0}
            for base in data[key][pos]:
                SNPs[base] += 1
            call = keywithmaxval(SNPs)
            score = 1.*SNPs[call]/len(data[key][pos])
            print (call, score)
    return

def keywithmaxval(d):
     """ a) create a list of the dict's keys and values;
         b) return the key with the max value"""
     v=list(d.values())
     k=list(d.keys())
     return k[v.index(max(v))]


if __name__ == "__main__":
    path = "./"
    #input_pileup("%sbowtiecontigs.pileup" %path, limit = 0)
    with open("%s8templates.fasta"%path, "r") as infile:
        references = input_ref(infile)
    print "all data imported"

    for key in references:
        with open("%sGC_pos%s.txt"%(path, key), "w") as outfile:
            GC_pos(references[key], 100, outfile)

    #consensus()
    #percent_cov()
    #output_coverages_ends(path)
    print "done"