RNAseq/scrape_adapter.py at master · pspealman/RNAseq · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 17 13:11:16 2017

The purpose of this script is to scrape the fastq file and enter the sequencer
defined adapter to a list. We then use this adapter list to generate a '.fa'
file that cutadapt can use to perform trimming.

@author: pieter
"""
import sys
import subprocess
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

#set defaults
paired_arg = '-u'
adapter_list=[]

#collect arguments
if (sys.argv[1] == '-h') or (sys.argv[1] == '-help'):
    print('scrape_adapter is designed to be used with fastq files.\nThese can be either in uncompressed of gzipped (".gz") formats.')
    print('Invoke as:\n\tpython scrape_adapter.py -u [-p for paired end] -i fastq_file_name -o output_file_name')
    print('')
    exit()

for arg in range(len(sys.argv)-1):
    if sys.argv[arg] == '-p':
        paired_arg = '-p'
    if sys.argv[arg] == '-i':
        infile_name = sys.argv[int(arg+1)]
    if sys.argv[arg] == '-o':
        outfile_name = sys.argv[int(arg+1)]

#test to see if input is gzipped
#if it is use zcat to make temp uncompressed file
if infile_name.rsplit('.',1)[1]=='gz':
    is_gzipped=True
else:
    is_gzipped=False

# process gzipped file then open temp
if is_gzipped:
    bashCommand = ('zcat %s > temp.fastq')%(infile_name)
    output = subprocess.check_output([bashCommand],stderr=subprocess.STDOUT,shell=True)
    infile_o = ('temp.fastq')

#open uncompressed file
if not is_gzipped:
    infile_o = (infile_name)

# parse file keeping only highest frequency adapter
def scrape_file(infile_o):
    infile = open(infile_o)


    adapter_dict = {}
    high_score=0
    winner=''
    #parse file
    for line in infile:
        if line[0]=='@':
            line = line.strip()
            adapter = line.rsplit(':',1)[1]
            if ('N' not in adapter):
                if (adapter in adapter_dict):
                    adapter_dict[adapter]+=1
                if (adapter not in adapter_dict):
                    adapter_dict[adapter]=1
    print('Scraped the following adapters these many times:')
    print(adapter_dict)

    for key, value in adapter_dict.items():
        if value > high_score:
            high_score=value
            winner=key

    adapter_list.append(winner)
    infile.close()
    return(adapter_list)

#write out adapter
def write_outadapter(adapter_list,runmode):
    if runmode =='fwd':
        count = 0
        print('Using Adapter' +str(adapter_list))
        for adapter in adapter_list:
            revadapter = Seq(adapter, IUPAC.unambiguous_dna)
            adapter = revadapter.reverse_complement()
            outline=('>FwdAdapter_%s\na%sagatcggaagagcacacgtctgaactccagtcacNNNNNNatctcgtatgccgtcttctgcttg\n')%(count,adapter)
            print('Trimming with:')
            print(outline)
            count+=1
            outfile.write(outline)

    if runmode =='rev':
        count = 0
        print('Using Adapter' +str(adapter_list))
        for adapter in adapter_list:
            adapter = Seq(adapter, IUPAC.unambiguous_dna)
            #adapter = revadapter.reverse_complement()
            outline=('>RevAdapter_%s\naatgatacggcgaccaccgagatctacactctttccctacacgacgctcttccgatct%sa\n')%(count,adapter)
            print('Trimming with:')
            print(outline)
            count+=1
            outfile.write(outline)

#actual command statements
if paired_arg=='-u':
    adapter_list = scrape_file(infile_o)

    if adapter_list:
        outfile = open(outfile_name,'w')
        write_outadapter(adapter_list,'fwd')

if paired_arg=='-p':
    adapter_list = scrape_file(infile_o)

    if adapter_list:
        outfile = open(outfile_name,'w')
        write_outadapter(adapter_list,'rev')

#clean up
outfile.close()
if is_gzipped:
    output = subprocess.check_output(['rm temp.fastq'],stderr=subprocess.STDOUT,shell=True)