forked from GreshamLab/RNAseq
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape_adapter.py
More file actions
126 lines (106 loc) · 3.81 KB
/
scrape_adapter.py
File metadata and controls
126 lines (106 loc) · 3.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 17 13:11:16 2017
The purpose of this script is to scrape the fastq file and enter the sequencer
defined adapter to a list. We then use this adapter list to generate a '.fa'
file that cutadapt can use to perform trimming.
@author: pieter
"""
import sys
import subprocess
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
#set defaults
paired_arg = '-u'
adapter_list=[]
#collect arguments
if (sys.argv[1] == '-h') or (sys.argv[1] == '-help'):
print('scrape_adapter is designed to be used with fastq files.\nThese can be either in uncompressed of gzipped (".gz") formats.')
print('Invoke as:\n\tpython scrape_adapter.py -u [-p for paired end] -i fastq_file_name -o output_file_name')
print('')
exit()
for arg in range(len(sys.argv)-1):
if sys.argv[arg] == '-p':
paired_arg = '-p'
if sys.argv[arg] == '-i':
infile_name = sys.argv[int(arg+1)]
if sys.argv[arg] == '-o':
outfile_name = sys.argv[int(arg+1)]
#test to see if input is gzipped
#if it is use zcat to make temp uncompressed file
if infile_name.rsplit('.',1)[1]=='gz':
is_gzipped=True
else:
is_gzipped=False
# process gzipped file then open temp
if is_gzipped:
bashCommand = ('zcat %s > temp.fastq')%(infile_name)
output = subprocess.check_output([bashCommand],stderr=subprocess.STDOUT,shell=True)
infile_o = ('temp.fastq')
#open uncompressed file
if not is_gzipped:
infile_o = (infile_name)
# parse file keeping only highest frequency adapter
def scrape_file(infile_o):
infile = open(infile_o)
adapter_dict = {}
high_score=0
winner=''
#parse file
for line in infile:
if line[0]=='@':
line = line.strip()
adapter = line.rsplit(':',1)[1]
if ('N' not in adapter):
if (adapter in adapter_dict):
adapter_dict[adapter]+=1
if (adapter not in adapter_dict):
adapter_dict[adapter]=1
print('Scraped the following adapters these many times:')
print(adapter_dict)
for key, value in adapter_dict.items():
if value > high_score:
high_score=value
winner=key
adapter_list.append(winner)
infile.close()
return(adapter_list)
#write out adapter
def write_outadapter(adapter_list,runmode):
if runmode =='fwd':
count = 0
print('Using Adapter' +str(adapter_list))
for adapter in adapter_list:
revadapter = Seq(adapter, IUPAC.unambiguous_dna)
adapter = revadapter.reverse_complement()
outline=('>FwdAdapter_%s\na%sagatcggaagagcacacgtctgaactccagtcacNNNNNNatctcgtatgccgtcttctgcttg\n')%(count,adapter)
print('Trimming with:')
print(outline)
count+=1
outfile.write(outline)
if runmode =='rev':
count = 0
print('Using Adapter' +str(adapter_list))
for adapter in adapter_list:
adapter = Seq(adapter, IUPAC.unambiguous_dna)
#adapter = revadapter.reverse_complement()
outline=('>RevAdapter_%s\naatgatacggcgaccaccgagatctacactctttccctacacgacgctcttccgatct%sa\n')%(count,adapter)
print('Trimming with:')
print(outline)
count+=1
outfile.write(outline)
#actual command statements
if paired_arg=='-u':
adapter_list = scrape_file(infile_o)
if adapter_list:
outfile = open(outfile_name,'w')
write_outadapter(adapter_list,'fwd')
if paired_arg=='-p':
adapter_list = scrape_file(infile_o)
if adapter_list:
outfile = open(outfile_name,'w')
write_outadapter(adapter_list,'rev')
#clean up
outfile.close()
if is_gzipped:
output = subprocess.check_output(['rm temp.fastq'],stderr=subprocess.STDOUT,shell=True)