-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgene_extract.cpp
More file actions
208 lines (177 loc) · 6.73 KB
/
gene_extract.cpp
File metadata and controls
208 lines (177 loc) · 6.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
#include "functions.cuh"
#include "gene_extract.h"
#include <exception>
gene_extract::gene_extract(string gene_List, string reference_Path, string output_Path, string intermediate_Path)
    : gene_List(gene_List),
      reference_Path(reference_Path),
      output_Path(output_Path),
      intermediate_Path(intermediate_Path)
{
    /**
     * * Constructor Function
     * Stores the four caller supplied locations in the class's private variables:
     * the gene list file, the reference sequence file, the output folder and the intermediate folder.
     * Nothing is opened or validated here; the files are first touched by ingress().
     **/

    // Announce start-up once the object state is in place.
    cout << "Starting up Gene extractor" << endl;
}
void gene_extract::ingress()
{
/**
* Execution function.
**/
/**
* Call the "functions" class. Bespoke functions commonly used by CATE.
**/
functions function = functions();
/**
* Load the reference file to the RAM.
* This is a double edged sword.
* Immensely increases the speed of finding and writing the genes.
* But,
* Requires RAM to be large at least to match the size of the reference sequence.
* Does take time to load the reference sequence to the RAM.
**/
fstream reference;
reference.open(this->reference_Path, ios::in);
string full_Reference = "";
if (reference.is_open())
{
cout << endl
<< "Loading reference file: " << this->reference_Path << endl;
/**
* @param line captures the sequences line read from the FASTA file.
**/
string line;
/**
* Skip the first line of the FASTA sequence.
* This is the ID of the sequence (begins with a ">" symbol).
**/
getline(reference, line);
while (getline(reference, line))
{
full_Reference.append(line);
}
reference.close();
}
/**
* REFERENCE sequence will be converted to UPPERCASE.
* Helps create uniformity.
**/
transform(full_Reference.begin(), full_Reference.end(), full_Reference.begin(), ::toupper);
cout << "Reference file has been loaded" << endl
<< endl;
// cout << 10505 << "\t" << full_Reference.at(10505 - 1) << endl;
// cout << 10506 << "\t" << full_Reference.at(10506 - 1) << endl;
/**
* Initiate the reading of the gene file.
**/
fstream gene_File;
gene_File.open(gene_List, ios::in);
/**
* Log file created in the intermediate folder.
* ! This helps with the resume function. Automatically resumes from the last completely processed gene in the event of a program crash.
**/
string intermediate_File = intermediate_Path + "/" + filesystem::path(gene_List).stem().string() + ".log_ex";
cout << "Extracting genes to: " << this->output_Path << endl
<< endl;
if (gene_File.is_open())
{
/**
* @param gene_Combo used to capture and extract info of each gene combination.
**/
string gene_Combo;
/**
* If the intermediate log file is absent this run will be considered as a brand new run of this query and,
* the intermediate log file will be created.
**/
if (filesystem::exists(intermediate_File) == 0)
{
function.createFile(intermediate_File);
}
else
{
/**
* If the intermediate log file present then the resume process will initiated.
* This is a unintelligent resume. Essentially it matches the each read line written with the lines read from the gene file.
* The break will occur as soon as their is a mismatch.
* To counter any errors it is advised to have a new gene file name or a new intermediate folder per new run.
**/
fstream intermediate;
intermediate.open(intermediate_File, ios::in);
/**
* @param get_finished comparison variable. Used o compare the intermediate file data with that of the gene file.
**/
string get_finished;
while (getline(intermediate, get_finished))
{
getline(gene_File, gene_Combo);
if (gene_Combo != get_finished)
{
break;
}
}
intermediate.close();
}
fstream intermediate;
intermediate.open(intermediate_File, ios::app);
while (getline(gene_File, gene_Combo))
{
/**
* @param split_Data vector captures split function's outputs on the genes information.
**/
vector<string> split_Data;
function.split(split_Data, gene_Combo, '\t');
/**
* @param gene_Name captures the gene's name.
**/
string gene_Name = split_Data[0];
cout << "Gene name\t: " << gene_Name << endl;
/**
* @param coordinates vector captures split function's outputs on gene coordinates.
* [0] = chromosome
* [1] = start position
* [2] = end position
**/
vector<string> coordinates;
function.split(coordinates, split_Data[1], ':');
/**
* @param start_Co captures query gene's start position as an integer.
**/
int start_Co = stoi(coordinates[1]);
/**
* @param end_Co captures query gene's end position as an integer.
**/
int end_Co = stoi(coordinates[2]);
cout << "Coordinates\t: Chromosome: " << coordinates[0] << " Start: " << start_Co << " End: " << end_Co << endl
<< endl;
/**
* @param output_File defines the output file for the query gene sequence.
**/
string output_File = output_Path + "/" + gene_Name + ".fasta";
/**
* If the file already exists it will be deleted without prejudice.
* So be advised about this.
**/
if (filesystem::exists(output_File) != 0)
{
cout << "Deleting existing file at: " << output_File << endl;
filesystem::remove(output_File);
}
/**
* Writing of the sequence to the FILE. The sequence ID will be a combination of the gene's name and coordinates.
**/
cout << "Writing gene to file: " << output_File << endl;
function.createFile(output_File, ">" + gene_Name + ":" + split_Data[1]);
fstream output;
output.open(output_File, ios::app);
for (int i = start_Co - 1; i < end_Co; i++)
{
output << full_Reference.at(i);
}
output.flush();
intermediate << gene_Combo << "\n";
output.close();
cout << endl;
}
intermediate.close();
gene_File.close();
}
}