Skip to content

Commit 968ffb4

Browse files
optimization
1 parent f1d64f6 commit 968ffb4

11 files changed

Lines changed: 243 additions & 163 deletions

File tree

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Multiple Sequence Alignment with Suffix Tree
22

3-
This project is aimed at improving a part of [HAlign2.0](https://github.com/malabz/HAlign), which could align multiple nucleotide sequences fast and accurately.
3+
This project is aimed at improving a part of [HAlign](https://github.com/malabz/HAlign), which could align multiple nucleotide sequences fast and accurately.
44

55
## Usage
66

@@ -10,6 +10,9 @@ stmsa-x.x.x-win-x64.exe unaligned.fasta output.fasta
1010

1111
## Change Log
1212

13+
* 2021-07-15
14+
further decrease the space requirement
15+
1316
* 2021-04-25
1417

1518
self-defined allocator for suffix tree, which reduce the construction time to about 1/3 comparing with ::operator new

src/StarAlignment/StarAligner.cpp

Lines changed: 46 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,17 @@
44
#include "../Utils/Graph.hpp"
55
#include "../PairwiseAlignment/NeedlemanWunshReusable.hpp"
66

7-
std::vector<std::vector<unsigned char>> star_alignment::StarAligner::align(const std::vector<sequence_type>& sequences)
7+
std::vector<std::vector<unsigned char>> star_alignment::StarAligner::align(const std::vector<sequence_type> &sequences)
88
{
99
return StarAligner(sequences)._align();
1010
}
1111

12-
star_alignment::StarAligner::StarAligner(const std::vector<sequence_type>& sequences):
12+
auto star_alignment::StarAligner::get_gaps(const std::vector<sequence_type> &sequences) -> std::vector<std::vector<utils::Insertion>>
13+
{
14+
return StarAligner(sequences)._get_gaps();
15+
}
16+
17+
star_alignment::StarAligner::StarAligner(const std::vector<sequence_type> &sequences):
1318
_sequences(sequences),
1419
_row(_sequences.size()),
1520
_lengths(_set_lengths()),
@@ -41,12 +46,18 @@ std::vector<std::vector<unsigned char>> star_alignment::StarAligner::_align() co
4146
return _insert_gaps(_merge_results(_pairwise_align()));
4247
}
4348

44-
auto star_alignment::StarAligner::_pairwise_align() const -> std::vector<std::array<std::vector<indel>, 2>>
49+
auto star_alignment::StarAligner::_get_gaps() const
50+
-> std::vector<std::vector<utils::Insertion>>
51+
{
52+
return _merge_results(_pairwise_align());
53+
}
54+
55+
auto star_alignment::StarAligner::_pairwise_align() const -> std::vector<std::array<std::vector<utils::Insertion>, 2>>
4556
{
4657
static constexpr size_t threshold = 15;
4758

4859
suffix_tree::SuffixTree<nucleic_acid_pseudo::NUMBER> st(_centre.cbegin(), _centre.cend(), nucleic_acid_pseudo::GAP);
49-
std::vector<std::array<std::vector<indel>, 2>> all_pairwise_gaps;
60+
std::vector<std::array<std::vector<utils::Insertion>, 2>> all_pairwise_gaps;
5061

5162
using iter = std::vector<unsigned char>::const_iterator;
5263
pairwise_alignment::NeedlemanWunshReusable<iter, iter, decltype(pairwise_alignment::default_scoring_matrix)>
@@ -86,7 +97,7 @@ auto star_alignment::StarAligner::_pairwise_align() const -> std::vector<std::ar
8697
}));
8798
}
8899

89-
std::array<std::vector<indel>, 2> pairwise_gaps;
100+
std::array<std::vector<utils::Insertion>, 2> pairwise_gaps;
90101
for (size_t j = 0; j != intervals.size(); ++j)
91102
{
92103
const size_t centre_begin = intervals[j][0];
@@ -101,8 +112,8 @@ auto star_alignment::StarAligner::_pairwise_align() const -> std::vector<std::ar
101112
auto [lhs_gaps, rhs_gaps] = nw(_centre.cbegin() + centre_begin, _centre.cbegin() + centre_end,
102113
_sequences[i].cbegin() + sequence_begin, _sequences[i].cbegin() + sequence_end, flag);
103114

104-
_converse(lhs_gaps, pairwise_gaps[0], centre_begin);
105-
_converse(rhs_gaps, pairwise_gaps[1], sequence_begin);
115+
_append(lhs_gaps, pairwise_gaps[0], centre_begin);
116+
_append(rhs_gaps, pairwise_gaps[1], sequence_begin);
106117
}
107118

108119
size_t sequence_gap_num = 0;
@@ -114,7 +125,7 @@ auto star_alignment::StarAligner::_pairwise_align() const -> std::vector<std::ar
114125
return all_pairwise_gaps;
115126
}
116127

117-
auto star_alignment::StarAligner::_optimal_path(const std::vector<triple>& identical_substrings)
128+
auto star_alignment::StarAligner::_optimal_path(const std::vector<triple> &identical_substrings)
118129
-> std::vector<triple>
119130
{
120131
std::vector<triple> optimal_identical_substrings;
@@ -164,25 +175,25 @@ auto star_alignment::StarAligner::_optimal_path(const std::vector<triple>& ident
164175
return optimal_identical_substrings;
165176
}
166177

167-
void star_alignment::StarAligner::_converse(const std::vector<size_t>& src_gaps, std::vector<indel>& des_gaps, size_t start)
178+
void star_alignment::StarAligner::_append(const std::vector<size_t> &src_gaps, std::vector<utils::Insertion> &des_gaps, size_t start)
168179
{
169180
for (size_t i = 0; i != src_gaps.size(); ++i)
170181
if (src_gaps[i])
171182
{
172183
if (des_gaps.size() && des_gaps.back().index == start + i)
173184
des_gaps.back().number += src_gaps[i];
174185
else
175-
des_gaps.push_back(indel({ start + i, src_gaps[i] }));
186+
des_gaps.push_back(utils::Insertion({ start + i, src_gaps[i] }));
176187
}
177188
}
178189

179-
auto star_alignment::StarAligner::_merge_results(const std::vector<std::array<std::vector<indel>, 2>>& pairwise_gaps) const
180-
-> std::vector<std::vector<indel>>
190+
auto star_alignment::StarAligner::_merge_results(const std::vector<std::array<std::vector<utils::Insertion>, 2>> &pairwise_gaps) const
191+
-> std::vector<std::vector<utils::Insertion>>
181192
{
182-
std::vector<indel> final_centre_gaps;
193+
std::vector<utils::Insertion> final_centre_gaps;
183194
for (size_t i = 0; i != _row; ++i)
184195
{
185-
const auto& curr_centre_gaps = pairwise_gaps[i][0];
196+
const auto &curr_centre_gaps = pairwise_gaps[i][0];
186197
for (size_t lhs_pointer = 0, rhs_pointer = 0; rhs_pointer != curr_centre_gaps.size(); )
187198
{
188199
if (lhs_pointer == final_centre_gaps.size())
@@ -211,16 +222,20 @@ auto star_alignment::StarAligner::_merge_results(const std::vector<std::array<st
211222
}
212223
}
213224

214-
std::vector<std::vector<indel>> final_sequence_gaps;
225+
std::vector<std::vector<utils::Insertion>> final_sequence_gaps;
215226
final_sequence_gaps.reserve(_row);
216227
for (size_t i = 0; i != _row; ++i)
217228
{
218-
const auto& curr_centre_gaps = pairwise_gaps[i][0];
219-
const auto& curr_sequence_gaps = pairwise_gaps[i][1];
229+
const auto &curr_centre_gaps = pairwise_gaps[i][0];
230+
const auto &curr_sequence_gaps = pairwise_gaps[i][1];
220231

221-
std::vector<indel> centre_addition = _minus(final_centre_gaps, curr_centre_gaps);
232+
std::vector<utils::Insertion> centre_addition;
233+
centre_addition.reserve(final_centre_gaps.size());
234+
utils::Insertion::minus(final_centre_gaps.cbegin(), final_centre_gaps.cend(),
235+
curr_centre_gaps.cbegin(), curr_centre_gaps.cend(),
236+
std::back_inserter(centre_addition));
222237

223-
std::vector<indel> sequence_addition;
238+
std::vector<utils::Insertion> sequence_addition;
224239
for (size_t centre_index = 0, sequence_index = 0, centre_gaps_index = 0, sequence_gaps_index = 0, centre_addition_index = 0;
225240
centre_addition_index != centre_addition.size(); ++centre_addition_index)
226241
{
@@ -247,66 +262,21 @@ auto star_alignment::StarAligner::_merge_results(const std::vector<std::array<st
247262
if (sequence_addition.size() && sequence_index == sequence_addition.back().index)
248263
sequence_addition.back().number += curr_addition.number;
249264
else
250-
sequence_addition.push_back(indel({ sequence_index, curr_addition.number }));
265+
sequence_addition.push_back(utils::Insertion({ sequence_index, curr_addition.number }));
251266
}
252267

253-
final_sequence_gaps.push_back(_add(curr_sequence_gaps, sequence_addition));
268+
std::vector<utils::Insertion> indels_of_current_sequence;
269+
indels_of_current_sequence.reserve(curr_sequence_gaps.size() + sequence_addition.size());
270+
utils::Insertion::plus(curr_sequence_gaps.cbegin(), curr_sequence_gaps.cend(),
271+
sequence_addition.cbegin(), sequence_addition.cend(),
272+
std::back_inserter(indels_of_current_sequence));
273+
final_sequence_gaps.push_back(indels_of_current_sequence);
254274
}
255275

256276
return final_sequence_gaps;
257277
}
258278

259-
auto star_alignment::StarAligner::_add(const std::vector<indel>& lhs, const std::vector<indel>& rhs)
260-
-> std::vector<indel>
261-
{
262-
std::vector<indel> sum;
263-
264-
size_t lhs_index = 0, rhs_index = 0;
265-
for (; lhs_index != lhs.size() && rhs_index != rhs.size(); )
266-
if (lhs[lhs_index].index < rhs[rhs_index].index)
267-
{
268-
sum.push_back(lhs[lhs_index]);
269-
++lhs_index;
270-
}
271-
else if (lhs[lhs_index].index > rhs[rhs_index].index)
272-
{
273-
sum.push_back(rhs[rhs_index]);
274-
++rhs_index;
275-
}
276-
else
277-
{
278-
sum.push_back(indel({ lhs[lhs_index].index, lhs[lhs_index].number + rhs[rhs_index].number }));
279-
++lhs_index;
280-
++rhs_index;
281-
}
282-
283-
sum.insert(sum.cend(), lhs.cbegin() + lhs_index, lhs.cend());
284-
sum.insert(sum.cend(), rhs.cbegin() + rhs_index, rhs.cend());
285-
return sum;
286-
}
287-
288-
// assume lhs >= rhs
289-
auto star_alignment::StarAligner::_minus(const std::vector<indel>& lhs, const std::vector<indel>& rhs)
290-
-> std::vector<indel>
291-
{
292-
std::vector<indel> difference;
293-
difference.reserve(lhs.size());
294-
295-
size_t lhs_index = 0;
296-
for (size_t rhs_index = 0; rhs_index != rhs.size(); ++lhs_index, ++rhs_index)
297-
{
298-
while (lhs[lhs_index].index != rhs[rhs_index].index)
299-
difference.push_back(lhs[lhs_index++]);
300-
301-
size_t distance = lhs[lhs_index].number - rhs[rhs_index].number;
302-
if (distance) difference.push_back(indel({ lhs[lhs_index].index, distance }));
303-
}
304-
305-
difference.insert(difference.cend(), lhs.cbegin() + lhs_index, lhs.cend());
306-
return difference;
307-
}
308-
309-
auto star_alignment::StarAligner::_insert_gaps(const std::vector<std::vector<indel>>& gaps) const
279+
auto star_alignment::StarAligner::_insert_gaps(const std::vector<std::vector<utils::Insertion>> &gaps) const
310280
-> std::vector<sequence_type>
311281
{
312282
size_t length = _lengths[0];
@@ -316,25 +286,10 @@ auto star_alignment::StarAligner::_insert_gaps(const std::vector<std::vector<ind
316286
aligned.reserve(_row);
317287

318288
for (size_t i = 0; i != _row; ++i)
319-
aligned.push_back(_insert_gaps(_sequences[i], gaps[i], length));
320-
return aligned;
321-
}
322-
323-
auto star_alignment::StarAligner::_insert_gaps(const sequence_type& sequence, const std::vector<indel>& gaps, size_t length)
324-
-> sequence_type
325-
{
326-
sequence_type gapped;
327-
gapped.reserve(length);
328-
329-
size_t sequence_index = 0;
330-
for (const auto gap : gaps)
331289
{
332-
const size_t new_sequence_index = sequence_index + (gap.index - sequence_index);
333-
gapped.insert(gapped.cend(), sequence.cbegin() + sequence_index, sequence.cbegin() + new_sequence_index);
334-
gapped.insert(gapped.cend(), gap.number, nucleic_acid_pseudo::GAP);
335-
336-
sequence_index = new_sequence_index;
290+
aligned.emplace_back(length);
291+
utils::Insertion::insert_gaps(_sequences[i].cbegin(), _sequences[i].cend(),
292+
gaps[i].cbegin(), gaps[i].cend(), aligned.back().begin(), nucleic_acid_pseudo::GAP);
337293
}
338-
gapped.insert(gapped.cend(), sequence.cbegin() + sequence_index, sequence.cend());
339-
return gapped;
294+
return aligned;
340295
}

src/StarAlignment/StarAligner.hpp

Lines changed: 12 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#pragma once
22

33
#include "../SuffixTree/SuffixTree.hpp"
4+
#include "../Utils/Utils.hpp"
45

56
#include <vector>
67
#include <array>
@@ -16,37 +17,29 @@ namespace star_alignment
1617
using sequence_type = std::vector<unsigned char>;
1718

1819
public:
19-
static std::vector<sequence_type> align(const std::vector<sequence_type>& sequences);
20-
static std::vector<triple> _optimal_path(const std::vector<triple>& identical_substrings);
20+
static std::vector<sequence_type> align(const std::vector<sequence_type> &sequences);
21+
static std::vector<std::vector<utils::Insertion>> get_gaps(const std::vector<sequence_type> &sequences);
2122

22-
private:
23-
struct indel
24-
{
25-
size_t index;
26-
size_t number;
27-
28-
bool operator==(const indel& rhs) { return index == rhs.index && number == rhs.number; }
29-
};
23+
static std::vector<triple> _optimal_path(const std::vector<triple> &identical_substrings);
3024

31-
StarAligner(const std::vector<sequence_type>& sequences);
25+
private:
26+
StarAligner(const std::vector<sequence_type> &sequences);
3227

3328
std::vector<sequence_type> _align() const;
29+
std::vector<std::vector<utils::Insertion>> _get_gaps() const;
3430

3531
std::vector<size_t> _set_lengths() const;
3632
sequence_type _set_centre() const;
3733

3834
// main steps
39-
std::vector<std::array<std::vector<indel>, 2>> _pairwise_align() const;
40-
std::vector<std::vector<indel>> _merge_results(const std::vector<std::array<std::vector<indel>, 2>>& pairwise_gaps) const;
41-
std::vector<sequence_type> _insert_gaps(const std::vector<std::vector<indel>>& gaps) const;
35+
std::vector<std::array<std::vector<utils::Insertion>, 2>> _pairwise_align() const;
36+
std::vector<std::vector<utils::Insertion>> _merge_results(const std::vector<std::array<std::vector<utils::Insertion>, 2>> &pairwise_gaps) const;
37+
std::vector<sequence_type> _insert_gaps(const std::vector<std::vector<utils::Insertion>> &gaps) const;
4238

4339
// support
44-
static void _converse(const std::vector<size_t>& src_gaps, std::vector<indel>& des_gaps, size_t start);
45-
static std::vector<indel> _add(const std::vector<indel>& lhs, const std::vector<indel>& rhs);
46-
static std::vector<indel> _minus(const std::vector<indel>& lhs, const std::vector<indel>& rhs);
47-
static sequence_type _insert_gaps(const sequence_type& sequence, const std::vector<indel>& gaps, size_t length);
40+
static void _append(const std::vector<size_t> &src_gaps, std::vector<utils::Insertion> &des_gaps, size_t start);
4841

49-
const std::vector<sequence_type>& _sequences;
42+
const std::vector<sequence_type> &_sequences;
5043
const size_t _row;
5144
const std::vector<size_t> _lengths;
5245

src/SuffixTree/LeftChildRightSiblingBinaryTree.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ void suffix_tree::experiment(const char *in_file, const char *out_file)
163163
exit(0);
164164
}
165165

166-
const auto [identifications, sequences] = utils::read_to_pseudo(ifs);
166+
const auto sequences = utils::read_to_pseudo(ifs);
167167
std::cout << sequences.size() << " sequences found\n";
168168
if (sequences.size() < 2) return;
169169

src/Utils/Fasta.cpp

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,29 +12,29 @@ void utils::Fasta::write_to(std::ostream& os, bool with_identification) const
1212

1313
void utils::Fasta::_read(std::istream& is)
1414
{
15-
std::string line, current;
15+
std::string each_line, each_sequence;
1616

17-
while (std::getline(is, line))
18-
if (line.size() && line[0] == '>')
17+
while (std::getline(is, each_line))
18+
if (each_line.size() && each_line[0] == '>')
1919
{
20-
identifications.push_back(line.substr(1));
20+
identifications.push_back(each_line.substr(1));
2121
break;
2222
}
2323

24-
while (std::getline(is, line))
24+
while (std::getline(is, each_line))
2525
{
26-
if (line.size() == 0) continue;
26+
if (each_line.size() == 0) continue;
2727

28-
if (line[0] == '>')
28+
if (each_line[0] == '>')
2929
{
30-
identifications.push_back(line.substr(1));
31-
sequences.push_back(std::move(current));
30+
identifications.push_back(each_line.substr(1));
31+
sequences.push_back(std::move(each_sequence));
3232
}
3333
else
3434
{
35-
current += line;
35+
each_sequence += each_line;
3636
}
3737
}
3838

39-
sequences.push_back(current);
39+
sequences.push_back(each_sequence);
4040
}

src/Utils/Fasta.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,10 @@ namespace utils
4444

4545
for (difference_type i = 0; i != len - 1; ++sequence_first, ++identification_first, ++i)
4646
{
47-
os << '<' << *identification_first << '\n';
47+
os << '>' << *identification_first << '\n';
4848
os << *sequence_first << '\n';
4949
}
50-
os << '<' << *identification_first << '\n';
50+
os << '>' << *identification_first << '\n';
5151
os << *sequence_first;
5252
}
5353

src/Utils/Insertion.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#include "Insertion.hpp"
2+
3+
bool utils::Insertion::operator==(const Insertion &rhs) const noexcept
4+
{
5+
return index == rhs.index && number == rhs.number;
6+
}

0 commit comments

Comments
 (0)