-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsortExternal.cpp
More file actions
236 lines (173 loc) · 6.14 KB
/
sortExternal.cpp
File metadata and controls
236 lines (173 loc) · 6.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
// Goal 1: Use 10 MB of memory to load the data and sort it.
// Goal 2: Merge all the data and produce a sorted file.
#include<iostream>
#include<fstream>
#include<vector>
#include<queue>
using namespace std;
int MAX_BUFF = 10000000;
const int NUM_ELEMENTS = MAX_BUFF/32;
float TOTAL_NUMBERS = 0;
float COMPLETED = 0;
ifstream inputStream;
void swap(vector<int> &vec, int left, int right) {
int temp = vec[left];
vec[left] = vec[right];
vec[right] = temp;
}
int partition(vector<int> &chunkVector, int start, int end) {
int pivot = chunkVector[start];
int lp = start;
for(int x = lp+1; x<=end; x++) {
if(chunkVector[x] < pivot) {
lp++;
swap(chunkVector, x, lp);
}
}
swap(chunkVector, start, lp);
return lp;
}
void quickSort(vector<int> &chunkVector, int start, int end) {
if(start < end) {
int pivot = partition(chunkVector, start, end);
quickSort(chunkVector, start, pivot-1);
quickSort(chunkVector, pivot+1, end);
}
}
void splitIntoChunks(string fileName, vector<string> &fileNames) {
inputStream.clear();
inputStream.open(fileName.c_str());
vector<string> chunkFileNames;
// TODO: Change to primitive int arr for performance
vector<int> chunkVector;
if(!inputStream) {
cout << "Error reading file " << fileName << endl;
}
int cur_buff = 0;
int cur_chunk = 0;
string chunk_prefix = fileName+"_chunk_";
string cur_chunk_file_name = chunk_prefix + to_string(cur_chunk);
// create an array which holds 32 bit int data
int num;
while(inputStream >> num) {
TOTAL_NUMBERS++;
chunkVector.push_back(num);
if(chunkVector.size() == NUM_ELEMENTS) {
// perform quicksort on chunkVector;
quickSort(chunkVector, 0, chunkVector.size()-1);
// write to cur_chunk_file_name
ofstream output_stream(cur_chunk_file_name);
output_stream.clear();
for(int val : chunkVector) {
output_stream << val << endl;
}
chunkVector.clear();
// add the current file_name to chunkFileNames
fileNames.push_back(cur_chunk_file_name);
// update cur_chunk && cur_chunk_file_name;
cur_chunk++;
cur_chunk_file_name = chunk_prefix + to_string(cur_chunk);
// update output_stream;
output_stream.close();
}
}
if(chunkVector.size()) {
quickSort(chunkVector, 0, chunkVector.size()-1);
ofstream output_stream(cur_chunk_file_name);
output_stream.clear();
for(int val : chunkVector) {
output_stream << val << endl;
}
chunkVector.clear();
output_stream.close();
fileNames.push_back(cur_chunk_file_name);
}
inputStream.close();
}
// Here we learn about some of the limits of our operating system:
// There is a scenario where we have a relatively large file ex: 1TB
// and we use a buffer of 1MB. We will split the data into 1000000 chunks (files).
// When we want to combine them, we read all the files in a k-merge algorithm and fill our buffer.
// There is a limit to how many files we can have open on my computer it is 98304 files.
// This can be found using `sysctl kern.maxfiles` (mac) and `ulimit -a` for ubuntu
// If you are working with a small RAM you will be able to open fewer files.
// WIP: Doesn't consider memory implications
struct FileStreamWithPriority
{
int priority;
ifstream* file;
};
struct CompareVal {
bool operator()(const FileStreamWithPriority* lhs, const FileStreamWithPriority* rhs) {
return lhs->priority > rhs->priority;
}
};
void combineChunks(vector<string> &fileNames) {
// initializing;
vector<ifstream*> chunkedFilePointers;
priority_queue <FileStreamWithPriority*, vector<FileStreamWithPriority*>, CompareVal > pq;
for(int i = 0; i<fileNames.size(); i++) {
ifstream *n = new ifstream;
n->open(fileNames[i]);
chunkedFilePointers.push_back(n);
}
ofstream out("output.txt");
int maxPerFile = NUM_ELEMENTS / chunkedFilePointers.size();
for(ifstream *ptr: chunkedFilePointers) {
int num;
int count = 0;
while(*ptr >> num && count < maxPerFile) {
FileStreamWithPriority *temp = new FileStreamWithPriority;
temp->file = ptr;
temp->priority = num;
pq.push(temp);
count++;
}
}
cout << " pq size is " << pq.size()*32 << " maxperfile is " << maxPerFile << " " << chunkedFilePointers.size() << endl;
while(pq.size()) {
FileStreamWithPriority *top = pq.top();
out << pq.top()->priority <<endl;
COMPLETED++;
float comp = (COMPLETED/TOTAL_NUMBERS)*100;
// cout << comp << " Percent Done" <<endl;
pq.pop();
int num;
if(*top->file >> num) {
top->priority = num;
pq.push(top);
}
}
for(string fileName: fileNames) {
remove(fileName.c_str());
}
}
int main() {
remove("output.txt");
vector<string> inputFiles;
cout << "Enter the file names that contain data : (type done)" << endl;
string file;
while(true) {
cin >> file;
if(file == "done") break;
inputFiles.push_back(file);
}
cout << "Sorting through the following files :" << endl;
for(string x: inputFiles) {
cout << x << endl;
}
// 1. sort each file into a new sorted file
// 1.a Read MAX_BUFF chunks and store in new files
// TODO: Change from vec to storing into a folder
vector<string> fileNames;
const clock_t begin_time = clock();
for(int i = 0; i < inputFiles.size(); i++){
cout << "Breaking " << inputFiles[i] << " into chunks" << endl;
splitIntoChunks(inputFiles[i], fileNames);
}
cout << "Combining chunks" << endl;
cout << "Total Number of Ints to combine " << TOTAL_NUMBERS << endl;
combineChunks(fileNames);
cout << "Total time taken :" << float(clock() - begin_time) / CLOCKS_PER_SEC << endl;
return 0;
}