// cachebenchmark.cpp
#include "benchmark.hpp"
#include "utils/cpu_affinity.hpp"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <random>
#include <string>
#include <vector>
// Working-set sizes (in bytes) used by the benchmarks below.
// Each one is chosen to land in a different level of the memory hierarchy.

// L1 data cache is typically 32-48 KB per core, so 16 KB should stay resident in L1.
constexpr size_t L1_SIZE = size_t{16} << 10;    // 16 KB (safe L1 hit)

// L2 cache is typically 256 KB - 1 MB, so 512 KB spills out of L1 but fits in L2.
constexpr size_t L2_SIZE = size_t{512} << 10;   // 512 KB (exceeds L1, fits in L2)

// Main memory: a working set well beyond L3.
constexpr size_t RAM_SIZE = size_t{128} << 20;  // 128 MB (cold cache for RAM)
/**
 * Measures the latency of individual loads from a buffer of the given size.
 *
 * The buffer is treated as an array of uint64_t and is read in a shuffled
 * order (fixed seed 1337, so runs are repeatable). The shuffled order is
 * intended to keep hardware prefetching from hiding the true access latency.
 * Each load is bracketed by Bench::tick()/Bench::tock().
 *
 * @param label      Benchmark name. Passed to Bench together with
 *                   label + "_results.txt" (presumably the output file name;
 *                   confirm against Bench).
 * @param size       Buffer size in bytes. A size smaller than sizeof(uint64_t)
 *                   yields zero elements; the benchmark is then skipped with a
 *                   warning instead of dividing by zero.
 * @param iterations Number of timed loads. The index sequence wraps around
 *                   when this exceeds the number of elements.
 */
void run_cache_bench(const std::string& label, size_t size, int iterations) {
    std::cout << "\n>>> Starting Benchmark: " << label << " (Test Size: " << size / 1024 << " KB) <<<\n";

    // 1. Allocate and initialize buffer
    const size_t count = size / sizeof(uint64_t);
    if (count == 0) {
        // Nothing to access, and `i % count` below would be a modulo by zero.
        std::cout << "Warning: buffer too small for " << label << " (" << size << " bytes); skipping.\n";
        return;
    }
    std::vector<uint64_t> buffer(count);
    for (size_t i = 0; i < count; ++i) buffer[i] = i;

    // 2. Prepare random indices to defeat prefetcher
    std::vector<size_t> indices(count);
    for (size_t i = 0; i < count; ++i) indices[i] = i;
    std::shuffle(indices.begin(), indices.end(), std::mt19937(1337));

    Bench b(label, label + "_results.txt");
    b.start(iterations);

    // Use a volatile sink to prevent compiler optimization
    volatile uint64_t sum = 0;

    // Warm-up to ensure cache is primed (for L1/L2). At most 10000 accesses,
    // so i never reaches count and no wrap-around is needed here.
    const size_t warmup = std::min<size_t>(count, 10000);
    for (size_t i = 0; i < warmup; ++i) {
        sum += buffer[indices[i]];
    }

    // 3. Main benchmark loop
    for (int i = 0; i < iterations; ++i) {
        // The index is resolved before tick() so only the buffer load is timed.
        const size_t idx = indices[static_cast<size_t>(i) % count];
        b.tick();
        sum += buffer[idx]; // Load operation
        b.tock();
    }
    b.end();
}
/**
 * Measures store speed by performing random writes into a buffer.
 *
 * The buffer is treated as an array of uint64_t and is written in a shuffled
 * order (fixed seed 4242, so runs are repeatable). Each store is bracketed by
 * Bench::tick()/Bench::tock().
 *
 * @param label      Benchmark name. Passed to Bench together with
 *                   label + "_store_results.txt" (presumably the output file
 *                   name; confirm against Bench).
 * @param size       Buffer size in bytes. A size smaller than sizeof(uint64_t)
 *                   yields zero elements; the benchmark is then skipped with a
 *                   warning instead of dividing by zero.
 * @param iterations Number of timed stores. The index sequence wraps around
 *                   when this exceeds the number of elements.
 */
void run_store_bench(const std::string& label, size_t size, int iterations) {
    std::cout << "\n>>> Starting Benchmark: " << label << " (Store Speed, Test Size: " << size / 1024 << " KB) <<<\n";

    const size_t count = size / sizeof(uint64_t);
    if (count == 0) {
        // Nothing to write, and `i % count` below would be a modulo by zero.
        std::cout << "Warning: buffer too small for " << label << " (" << size << " bytes); skipping.\n";
        return;
    }
    std::vector<uint64_t> buffer(count, 0);

    std::vector<size_t> indices(count);
    for (size_t i = 0; i < count; ++i) indices[i] = i;
    std::shuffle(indices.begin(), indices.end(), std::mt19937(4242));

    // The buffer is never read back, so plain stores could be treated as dead
    // and removed by the optimizer. Writing through a volatile pointer forces
    // every store to be performed, between tick() and tock().
    volatile uint64_t* const target = buffer.data();

    Bench b(label, label + "_store_results.txt");
    b.start(iterations);
    for (int i = 0; i < iterations; ++i) {
        // Index and value are computed before tick() so only the store is timed.
        const size_t idx = indices[static_cast<size_t>(i) % count];
        const uint64_t val = static_cast<uint64_t>(i);
        b.tick();
        target[idx] = val; // Store operation
        b.tock();
    }
    b.end();
}
/**
 * Entry point: pins the thread to CPU 1 (best effort), then runs the load
 * benchmarks followed by the store benchmarks, each with one million
 * iterations.
 */
int main() {
    // Pinning is meant to prevent context-switching jitter. It is best effort:
    // any exception is swallowed and only a warning is printed.
    try {
        pin_thread_to_cpu(1);
        std::cout << "Thread pinned to CPU 1.\n";
    } catch (...) {
        std::cout << "Warning: Could not pin thread.\n";
    }

    constexpr int kIterations = 1000000;

    // One benchmark configuration: a label and the working-set size in bytes.
    struct Case {
        const char* label;
        size_t bytes;
    };

    // Load latency: L1, then L2, then RAM.
    const Case load_cases[] = {
        {"L1_Load_Latency", L1_SIZE},
        {"L2_Load_Latency", L2_SIZE},
        {"RAM_Load_Latency", RAM_SIZE},
    };
    for (const Case& c : load_cases) {
        run_cache_bench(c.label, c.bytes, kIterations);
    }

    // Store speed: L1, then RAM.
    const Case store_cases[] = {
        {"L1_Store_Latency", L1_SIZE},
        {"RAM_Store_Latency", RAM_SIZE},
    };
    for (const Case& c : store_cases) {
        run_store_bench(c.label, c.bytes, kIterations);
    }

    return 0;
}