|
41 | 41 | } \ |
42 | 42 | } |
43 | 43 |
|
44 | | -// dim_opt: number of elements in 512 bits. |
45 | | -// i.e. 512/sizeof(type): FP32 = 16, FP64 = 8, BF16 = 32 ... |
46 | | -// The is the number of elements calculated in each distance function loop, |
47 | | -// regardless of the arch optimization type. |
48 | | -// Run each function for {1, 4, 16} iterations. |
49 | | -#define EXACT_512BIT_PARAMS(dim_opt) RangeMultiplier(4)->Range(dim_opt, 400) |
50 | | - |
51 | | -// elem_per_128_bits: dim_opt / 4. FP32 = 4, FP64 = 2, BF16 = 8... |
52 | | -// Dimensions to test 128 bit chunks. |
53 | | -// Run each function at least one full 512 bits iteration + 1/2/3 iterations of 128 bit chunks. |
54 | | -#define EXACT_128BIT_PARAMS(elem_per_128_bits) \ |
55 | | - DenseRange(128 + elem_per_128_bits, 128 + 3 * elem_per_128_bits, elem_per_128_bits) |
56 | | - |
57 | | -// Run each function at least one full 512 bits iteration + (1 * elements : elem_per_128_bits * |
58 | | -// elements) FP32 = residual = 1,2,3, FP64 = residual = 1, BF16 = residual = 1,2,3,4,5,6,7... |
59 | | -#define RESIDUAL_PARAMS(elem_per_128_bits) DenseRange(128 + 1, 128 + elem_per_128_bits - 1, 1) |
60 | | - |
61 | 44 | #define INITIALIZE_BM(bm_class, type_prefix, arch, metric, bm_name, arch_supported) \ |
62 | 45 | BENCHMARK_DISTANCE_F(bm_class, type_prefix, arch, metric, bm_name, arch_supported) \ |
63 | 46 | BENCHMARK_REGISTER_F(bm_class, type_prefix##_##arch##_##metric##_##bm_name) \ |
64 | 47 | ->ArgName("Dimension") \ |
65 | 48 | ->Unit(benchmark::kNanosecond) |
66 | 49 |
|
| 50 | +/** |
| 51 | + * A number that is |
| 52 | + * 1. divisible by 32 to ensure that we have at least one full 512 bits iteration in all types |
| 53 | + * 2. higher than the minimum dimension required to choose all possible optimizations. |
| 54 | + * (currently it's 500 for IP with AVX512_FP16) |
| 55 | + */ |
| 56 | +static constexpr size_t min_no_res_th_dim = 512; |
| 57 | + |
| 58 | +/** |
| 59 | + * @param dim_opt: Number of elements in 512 bits. |
| 60 | + */ |
| 61 | + |
| 62 | +/** |
| 63 | + * @param dim_opt is also the smallest dimension that satisfies: |
| 64 | + * dim % num_elements_in_512_bits == 0. |
| 65 | + * We use it to start this set of BM from the smallest dimension that satisfies the above condition. |
| 66 | + * RangeMultiplier(val)->Range(start, end) generates powers of `val` in the range [start, end], |
| 67 | + * including `start` and `end`. |
| 68 | + */ |
67 | 69 | #define INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, metric, dim_opt, arch_supported) \ |
68 | 70 | INITIALIZE_BM(bm_class, type_prefix, arch, metric, 512_bit_chunks, arch_supported) \ |
69 | | - ->EXACT_512BIT_PARAMS(dim_opt) |
70 | | - |
71 | | -#define INITIALIZE_EXACT_128BIT_BM(bm_class, type_prefix, arch, metric, dim_opt, arch_supported) \ |
72 | | - INITIALIZE_BM(bm_class, type_prefix, arch, metric, 128_bit_chunks, arch_supported) \ |
73 | | - ->EXACT_128BIT_PARAMS(dim_opt / 4) |
| 71 | + ->RangeMultiplier(4) \ |
| 72 | + ->Range(dim_opt, 1024) |
74 | 73 |
|
| 74 | +/** For `start` = min_no_res_th_dim (defined above), we run the benchmark for every dimension |
| 75 | + * in the range (start + 1, start + 2, ..., start + dim_opt - 1), |
| 76 | + * to test all possible residual cases. |
| 77 | + */ |
| 78 | +static constexpr size_t start = min_no_res_th_dim; |
75 | 79 | #define INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, metric, dim_opt, arch_supported) \ |
76 | 80 | INITIALIZE_BM(bm_class, type_prefix, arch, metric, residual, arch_supported) \ |
77 | | - ->RESIDUAL_PARAMS(dim_opt / 4) |
| 81 | + ->DenseRange(start + 1, start + dim_opt - 1, 1) |
78 | 82 |
|
| 83 | +/** Test high dim |
| 84 | + * This range satisfies at least one full 512 bits iteration in all types. |
| 85 | + */ |
79 | 86 | #define INITIALIZE_HIGH_DIM(bm_class, type_prefix, arch, metric, arch_supported) \ |
80 | 87 | INITIALIZE_BM(bm_class, type_prefix, arch, metric, high_dim, arch_supported) \ |
81 | 88 | ->DenseRange(900, 1000, 15) |
82 | 89 |
|
83 | | -// Naive algorithms |
| 90 | +/** Test low dim |
| 91 | + * This range satisfies at least one full 512-bit iteration in all types (160). |
| 92 | + */ |
| 93 | +#define INITIALIZE_LOW_DIM(bm_class, type_prefix, arch, metric, arch_supported) \ |
| 94 | + INITIALIZE_BM(bm_class, type_prefix, arch, metric, low_dim, arch_supported) \ |
| 95 | + ->DenseRange(100, 200, 15) |
84 | 96 |
|
| 97 | +/* Naive algorithms */ |
85 | 98 | #define BENCHMARK_DEFINE_NAIVE(bm_class, type_prefix, metric) \ |
86 | 99 | BENCHMARK_DEFINE_F(bm_class, type_prefix##_NAIVE_##metric) \ |
87 | 100 | (benchmark::State & st) { \ |
|
102 | 115 |
|
103 | 116 | #define INITIALIZE_BENCHMARKS_SET_L2(bm_class, type_prefix, arch, dim_opt, arch_supported) \ |
104 | 117 | INITIALIZE_HIGH_DIM(bm_class, type_prefix, arch, L2, arch_supported); \ |
105 | | - INITIALIZE_EXACT_128BIT_BM(bm_class, type_prefix, arch, L2, dim_opt, arch_supported); \ |
| 118 | + INITIALIZE_LOW_DIM(bm_class, type_prefix, arch, L2, arch_supported); \ |
106 | 119 | INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, L2, dim_opt, arch_supported); \ |
107 | 120 | INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, L2, dim_opt, arch_supported); |
108 | 121 |
|
109 | 122 | #define INITIALIZE_BENCHMARKS_SET_IP(bm_class, type_prefix, arch, dim_opt, arch_supported) \ |
110 | 123 | INITIALIZE_HIGH_DIM(bm_class, type_prefix, arch, IP, arch_supported); \ |
111 | | - INITIALIZE_EXACT_128BIT_BM(bm_class, type_prefix, arch, IP, dim_opt, arch_supported); \ |
| 124 | + INITIALIZE_LOW_DIM(bm_class, type_prefix, arch, IP, arch_supported); \ |
112 | 125 | INITIALIZE_EXACT_512BIT_BM(bm_class, type_prefix, arch, IP, dim_opt, arch_supported); \ |
113 | 126 | INITIALIZE_RESIDUAL_BM(bm_class, type_prefix, arch, IP, dim_opt, arch_supported); |
114 | 127 |
|
|
0 commit comments