|
97 | 97 | uint NumScanValues; |
98 | 98 | }; |
99 | 99 |
|
100 | | - groupshared uint gs_Histogram[FFX_PARALLELSORT_THREADGROUP_SIZE * FFX_PARALLELSORT_SORT_BIN_COUNT]; |
| 100 | + groupshared uint gs_FFX_PARALLELSORT_Histogram[FFX_PARALLELSORT_THREADGROUP_SIZE * FFX_PARALLELSORT_SORT_BIN_COUNT]; |
101 | 101 | void FFX_ParallelSort_Count_uint(uint localID, uint groupID, FFX_ParallelSortCB CBuffer, uint ShiftBit, RWStructuredBuffer<uint> SrcBuffer, RWStructuredBuffer<uint> SumTable) |
102 | 102 | { |
103 | 103 | // Start by clearing our local counts in LDS |
104 | 104 | for (int i = 0; i < FFX_PARALLELSORT_SORT_BIN_COUNT; i++) |
105 | | - gs_Histogram[(i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID] = 0; |
| 105 | + gs_FFX_PARALLELSORT_Histogram[(i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID] = 0; |
106 | 106 |
|
107 | 107 | // Wait for everyone to catch up |
108 | 108 | GroupMemoryBarrierWithGroupSync(); |
|
141 | 141 | if (DataIndex < CBuffer.NumKeys) |
142 | 142 | { |
143 | 143 | uint localKey = (srcKeys[i] >> ShiftBit) & 0xf; |
144 | | - InterlockedAdd(gs_Histogram[(localKey * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID], 1); |
| 144 | + InterlockedAdd(gs_FFX_PARALLELSORT_Histogram[(localKey * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID], 1); |
145 | 145 | DataIndex += FFX_PARALLELSORT_THREADGROUP_SIZE; |
146 | 146 | } |
147 | 147 | } |
|
156 | 156 | uint sum = 0; |
157 | 157 | for (int i = 0; i < FFX_PARALLELSORT_THREADGROUP_SIZE; i++) |
158 | 158 | { |
159 | | - sum += gs_Histogram[localID * FFX_PARALLELSORT_THREADGROUP_SIZE + i]; |
| 159 | + sum += gs_FFX_PARALLELSORT_Histogram[localID * FFX_PARALLELSORT_THREADGROUP_SIZE + i]; |
160 | 160 | } |
161 | 161 | SumTable[localID * CBuffer.NumThreadGroups + groupID] = sum; |
162 | 162 | } |
163 | 163 | } |
164 | 164 |
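The count pass above ends by writing each bin's per-threadgroup total to SumTable in bin-major order: all threadgroup counts for bin 0 first, then bin 1, and so on, which is the layout the later scan passes consume. A minimal sketch of reading that layout back (an illustrative helper, not part of the library):

```hlsl
// Sketch: total keys falling into one 4-bit bin, summed over all thread groups.
// Relies only on the bin-major layout written above:
//   SumTable[bin * NumThreadGroups + group]
uint Example_TotalKeysInBin(StructuredBuffer<uint> SumTable, uint numThreadGroups, uint bin)
{
    uint total = 0;
    for (uint group = 0; group < numThreadGroups; group++)
        total += SumTable[bin * numThreadGroups + group];
    return total;
}
```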
|
165 | | - groupshared uint gs_LDSSums[FFX_PARALLELSORT_THREADGROUP_SIZE]; |
| 165 | + groupshared uint gs_FFX_PARALLELSORT_LDSSums[FFX_PARALLELSORT_THREADGROUP_SIZE]; |
166 | 166 | uint FFX_ParallelSort_ThreadgroupReduce(uint localSum, uint localID) |
167 | 167 | { |
168 | 168 | // Do wave local reduce |
|
172 | 172 | // Note that some hardware with very small HW wave sizes (i.e. <= 8) may exhibit issues with this algorithm and has not been tested.
173 | 173 | uint waveID = localID / WaveGetLaneCount(); |
174 | 174 | if (WaveIsFirstLane()) |
175 | | - gs_LDSSums[waveID] = waveReduced; |
| 175 | + gs_FFX_PARALLELSORT_LDSSums[waveID] = waveReduced; |
176 | 176 |
|
177 | 177 | // Wait for everyone to catch up |
178 | 178 | GroupMemoryBarrierWithGroupSync(); |
179 | 179 |
|
180 | 180 | // First wave worth of threads sum up wave reductions |
181 | 181 | if (!waveID) |
182 | | - waveReduced = WaveActiveSum( (localID < FFX_PARALLELSORT_THREADGROUP_SIZE / WaveGetLaneCount()) ? gs_LDSSums[localID] : 0); |
| 182 | + waveReduced = WaveActiveSum( (localID < FFX_PARALLELSORT_THREADGROUP_SIZE / WaveGetLaneCount()) ? gs_FFX_PARALLELSORT_LDSSums[localID] : 0); |
183 | 183 |
|
184 | 184 | // Return the reduced sum
185 | 185 | return waveReduced; |
|
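FFX_ParallelSort_ThreadgroupReduce above is a standard two-level reduction: wave intrinsics collapse each wave to one partial, the first lane of every wave parks its partial in LDS, and after a barrier the first wave sums the partials. A self-contained sketch of the same pattern (illustrative names, assuming a 128-thread group; only wave 0 holds the final value, exactly as in the function above):

```hlsl
// Sketch of the two-level reduce used above, for a generic uint payload.
// 16 slots covers wave sizes down to 8 lanes for a 128-thread group.
groupshared uint gs_Example_Partials[16];
uint Example_GroupReduce(uint value, uint localID)
{
    uint waveSum = WaveActiveSum(value);           // level 1: reduce within the wave
    uint waveID  = localID / WaveGetLaneCount();
    if (WaveIsFirstLane())
        gs_Example_Partials[waveID] = waveSum;     // one partial per wave
    GroupMemoryBarrierWithGroupSync();
    if (!waveID)                                   // level 2: wave 0 reduces the partials
        waveSum = WaveActiveSum((localID < 128 / WaveGetLaneCount()) ? gs_Example_Partials[localID] : 0);
    return waveSum;
}
```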
196 | 196 |
|
197 | 197 | // Last element in a wave writes out partial sum to LDS |
198 | 198 | if (laneID == WaveGetLaneCount() - 1) |
199 | | - gs_LDSSums[waveID] = wavePrefixed + localSum; |
| 199 | + gs_FFX_PARALLELSORT_LDSSums[waveID] = wavePrefixed + localSum; |
200 | 200 |
|
201 | 201 | // Wait for everyone to catch up |
202 | 202 | GroupMemoryBarrierWithGroupSync(); |
203 | 203 |
|
204 | 204 | // First wave prefixes partial sums |
205 | 205 | if (!waveID) |
206 | | - gs_LDSSums[localID] = WavePrefixSum(gs_LDSSums[localID]); |
| 206 | + gs_FFX_PARALLELSORT_LDSSums[localID] = WavePrefixSum(gs_FFX_PARALLELSORT_LDSSums[localID]); |
207 | 207 |
|
208 | 208 | // Wait for everyone to catch up |
209 | 209 | GroupMemoryBarrierWithGroupSync(); |
210 | 210 |
|
211 | 211 | // Add the partial sums back to each wave prefix |
212 | | - wavePrefixed += gs_LDSSums[waveID]; |
| 212 | + wavePrefixed += gs_FFX_PARALLELSORT_LDSSums[waveID]; |
213 | 213 |
|
214 | 214 | return wavePrefixed; |
215 | 215 | } |
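The prefix version follows the same shape: WavePrefixSum inside each wave, the last lane of a wave deposits its inclusive total in LDS, wave 0 turns those totals into an exclusive prefix, and every wave adds its block offset back in. A worked trace under small assumed sizes makes the bookkeeping concrete:

```hlsl
// Worked trace (assumed: 8-thread group, 4-wide waves, every input = 1):
//   WavePrefixSum per wave:            [0 1 2 3] [0 1 2 3]
//   last lane writes prefix + input:   gs_..._LDSSums = { 4, 4 }
//   wave 0 prefixes the partials:      gs_..._LDSSums = { 0, 4 }
//   add each wave's partial back:      [0 1 2 3]+0, [0 1 2 3]+4  ->  [0..7]
```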
|
244 | 244 |
|
245 | 245 | // This turns uncoalesced global loads into coalesced loads, with the
246 | 246 | // scattered accesses then served by cheap reads from LDS
247 | | - groupshared int gs_LDS[FFX_PARALLELSORT_ELEMENTS_PER_THREAD][FFX_PARALLELSORT_THREADGROUP_SIZE]; |
| 247 | + groupshared int gs_FFX_PARALLELSORT_LDS[FFX_PARALLELSORT_ELEMENTS_PER_THREAD][FFX_PARALLELSORT_THREADGROUP_SIZE]; |
248 | 248 | void FFX_ParallelSort_ScanPrefix(uint numValuesToScan, uint localID, uint groupID, uint BinOffset, uint BaseIndex, bool AddPartialSums, |
249 | 249 | FFX_ParallelSortCB CBuffer, RWStructuredBuffer<uint> ScanSrc, RWStructuredBuffer<uint> ScanDst, RWStructuredBuffer<uint> ScanScratch) |
250 | 250 | { |
|
255 | 255 |
|
256 | 256 | uint col = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) / FFX_PARALLELSORT_ELEMENTS_PER_THREAD; |
257 | 257 | uint row = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) % FFX_PARALLELSORT_ELEMENTS_PER_THREAD; |
258 | | - gs_LDS[row][col] = (DataIndex < numValuesToScan) ? ScanSrc[BinOffset + DataIndex] : 0; |
| 258 | + gs_FFX_PARALLELSORT_LDS[row][col] = (DataIndex < numValuesToScan) ? ScanSrc[BinOffset + DataIndex] : 0; |
259 | 259 | } |
260 | 260 |
|
261 | 261 | // Wait for everyone to catch up |
|
265 | 265 | // Calculate the local scan-prefix for current thread |
266 | 266 | for (uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) |
267 | 267 | { |
268 | | - uint tmp = gs_LDS[i][localID]; |
269 | | - gs_LDS[i][localID] = threadgroupSum; |
| 268 | + uint tmp = gs_FFX_PARALLELSORT_LDS[i][localID]; |
| 269 | + gs_FFX_PARALLELSORT_LDS[i][localID] = threadgroupSum; |
270 | 270 | threadgroupSum += tmp; |
271 | 271 | } |
272 | 272 |
|
|
284 | 284 |
|
285 | 285 | // Add the block scanned-prefixes back in |
286 | 286 | for (uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++) |
287 | | - gs_LDS[i][localID] += threadgroupSum; |
| 287 | + gs_FFX_PARALLELSORT_LDS[i][localID] += threadgroupSum; |
288 | 288 |
|
289 | 289 | // Wait for everyone to catch up |
290 | 290 | GroupMemoryBarrierWithGroupSync(); |
|
298 | 298 | uint row = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) % FFX_PARALLELSORT_ELEMENTS_PER_THREAD; |
299 | 299 |
|
300 | 300 | if (DataIndex < numValuesToScan) |
301 | | - ScanDst[BinOffset + DataIndex] = gs_LDS[row][col] + partialSum; |
| 301 | + ScanDst[BinOffset + DataIndex] = gs_FFX_PARALLELSORT_LDS[row][col] + partialSum; |
302 | 302 | } |
303 | 303 | } |
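The row/col remapping at the top of FFX_ParallelSort_ScanPrefix is what the coalescing comment refers to: consecutive threads load consecutive global addresses, but each value is stored transposed in LDS so that the serial scan loop walks one contiguous run of values per thread. A worked trace under assumed small sizes:

```hlsl
// Worked trace (assumed: THREADGROUP_SIZE = 4, ELEMENTS_PER_THREAD = 4):
//   on iteration i, thread t loads element n = i*4 + t   (coalesced across t)
//   the value lands in gs_..._LDS[n % 4][n / 4]
//   so after the barrier, thread t scans gs_..._LDS[0..3][t],
//   i.e. elements 4t, 4t+1, 4t+2, 4t+3 in order.
```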
304 | 304 |
|
305 | 305 | // Offset cache to avoid loading the offsets all the time |
306 | | - groupshared uint gs_BinOffsetCache[FFX_PARALLELSORT_THREADGROUP_SIZE]; |
| 306 | + groupshared uint gs_FFX_PARALLELSORT_BinOffsetCache[FFX_PARALLELSORT_THREADGROUP_SIZE]; |
307 | 307 | // Local histogram for offset calculations |
308 | | - groupshared uint gs_LocalHistogram[FFX_PARALLELSORT_SORT_BIN_COUNT]; |
| 308 | + groupshared uint gs_FFX_PARALLELSORT_LocalHistogram[FFX_PARALLELSORT_SORT_BIN_COUNT]; |
309 | 309 | // Scratch area for algorithm |
310 | | - groupshared uint gs_LDSScratch[FFX_PARALLELSORT_THREADGROUP_SIZE]; |
| 310 | + groupshared uint gs_FFX_PARALLELSORT_LDSScratch[FFX_PARALLELSORT_THREADGROUP_SIZE]; |
311 | 311 | void FFX_ParallelSort_Scatter_uint(uint localID, uint groupID, FFX_ParallelSortCB CBuffer, uint ShiftBit, RWStructuredBuffer<uint> SrcBuffer, RWStructuredBuffer<uint> DstBuffer, RWStructuredBuffer<uint> SumTable |
312 | 312 | #ifdef kRS_ValueCopy |
313 | | - ,RWStructuredBuffer<uint> SrcPayload, RWStructuredBuffer<uint> DstPayload |
| 313 | + ,RWStructuredBuffer<uint> SrcPayload, RWStructuredBuffer<uint> DstPayload |
314 | 314 | #endif // kRS_ValueCopy |
315 | 315 | ) |
316 | 316 | { |
317 | 317 | // Load the sort bin threadgroup offsets into LDS for faster referencing |
318 | 318 | if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) |
319 | | - gs_BinOffsetCache[localID] = SumTable[localID * CBuffer.NumThreadGroups + groupID]; |
| 319 | + gs_FFX_PARALLELSORT_BinOffsetCache[localID] = SumTable[localID * CBuffer.NumThreadGroups + groupID]; |
320 | 320 |
|
321 | 321 | // Wait for everyone to catch up |
322 | 322 | GroupMemoryBarrierWithGroupSync(); |
|
363 | 363 | { |
364 | 364 | // Clear the local histogram |
365 | 365 | if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) |
366 | | - gs_LocalHistogram[localID] = 0; |
| 366 | + gs_FFX_PARALLELSORT_LocalHistogram[localID] = 0; |
367 | 367 |
|
368 | 368 | uint localKey = (DataIndex < CBuffer.NumKeys ? srcKeys[i] : 0xffffffff); |
369 | 369 | #ifdef kRS_ValueCopy |
|
386 | 386 | // Last thread stores the updated histogram counts for the thread group |
387 | 387 | // Scratch = 0xsum3|sum2|sum1|sum0 for thread group |
388 | 388 | if (localID == (FFX_PARALLELSORT_THREADGROUP_SIZE - 1)) |
389 | | - gs_LDSScratch[0] = localSum + packedHistogram; |
| 389 | + gs_FFX_PARALLELSORT_LDSScratch[0] = localSum + packedHistogram; |
390 | 390 |
|
391 | 391 | // Wait for everyone to catch up |
392 | 392 | GroupMemoryBarrierWithGroupSync(); |
393 | 393 |
|
394 | 394 | // Load the sums value for the thread group |
395 | | - packedHistogram = gs_LDSScratch[0]; |
| 395 | + packedHistogram = gs_FFX_PARALLELSORT_LDSScratch[0]; |
396 | 396 |
|
397 | 397 | // Add prefix offsets for all 4 bit "keys" (packedHistogram = 0xsum2_1_0|sum1_0|sum0|0) |
398 | 398 | packedHistogram = (packedHistogram << 8) + (packedHistogram << 16) + (packedHistogram << 24); |
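The shift-and-add above computes four byte-lane exclusive prefixes in one go. A worked example with assumed per-batch counts (sum0..sum3) = (3, 1, 4, 2):

```hlsl
// Worked example (assumed counts 3, 1, 4, 2 packed as 0xsum3|sum2|sum1|sum0):
//   p         = 0x02040103
//   (p << 8)  = 0x04010300
//   (p << 16) = 0x01030000
//   (p << 24) = 0x03000000
//   total     = 0x08040300   // byte lanes: 0, 3, 3+1, 3+1+4 = exclusive offsets
// Valid as long as each running byte stays <= 255, i.e. at most 255 keys per
// batch (one thread group's worth here).
```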
|
404 | 404 | uint keyOffset = (localSum >> (bitKey * 8)) & 0xff; |
405 | 405 |
|
406 | 406 | // Re-arrange the keys (store, sync, load) |
407 | | - gs_LDSSums[keyOffset] = localKey; |
| 407 | + gs_FFX_PARALLELSORT_LDSSums[keyOffset] = localKey; |
408 | 408 | GroupMemoryBarrierWithGroupSync(); |
409 | | - localKey = gs_LDSSums[localID]; |
| 409 | + localKey = gs_FFX_PARALLELSORT_LDSSums[localID]; |
410 | 410 |
|
411 | 411 | // Wait for everyone to catch up |
412 | 412 | GroupMemoryBarrierWithGroupSync(); |
413 | 413 |
|
414 | 414 | #ifdef kRS_ValueCopy |
415 | 415 | // Re-arrange the values if we have them (store, sync, load) |
416 | | - gs_LDSSums[keyOffset] = localValue; |
| 416 | + gs_FFX_PARALLELSORT_LDSSums[keyOffset] = localValue; |
417 | 417 | GroupMemoryBarrierWithGroupSync(); |
418 | | - localValue = gs_LDSSums[localID]; |
| 418 | + localValue = gs_FFX_PARALLELSORT_LDSSums[localID]; |
419 | 419 |
|
420 | 420 | // Wait for everyone to catch up |
421 | 421 | GroupMemoryBarrierWithGroupSync(); |
|
426 | 426 | uint keyIndex = (localKey >> ShiftBit) & 0xf; |
427 | 427 |
|
428 | 428 | // Reconstruct histogram |
429 | | - InterlockedAdd(gs_LocalHistogram[keyIndex], 1); |
| 429 | + InterlockedAdd(gs_FFX_PARALLELSORT_LocalHistogram[keyIndex], 1); |
430 | 430 |
|
431 | 431 | // Wait for everyone to catch up |
432 | 432 | GroupMemoryBarrierWithGroupSync(); |
433 | 433 |
|
434 | 434 | // Prefix histogram |
435 | | - uint histogramPrefixSum = WavePrefixSum(localID < FFX_PARALLELSORT_SORT_BIN_COUNT ? gs_LocalHistogram[localID] : 0); |
| 435 | + uint histogramPrefixSum = WavePrefixSum(localID < FFX_PARALLELSORT_SORT_BIN_COUNT ? gs_FFX_PARALLELSORT_LocalHistogram[localID] : 0); |
436 | 436 |
|
437 | 437 | // Broadcast prefix-sum via LDS |
438 | 438 | if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) |
439 | | - gs_LDSScratch[localID] = histogramPrefixSum; |
| 439 | + gs_FFX_PARALLELSORT_LDSScratch[localID] = histogramPrefixSum; |
440 | 440 |
|
441 | 441 | // Get the global offset for this key out of the cache |
442 | | - uint globalOffset = gs_BinOffsetCache[keyIndex]; |
| 442 | + uint globalOffset = gs_FFX_PARALLELSORT_BinOffsetCache[keyIndex]; |
443 | 443 |
|
444 | 444 | // Wait for everyone to catch up |
445 | 445 | GroupMemoryBarrierWithGroupSync(); |
446 | 446 |
|
447 | 447 | // Get the local offset (at this point the keys are sorted by bin, in increasing order across localID 0 -> thread group size)
448 | | - uint localOffset = localID - gs_LDSScratch[keyIndex]; |
| 448 | + uint localOffset = localID - gs_FFX_PARALLELSORT_LDSScratch[keyIndex]; |
449 | 449 |
|
450 | 450 | // Write to destination |
451 | 451 | uint totalOffset = globalOffset + localOffset; |
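Why `localID - gs_FFX_PARALLELSORT_LDSScratch[keyIndex]` is the rank within the bin: after the LDS sort the keys sit grouped by bin across localID, and the scratch entry for a bin is the thread index where that bin starts. A worked trace with assumed counts:

```hlsl
// Worked trace (assume bins 0 and 1 received 3 and 5 keys this batch):
//   prefix scratch:  gs_..._LDSScratch = { 0, 3, 8, ... }
//   thread 5 holds a bin-1 key  ->  localOffset = 5 - 3 = 2 (third key of bin 1)
//   totalOffset = gs_..._BinOffsetCache[1] + 2   // unique global slot
```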
|
464 | 464 |
|
465 | 465 | // Update the cached histogram for the next set of entries |
466 | 466 | if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT) |
467 | | - gs_BinOffsetCache[localID] += gs_LocalHistogram[localID]; |
| 467 | + gs_FFX_PARALLELSORT_BinOffsetCache[localID] += gs_FFX_PARALLELSORT_LocalHistogram[localID]; |
468 | 468 |
|
469 | 469 | DataIndex += FFX_PARALLELSORT_THREADGROUP_SIZE; // Increase the data offset by thread group size |
470 | 470 | } |
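For orientation, these kernels are driven from the host once per 4-bit digit of the key, ping-ponging source and destination buffers between iterations. A heavily simplified sketch of that ordering (pseudocode only; the sample's actual dispatch code also includes reduce/add scan stages and resource setup not shown here):

```hlsl
// Pseudocode, host-side ordering only (not HLSL that compiles as-is):
//   for (shift = 0; shift < 32; shift += 4)
//   {
//       Count(shift);      // FFX_ParallelSort_Count_uint    -> SumTable
//       ScanPrefix(...);   // per-bin / per-thread-group offsets
//       Scatter(shift);    // FFX_ParallelSort_Scatter_uint  -> Dst
//       Swap(Src, Dst);    // keys (and payloads) ping-pong each digit
//   }
```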
|