
Commit 40b1a9c

FidelityFX Parallel Sort v1.1
1 parent 0ab7e34 commit 40b1a9c

33 files changed: +4024 -1420 lines changed

.gitlab-ci.yml

Lines changed: 21 additions & 1 deletion
@@ -1,9 +1,11 @@
 variables:
-  SampleName: FfxParallelSort
+  SampleName: FFX_ParallelSort
   GIT_SUBMODULE_STRATEGY: normal
+
 stages:
   - build
   - deploy
+
 build_dx12:
   tags:
     - windows
@@ -15,17 +17,34 @@ build_dx12:
   artifacts:
     paths:
       - sample/bin/
+
+build_vk:
+  tags:
+    - windows
+    - amd64
+  stage: build
+  script:
+    - 'cmake -S sample -B sample/build/VK -G "Visual Studio 16 2019" -A x64 -DGFX_API=VK -DBUILD_INSTALLER=ON'
+    - 'cmake --build sample/build/VK --config Release'
+  artifacts:
+    paths:
+      - sample/bin/
+
 package_sample:
   tags:
     - windows
     - amd64
   stage: deploy
   dependencies:
     - build_dx12
+    - build_vk
   script:
     - echo "Packaging build"
+    - copy %VULKAN_SDK%\Bin\glslc.exe .\sample\bin
     - echo cd .\sample\bin\ > %SampleName%_DX12.bat
     - echo start %SampleName%_DX12.exe >> %SampleName%_DX12.bat
+    - echo cd .\sample\bin\ > %SampleName%_VK.bat
+    - echo start %SampleName%_VK.exe >> %SampleName%_VK.bat
   artifacts:
     name: "%SampleName%-%CI_COMMIT_TAG%-%CI_COMMIT_REF_NAME%-%CI_COMMIT_SHORT_SHA%"
     paths:
@@ -35,3 +54,4 @@ package_sample:
       - "readme.md"
       - "license.txt"
       - "%SampleName%_DX12.bat"
+      - "%SampleName%_VK.bat"

LICENSE.txt

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -16,4 +16,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
+THE SOFTWARE.

README.md

Lines changed: 27 additions & 5 deletions
@@ -1,15 +1,37 @@
 # FidelityFX Parallel Sort
 
+Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+## Parallel Sort
+
 The AMD FidelityFX Parallel Sort provides an open source header implementation to easily integrate a highly optimized compute-based radix sort into your game.
 
 Features of the implementation:
 
-- Direct and Indirect execution support
+- Direct and indirect execution support
 - RDNA+ optimized algorithm
-- Support for DirectX 12
+- Support for the Vulkan and Direct3D 12 APIs
 - Shaders written in HLSL utilizing SM 6.0 wave-level operations
-- DirectX 12 sample
+- A sample application is provided for both Direct3D 12 and Vulkan
 
-Resources:
+## Resources
 
-Introduction to GPU Radix Sort - http://www.heterogeneouscompute.org/wordpress/wp-content/uploads/2011/06/RadixSort.pdf
+[Introduction to GPU Radix Sort](http://www.heterogeneouscompute.org/wordpress/wp-content/uploads/2011/06/RadixSort.pdf)
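
The updated README describes FFX_ParallelSort.h as a header you include in your own compute shaders. As a rough, non-authoritative sketch of that integration (the entry point name, register bindings, and constant-buffer layout below are assumptions made for illustration; only the FFX_ParallelSort_* symbols come from the header, as shown in the diff further down), a count pass might be wired up like this:

```hlsl
// Hypothetical count-pass shader built on the FidelityFX Parallel Sort header.
// Binding slots and the surrounding cbuffer are illustrative assumptions.
#include "FFX_ParallelSort.h"

cbuffer SortConstants : register(b0)                // assumed binding
{
    FFX_ParallelSortCB CBuffer;                     // constant block defined by the header
    uint               ShiftBit;                    // which 4-bit digit this pass sorts on (assumed location)
};

RWStructuredBuffer<uint> SrcBuffer : register(u0);  // keys to sort
RWStructuredBuffer<uint> SumTable  : register(u1);  // per-bin, per-threadgroup counts

[numthreads(FFX_PARALLELSORT_THREADGROUP_SIZE, 1, 1)]
void CountPass(uint localID : SV_GroupThreadID, uint groupID : SV_GroupID)
{
    // Each threadgroup histograms the current 4-bit digit for its slice of the keys.
    FFX_ParallelSort_Count_uint(localID, groupID, CBuffer, ShiftBit, SrcBuffer, SumTable);
}
```

The scan and scatter passes visible in the header diff below (FFX_ParallelSort_ScanPrefix, FFX_ParallelSort_Scatter_uint) would be dispatched in the same way, one pass per 4-bit digit of the key.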

ffx-parallelsort/FFX_ParallelSort.h

Lines changed: 34 additions & 34 deletions
@@ -97,12 +97,12 @@
     uint NumScanValues;
 };
 
-groupshared uint gs_Histogram[FFX_PARALLELSORT_THREADGROUP_SIZE * FFX_PARALLELSORT_SORT_BIN_COUNT];
+groupshared uint gs_FFX_PARALLELSORT_Histogram[FFX_PARALLELSORT_THREADGROUP_SIZE * FFX_PARALLELSORT_SORT_BIN_COUNT];
 void FFX_ParallelSort_Count_uint(uint localID, uint groupID, FFX_ParallelSortCB CBuffer, uint ShiftBit, RWStructuredBuffer<uint> SrcBuffer, RWStructuredBuffer<uint> SumTable)
 {
     // Start by clearing our local counts in LDS
     for (int i = 0; i < FFX_PARALLELSORT_SORT_BIN_COUNT; i++)
-        gs_Histogram[(i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID] = 0;
+        gs_FFX_PARALLELSORT_Histogram[(i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID] = 0;
 
     // Wait for everyone to catch up
     GroupMemoryBarrierWithGroupSync();
@@ -141,7 +141,7 @@
         if (DataIndex < CBuffer.NumKeys)
         {
             uint localKey = (srcKeys[i] >> ShiftBit) & 0xf;
-            InterlockedAdd(gs_Histogram[(localKey * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID], 1);
+            InterlockedAdd(gs_FFX_PARALLELSORT_Histogram[(localKey * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID], 1);
             DataIndex += FFX_PARALLELSORT_THREADGROUP_SIZE;
         }
     }
@@ -156,13 +156,13 @@
         uint sum = 0;
         for (int i = 0; i < FFX_PARALLELSORT_THREADGROUP_SIZE; i++)
         {
-            sum += gs_Histogram[localID * FFX_PARALLELSORT_THREADGROUP_SIZE + i];
+            sum += gs_FFX_PARALLELSORT_Histogram[localID * FFX_PARALLELSORT_THREADGROUP_SIZE + i];
         }
         SumTable[localID * CBuffer.NumThreadGroups + groupID] = sum;
     }
 }
 
-groupshared uint gs_LDSSums[FFX_PARALLELSORT_THREADGROUP_SIZE];
+groupshared uint gs_FFX_PARALLELSORT_LDSSums[FFX_PARALLELSORT_THREADGROUP_SIZE];
 uint FFX_ParallelSort_ThreadgroupReduce(uint localSum, uint localID)
 {
     // Do wave local reduce
@@ -172,14 +172,14 @@
     // Note that some hardware with very small HW wave sizes (i.e. <= 8) may exhibit issues with this algorithm, and have not been tested.
     uint waveID = localID / WaveGetLaneCount();
     if (WaveIsFirstLane())
-        gs_LDSSums[waveID] = waveReduced;
+        gs_FFX_PARALLELSORT_LDSSums[waveID] = waveReduced;
 
     // Wait for everyone to catch up
     GroupMemoryBarrierWithGroupSync();
 
     // First wave worth of threads sum up wave reductions
     if (!waveID)
-        waveReduced = WaveActiveSum( (localID < FFX_PARALLELSORT_THREADGROUP_SIZE / WaveGetLaneCount()) ? gs_LDSSums[localID] : 0);
+        waveReduced = WaveActiveSum( (localID < FFX_PARALLELSORT_THREADGROUP_SIZE / WaveGetLaneCount()) ? gs_FFX_PARALLELSORT_LDSSums[localID] : 0);
 
     // Returned the reduced sum
     return waveReduced;
@@ -196,20 +196,20 @@
 
     // Last element in a wave writes out partial sum to LDS
     if (laneID == WaveGetLaneCount() - 1)
-        gs_LDSSums[waveID] = wavePrefixed + localSum;
+        gs_FFX_PARALLELSORT_LDSSums[waveID] = wavePrefixed + localSum;
 
     // Wait for everyone to catch up
     GroupMemoryBarrierWithGroupSync();
 
     // First wave prefixes partial sums
     if (!waveID)
-        gs_LDSSums[localID] = WavePrefixSum(gs_LDSSums[localID]);
+        gs_FFX_PARALLELSORT_LDSSums[localID] = WavePrefixSum(gs_FFX_PARALLELSORT_LDSSums[localID]);
 
     // Wait for everyone to catch up
     GroupMemoryBarrierWithGroupSync();
 
     // Add the partial sums back to each wave prefix
-    wavePrefixed += gs_LDSSums[waveID];
+    wavePrefixed += gs_FFX_PARALLELSORT_LDSSums[waveID];
 
     return wavePrefixed;
 }
@@ -244,7 +244,7 @@
 
 // This is to transform uncoalesced loads into coalesced loads and
 // then scattered loads from LDS
-groupshared int gs_LDS[FFX_PARALLELSORT_ELEMENTS_PER_THREAD][FFX_PARALLELSORT_THREADGROUP_SIZE];
+groupshared int gs_FFX_PARALLELSORT_LDS[FFX_PARALLELSORT_ELEMENTS_PER_THREAD][FFX_PARALLELSORT_THREADGROUP_SIZE];
 void FFX_ParallelSort_ScanPrefix(uint numValuesToScan, uint localID, uint groupID, uint BinOffset, uint BaseIndex, bool AddPartialSums,
                                  FFX_ParallelSortCB CBuffer, RWStructuredBuffer<uint> ScanSrc, RWStructuredBuffer<uint> ScanDst, RWStructuredBuffer<uint> ScanScratch)
 {
@@ -255,7 +255,7 @@
 
         uint col = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) / FFX_PARALLELSORT_ELEMENTS_PER_THREAD;
         uint row = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) % FFX_PARALLELSORT_ELEMENTS_PER_THREAD;
-        gs_LDS[row][col] = (DataIndex < numValuesToScan) ? ScanSrc[BinOffset + DataIndex] : 0;
+        gs_FFX_PARALLELSORT_LDS[row][col] = (DataIndex < numValuesToScan) ? ScanSrc[BinOffset + DataIndex] : 0;
     }
 
     // Wait for everyone to catch up
@@ -265,8 +265,8 @@
     // Calculate the local scan-prefix for current thread
     for (uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++)
     {
-        uint tmp = gs_LDS[i][localID];
-        gs_LDS[i][localID] = threadgroupSum;
+        uint tmp = gs_FFX_PARALLELSORT_LDS[i][localID];
+        gs_FFX_PARALLELSORT_LDS[i][localID] = threadgroupSum;
         threadgroupSum += tmp;
     }
 
@@ -284,7 +284,7 @@
 
     // Add the block scanned-prefixes back in
     for (uint i = 0; i < FFX_PARALLELSORT_ELEMENTS_PER_THREAD; i++)
-        gs_LDS[i][localID] += threadgroupSum;
+        gs_FFX_PARALLELSORT_LDS[i][localID] += threadgroupSum;
 
     // Wait for everyone to catch up
     GroupMemoryBarrierWithGroupSync();
@@ -298,25 +298,25 @@
         uint row = ((i * FFX_PARALLELSORT_THREADGROUP_SIZE) + localID) % FFX_PARALLELSORT_ELEMENTS_PER_THREAD;
 
         if (DataIndex < numValuesToScan)
-            ScanDst[BinOffset + DataIndex] = gs_LDS[row][col] + partialSum;
+            ScanDst[BinOffset + DataIndex] = gs_FFX_PARALLELSORT_LDS[row][col] + partialSum;
     }
 }
 
 // Offset cache to avoid loading the offsets all the time
-groupshared uint gs_BinOffsetCache[FFX_PARALLELSORT_THREADGROUP_SIZE];
+groupshared uint gs_FFX_PARALLELSORT_BinOffsetCache[FFX_PARALLELSORT_THREADGROUP_SIZE];
 // Local histogram for offset calculations
-groupshared uint gs_LocalHistogram[FFX_PARALLELSORT_SORT_BIN_COUNT];
+groupshared uint gs_FFX_PARALLELSORT_LocalHistogram[FFX_PARALLELSORT_SORT_BIN_COUNT];
 // Scratch area for algorithm
-groupshared uint gs_LDSScratch[FFX_PARALLELSORT_THREADGROUP_SIZE];
+groupshared uint gs_FFX_PARALLELSORT_LDSScratch[FFX_PARALLELSORT_THREADGROUP_SIZE];
 void FFX_ParallelSort_Scatter_uint(uint localID, uint groupID, FFX_ParallelSortCB CBuffer, uint ShiftBit, RWStructuredBuffer<uint> SrcBuffer, RWStructuredBuffer<uint> DstBuffer, RWStructuredBuffer<uint> SumTable
 #ifdef kRS_ValueCopy
-    ,RWStructuredBuffer<uint> SrcPayload, RWStructuredBuffer<uint> DstPayload
+    ,RWStructuredBuffer<uint> SrcPayload, RWStructuredBuffer<uint> DstPayload
 #endif // kRS_ValueCopy
 )
 {
     // Load the sort bin threadgroup offsets into LDS for faster referencing
     if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT)
-        gs_BinOffsetCache[localID] = SumTable[localID * CBuffer.NumThreadGroups + groupID];
+        gs_FFX_PARALLELSORT_BinOffsetCache[localID] = SumTable[localID * CBuffer.NumThreadGroups + groupID];
 
     // Wait for everyone to catch up
     GroupMemoryBarrierWithGroupSync();
@@ -363,7 +363,7 @@
     {
         // Clear the local histogram
         if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT)
-            gs_LocalHistogram[localID] = 0;
+            gs_FFX_PARALLELSORT_LocalHistogram[localID] = 0;
 
         uint localKey = (DataIndex < CBuffer.NumKeys ? srcKeys[i] : 0xffffffff);
 #ifdef kRS_ValueCopy
@@ -386,13 +386,13 @@
         // Last thread stores the updated histogram counts for the thread group
         // Scratch = 0xsum3|sum2|sum1|sum0 for thread group
         if (localID == (FFX_PARALLELSORT_THREADGROUP_SIZE - 1))
-            gs_LDSScratch[0] = localSum + packedHistogram;
+            gs_FFX_PARALLELSORT_LDSScratch[0] = localSum + packedHistogram;
 
         // Wait for everyone to catch up
         GroupMemoryBarrierWithGroupSync();
 
         // Load the sums value for the thread group
-        packedHistogram = gs_LDSScratch[0];
+        packedHistogram = gs_FFX_PARALLELSORT_LDSScratch[0];
 
         // Add prefix offsets for all 4 bit "keys" (packedHistogram = 0xsum2_1_0|sum1_0|sum0|0)
         packedHistogram = (packedHistogram << 8) + (packedHistogram << 16) + (packedHistogram << 24);
@@ -404,18 +404,18 @@
         uint keyOffset = (localSum >> (bitKey * 8)) & 0xff;
 
         // Re-arrange the keys (store, sync, load)
-        gs_LDSSums[keyOffset] = localKey;
+        gs_FFX_PARALLELSORT_LDSSums[keyOffset] = localKey;
         GroupMemoryBarrierWithGroupSync();
-        localKey = gs_LDSSums[localID];
+        localKey = gs_FFX_PARALLELSORT_LDSSums[localID];
 
         // Wait for everyone to catch up
         GroupMemoryBarrierWithGroupSync();
 
 #ifdef kRS_ValueCopy
         // Re-arrange the values if we have them (store, sync, load)
-        gs_LDSSums[keyOffset] = localValue;
+        gs_FFX_PARALLELSORT_LDSSums[keyOffset] = localValue;
         GroupMemoryBarrierWithGroupSync();
-        localValue = gs_LDSSums[localID];
+        localValue = gs_FFX_PARALLELSORT_LDSSums[localID];
 
         // Wait for everyone to catch up
         GroupMemoryBarrierWithGroupSync();
@@ -426,26 +426,26 @@
         uint keyIndex = (localKey >> ShiftBit) & 0xf;
 
         // Reconstruct histogram
-        InterlockedAdd(gs_LocalHistogram[keyIndex], 1);
+        InterlockedAdd(gs_FFX_PARALLELSORT_LocalHistogram[keyIndex], 1);
 
         // Wait for everyone to catch up
        GroupMemoryBarrierWithGroupSync();
 
         // Prefix histogram
-        uint histogramPrefixSum = WavePrefixSum(localID < FFX_PARALLELSORT_SORT_BIN_COUNT ? gs_LocalHistogram[localID] : 0);
+        uint histogramPrefixSum = WavePrefixSum(localID < FFX_PARALLELSORT_SORT_BIN_COUNT ? gs_FFX_PARALLELSORT_LocalHistogram[localID] : 0);
 
         // Broadcast prefix-sum via LDS
         if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT)
-            gs_LDSScratch[localID] = histogramPrefixSum;
+            gs_FFX_PARALLELSORT_LDSScratch[localID] = histogramPrefixSum;
 
         // Get the global offset for this key out of the cache
-        uint globalOffset = gs_BinOffsetCache[keyIndex];
+        uint globalOffset = gs_FFX_PARALLELSORT_BinOffsetCache[keyIndex];
 
         // Wait for everyone to catch up
         GroupMemoryBarrierWithGroupSync();
 
         // Get the local offset (at this point the keys are all in increasing order from 0 -> num bins in localID 0 -> thread group size)
-        uint localOffset = localID - gs_LDSScratch[keyIndex];
+        uint localOffset = localID - gs_FFX_PARALLELSORT_LDSScratch[keyIndex];
 
         // Write to destination
         uint totalOffset = globalOffset + localOffset;
@@ -464,7 +464,7 @@
 
         // Update the cached histogram for the next set of entries
         if (localID < FFX_PARALLELSORT_SORT_BIN_COUNT)
-            gs_BinOffsetCache[localID] += gs_LocalHistogram[localID];
+            gs_FFX_PARALLELSORT_BinOffsetCache[localID] += gs_FFX_PARALLELSORT_LocalHistogram[localID];
 
         DataIndex += FFX_PARALLELSORT_THREADGROUP_SIZE;    // Increase the data offset by thread group size
     }
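
Every hunk in FFX_ParallelSort.h is the same mechanical rename: the header's groupshared (LDS) arrays gain an FFX_PARALLELSORT_ prefix. The commit does not state the motivation, but a plausible reading is that, because this file is a header included directly into user shaders, unprefixed names such as gs_Histogram could collide with an application's own groupshared declarations. A minimal, hypothetical illustration of the situation the prefix avoids:

```hlsl
// Hypothetical application shader that includes the sort header.
// Before this commit the header also declared "gs_Histogram", which would
// conflict with the application-owned array below; after the rename the
// header's array is gs_FFX_PARALLELSORT_Histogram and the two can coexist.
#include "FFX_ParallelSort.h"

groupshared uint gs_Histogram[64];      // application-owned LDS, name chosen by the app

[numthreads(64, 1, 1)]
void AppHistogramPass(uint localID : SV_GroupThreadID)
{
    gs_Histogram[localID] = 0;          // touches the app's array, not the header's
    GroupMemoryBarrierWithGroupSync();
}
```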
