@@ -14,7 +14,7 @@ namespace blit
1414
1515struct parameters_t
1616{
17- float32_t3 fScale;
17+ float32_t3 fScale; //
1818 float32_t3 negativeSupport;
1919 float32_t referenceAlpha;
2020 uint32_t kernelWeightsOffsetY;
@@ -24,17 +24,15 @@ struct parameters_t
2424
2525 uint16_t3 inputDims;
2626 uint16_t3 outputDims;
27- uint16_t3 windowDims;
27+ uint16_t3 windowDims; //
2828 uint16_t3 phaseCount;
29- uint16_t3 preloadRegion;
29+ uint16_t3 preloadRegion; //
3030 uint16_t3 iterationRegionXPrefixProducts;
3131 uint16_t3 iterationRegionYPrefixProducts;
3232 uint16_t3 iterationRegionZPrefixProducts;
3333
34- //! Offset into the shared memory array which tells us from where the second buffer of shared memory begins
35- //! Given by max(memory_for_preload_region, memory_for_result_of_y_pass)
36- uint16_t secondScratchOffset;
37- uint16_t outputTexelsPerWGZ;
34+ uint16_t secondScratchOffset; //
35+ uint16_t outputTexelsPerWGZ; //
3836
3937 uint32_t3 getOutputTexelsPerWG ()
4038 {
@@ -44,36 +42,79 @@ struct parameters_t
4442 }
4543};
4644
47- struct parameters2_t
45+ // We do some dumb things with bitfields here like not using `vector<uint16_t,N>`, because AMD doesn't support them in push constants
46+ struct SPerWorkgroup
4847{
49- float32_t3 fScale;
50- float32_t3 negativeSupportMinusHalf;
51- float32_t referenceAlpha;
52- uint32_t kernelWeightsOffsetY;
53- uint32_t kernelWeightsOffsetZ;
54- uint32_t inPixelCount;
55- uint32_t outPixelCount;
48+ static inline SPerWorkgroup create (const float32_t3 _scale, const uint16_t3 output, const uint16_t3 preload, const uint16_t _otherPreloadOffset)
49+ {
50+ SPerWorkgroup retval;
51+ retval.scale = _scale;
52+ retval.preloadWidth = preload[0 ];
53+ retval.preloadHeight = preload[1 ];
54+ retval.preloadDepth = preload[2 ];
55+ retval.outputWidth = output[0 ];
56+ retval.outputHeight = output[1 ];
57+ retval.outputDepth = output[2 ];
58+ retval.otherPreloadOffset = _otherPreloadOffset;
59+ return retval;
60+ }
5661
57- uint16_t3 inputDims;
58- uint16_t3 outputDims;
59- uint16_t3 windowDims;
60- uint16_t3 phaseCount;
61- uint16_t3 preloadRegion;
62- uint16_t3 iterationRegionXPrefixProducts;
63- uint16_t3 iterationRegionYPrefixProducts;
64- uint16_t3 iterationRegionZPrefixProducts;
62+ inline uint16_t3 getOutput () NBL_CONST_MEMBER_FUNC
63+ {
64+ return uint16_t3 (outputWidth,outputHeight,outputDepth);
65+ }
66+
67+ inline uint16_t3 getWorkgroupCount (const uint16_t3 outExtent, const uint16_t layersToBlit=0 ) NBL_CONST_MEMBER_FUNC
68+ {
69+ uint16_t3 retval = uint16_t3 (1 ,1 ,1 );
70+ retval += (outExtent-uint16_t3 (1 ,1 ,1 ))/getOutput ();
71+ if (layersToBlit)
72+ retval[3 ] = layersToBlit;
73+ return retval;
74+ }
6575
76+ #ifndef __HLSL_VERSION
77+ inline operator bool () const
78+ {
79+ return outputWidth && outputHeight && outputDepth && preloadWidth && preloadHeight && preloadDepth;
80+ }
81+ #endif
82+
83+ // ratio of input pixels to output
84+ float32_t3 scale;
85+ // 16bit in each dimension because some GPUs actually have enough shared memory for 32k pixels
86+ uint32_t outputWidth : 16 ;
87+ uint32_t outputHeight : 16 ;
88+ uint32_t outputDepth : 16 ;
89+ uint32_t unused0 : 16 ; // channel, image type, iterationRegionPrefixSums ?
90+ uint32_t preloadWidth : 16 ;
91+ uint32_t preloadHeight : 16 ;
92+ uint32_t preloadDepth : 16 ;
6693 //! Offset into the shared memory array which tells us from where the second buffer of shared memory begins
6794 //! Given by max(memory_for_preload_region, memory_for_result_of_y_pass)
68- uint16_t secondScratchOffset ;
69- uint16_t outputTexelsPerWGZ ;
95+ uint32_t otherPreloadOffset : 16 ;
96+ } ;
7097
71- uint32_t3 getOutputTexelsPerWG ()
98+ struct Parameters
99+ {
100+ static Parameters create (
101+ const SPerWorkgroup perWG,
102+ const uint16_t3 inImageExtent, const uint16_t3 outImageExtent
103+ )
72104 {
73- //! `outputTexelsPerWG.xy` just happens to be in the first components of `iterationRegionsXPrefixProducts` and `iterationRegionYPrefixProducts` --this is
74- //! the result of how we choose to iterate, i.e. if, in the future, we decide to iterate differently, this needs to change.
75- return uint32_t3 (iterationRegionXPrefixProducts.x, iterationRegionYPrefixProducts.x, outputTexelsPerWGZ) ;
105+ Parameters retval;
106+ retval.perWG = perWG;
107+ return retval ;
76108 }
109+
110+ SPerWorkgroup perWG;
111+ // general settings
112+ uint32_t lastChannel : 2 ;
113+ uint32_t coverage : 1 ;
114+ uint32_t unused : 29 ;
115+ //! coverage settings
116+ // required to compare the atomic count of passing pixels against, so we can get original coverage
117+ uint32_t inPixelCount;
77118};
78119
79120
0 commit comments