start computing the push constants

devsh · devsh · commit 6d743f3055d4 · 2024-11-08T15:09:07.000+01:00
diff --git a/include/nbl/builtin/hlsl/blit/common.hlsl b/include/nbl/builtin/hlsl/blit/common.hlsl
@@ -44,6 +44,8 @@ RWTexture3D<float4> outAs3D[ConstevalParameters::output_binding_t::Count];
 
 groupshared uint32_t sMem[ConstevalParameters::SharedMemoryDWORDs];
 
+[[vk::push_constant]] const nbl::hlsl::blit::SPerWorkgroup pc;
+
 
 #include <nbl/builtin/hlsl/concepts.hlsl>
 /*
diff --git a/include/nbl/builtin/hlsl/blit/compute_blit.hlsl b/include/nbl/builtin/hlsl/blit/compute_blit.hlsl
@@ -77,6 +77,7 @@ struct compute_blit_t
 		uint16_t localInvocationIndex)
 	{
 		const float3 halfScale = scale * float3(0.5f, 0.5f, 0.5f);
+		// bottom of the input tile
 		const uint32_t3 minOutputPixel = workGroupID * outputTexelsPerWG;
 		const float3 minOutputPixelCenterOfWG = float3(minOutputPixel)*scale + halfScale;
 		// this can be negative, in which case HW sampler takes care of wrapping for us
diff --git a/include/nbl/builtin/hlsl/blit/parameters.hlsl b/include/nbl/builtin/hlsl/blit/parameters.hlsl
@@ -14,7 +14,7 @@ namespace blit
 
 struct parameters_t
 {
-	float32_t3 fScale;
+	float32_t3 fScale; //
 	float32_t3 negativeSupport;
 	float32_t referenceAlpha;
 	uint32_t kernelWeightsOffsetY;
@@ -24,17 +24,15 @@ struct parameters_t
 
 	uint16_t3 inputDims;
 	uint16_t3 outputDims;
-	uint16_t3 windowDims;
+	uint16_t3 windowDims; //
 	uint16_t3 phaseCount;
-	uint16_t3 preloadRegion;
+	uint16_t3 preloadRegion; //
 	uint16_t3 iterationRegionXPrefixProducts;
 	uint16_t3 iterationRegionYPrefixProducts;
 	uint16_t3 iterationRegionZPrefixProducts;
 
-	//! Offset into the shared memory array which tells us from where the second buffer of shared memory begins
-	//! Given by max(memory_for_preload_region, memory_for_result_of_y_pass)
-	uint16_t secondScratchOffset;
-	uint16_t outputTexelsPerWGZ;
+	uint16_t secondScratchOffset; //
+	uint16_t outputTexelsPerWGZ; //
 
 	uint32_t3 getOutputTexelsPerWG()
 	{
@@ -44,36 +42,79 @@ struct parameters_t
 	}
 };
 
-struct parameters2_t
+// We do some dumb things with bitfields here like not using `vector<uint16_t,N>`, because AMD doesn't support them in push constants
+struct SPerWorkgroup
 {
-	float32_t3 fScale;
-	float32_t3 negativeSupportMinusHalf;
-	float32_t referenceAlpha;
-	uint32_t kernelWeightsOffsetY;
-	uint32_t kernelWeightsOffsetZ;
-	uint32_t inPixelCount;
-	uint32_t outPixelCount;
+	static inline SPerWorkgroup create(const float32_t3 _scale, const uint16_t3 output, const uint16_t3 preload, const uint16_t _otherPreloadOffset)
+	{
+		SPerWorkgroup retval; // NOTE(review): `unused0` is never assigned in this factory
+		retval.scale = _scale;
+		retval.preloadWidth = preload[0]; // vectors are unpacked component-wise into the 16-bit bitfields
+		retval.preloadHeight = preload[1];
+		retval.preloadDepth = preload[2];
+		retval.outputWidth = output[0];
+		retval.outputHeight = output[1];
+		retval.outputDepth = output[2];
+		retval.otherPreloadOffset = _otherPreloadOffset;
+		return retval;
+	}
 
-	uint16_t3 inputDims;
-	uint16_t3 outputDims;
-	uint16_t3 windowDims;
-	uint16_t3 phaseCount;
-	uint16_t3 preloadRegion;
-	uint16_t3 iterationRegionXPrefixProducts;
-	uint16_t3 iterationRegionYPrefixProducts;
-	uint16_t3 iterationRegionZPrefixProducts;
+	inline uint16_t3 getOutput() NBL_CONST_MEMBER_FUNC
+	{
+		return uint16_t3(outputWidth,outputHeight,outputDepth); // repack the three output bitfields into one vector
+	}
+
+	inline uint16_t3 getWorkgroupCount(const uint16_t3 outExtent, const uint16_t layersToBlit=0) NBL_CONST_MEMBER_FUNC
+	{
+		uint16_t3 retval = uint16_t3(1,1,1); // per axis: 1+(outExtent-1)/getOutput(), a round-up division for non-zero extents
+		retval += (outExtent-uint16_t3(1,1,1))/getOutput();
+		if (layersToBlit) // a non-zero layer count overrides the Z workgroup count
+			retval[2] = layersToBlit; // valid indices of a 3-component vector are 0..2, `[3]` was out of bounds
+		return retval;
+	}
 
+#ifndef __HLSL_VERSION // conversion operator is only compiled when __HLSL_VERSION is not defined
+	inline operator bool() const
+	{
+		return outputWidth && outputHeight && outputDepth && preloadWidth && preloadHeight && preloadDepth; // true only if all six dimensions are non-zero; `scale` and `otherPreloadOffset` are not checked
+	}
+#endif
+
+	// ratio of input pixels to output
+	float32_t3 scale;
+	// 16bit in each dimension because some GPUs actually have enough shared memory for 32k pixels
+	uint32_t outputWidth	: 16;
+	uint32_t outputHeight	: 16;
+	uint32_t outputDepth	: 16;
+	uint32_t unused0		: 16; // channel, image type, iterationRegionPrefixSums ?
+	uint32_t preloadWidth		: 16;
+	uint32_t preloadHeight		: 16;
+	uint32_t preloadDepth		: 16;
 	//! Offset into the shared memory array which tells us from where the second buffer of shared memory begins
 	//! Given by max(memory_for_preload_region, memory_for_result_of_y_pass)
-	uint16_t secondScratchOffset;
-	uint16_t outputTexelsPerWGZ;
+	uint32_t otherPreloadOffset	: 16;
+};
 
-	uint32_t3 getOutputTexelsPerWG()
+struct Parameters
+{
+	static Parameters create(
+		const SPerWorkgroup perWG,
+		const uint16_t3 inImageExtent, const uint16_t3 outImageExtent // NOTE(review): neither extent is read by the body below
+	)
 	{
-		//! `outputTexelsPerWG.xy` just happens to be in the first components of `iterationRegionsXPrefixProducts` and `iterationRegionYPrefixProducts` --this is
-		//! the result of how we choose to iterate, i.e. if, in the future, we decide to iterate differently, this needs to change.
-		return uint32_t3(iterationRegionXPrefixProducts.x, iterationRegionYPrefixProducts.x, outputTexelsPerWGZ);
+		Parameters retval;
+		retval.perWG = perWG;
+		return retval; // NOTE(review): only `perWG` is assigned; the bitfields and `inPixelCount` are left unset here
 	}
+
+	SPerWorkgroup perWG;
+	// general settings
+	uint32_t lastChannel : 2;
+	uint32_t coverage : 1;
+	uint32_t unused : 29;
+	//! coverage settings
+	// required to compare the atomic count of passing pixels against, so we can get original coverage
+	uint32_t inPixelCount;
 };
 
 
diff --git a/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h b/include/nbl/builtin/hlsl/cpp_compat/intrinsics.h
@@ -66,6 +66,14 @@ NBL_BIT_OP_GLM_PASSTHROUGH(findLSB,findLSB)
 
 NBL_BIT_OP_GLM_PASSTHROUGH(findMSB,findMSB)
 
+// TODO: some of the functions in this header should move to `tgmath`
+template<typename T> requires ::nbl::hlsl::is_floating_point_v<T> // overload only participates when the trait holds for T
+inline T floor(const T& v)
+{
+    return glm::floor(v); // thin forwarder to GLM's implementation
+}
+
+
 // inverse not defined cause its implemented via hidden friend
 template<typename T, uint16_t N, uint16_t M>
 inline matrix<T,N,M> inverse(const matrix<T,N,M>& m)
diff --git a/include/nbl/builtin/hlsl/type_traits.hlsl b/include/nbl/builtin/hlsl/type_traits.hlsl
@@ -606,6 +606,8 @@ NBL_CONSTEXPR bool is_unsigned_v = is_unsigned<T>::value;
 template<class T>
 NBL_CONSTEXPR bool is_integral_v = is_integral<T>::value;
 template<class T>
+NBL_CONSTEXPR bool is_floating_point_v = is_floating_point<T>::value;
+template<class T>
 NBL_CONSTEXPR bool is_signed_v = is_signed<T>::value;
 template<class T>
 NBL_CONSTEXPR bool is_scalar_v = is_scalar<T>::value;
diff --git a/include/nbl/video/utilities/CComputeBlit.h b/include/nbl/video/utilities/CComputeBlit.h
diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp

Original file line number	Diff line number	Diff line change
`@@ -77,6 +77,7 @@ struct compute_blit_t`
`77`	`77`	`uint16_t localInvocationIndex)`
`78`	`78`	`{`
`79`	`79`	`const float3 halfScale = scale * float3(0.5f, 0.5f, 0.5f);`
	`80`	`+ // bottom of the input tile`
`80`	`81`	`const uint32_t3 minOutputPixel = workGroupID * outputTexelsPerWG;`
`81`	`82`	`const float3 minOutputPixelCenterOfWG = float3(minOutputPixel)*scale + halfScale;`
`82`	`83`	`// this can be negative, in which case HW sampler takes care of wrapping for us`