| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | #version 460 |
| | #extension GL_EXT_shader_8bit_storage : require |
| | #extension GL_EXT_shader_16bit_storage : require |
| | #extension GL_EXT_shader_explicit_arithmetic_types : require |
| | #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require |
| | #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require |
| | #extension GL_EXT_shader_explicit_arithmetic_types_float32 : require |
| | #extension GL_GOOGLE_include_directive : enable |
| | #extension GL_ARM_tensors : require |
| |
|
| | |
| | #include "typedefs.h" |
| | #include "common.h" |
| |
|
| | |
| |
|
// One pre-process tensor element held as three packed int8x4 words
// (12 signed 8-bit channels). WriteToTensor() fills the words in this order
// and then flattens them into a 12-entry array for the tensor write.
struct TensorElement
{
    // history.rgb followed by input_colour.r
    int8_t4 wh_rgb_col_r;
    // input_colour.gb, disocclusion mask, temporal_feedback.r
    int8_t4 col_gb_dm_fback_r;
    // temporal_feedback.gba followed by the luma derivative
    int8_t4 fback_gba_ld;
};
| |
|
| | |
// Descriptor set 0: read-only inputs. Each comment names the function in this
// file that samples the resource.
layout (set=0, binding=0) uniform mediump sampler2D _ColourTex;                // LoadColour (texelFetch)
layout (set=0, binding=1) uniform highp   sampler2D _DepthTex;                 // FindNearestDepth (texelFetchOffset)
layout (set=0, binding=2) uniform mediump sampler2D _MotionVectorTex;          // LoadMotion (texelFetch)
layout (set=0, binding=3) uniform mediump sampler2D _HistoryTex;               // WarpHistory (textureLod)
layout (set=0, binding=4) uniform lowp    sampler2D _FeedbackTensor;           // WarpFeedback (textureLod)
layout (set=0, binding=5) uniform highp   sampler2D _DepthTm1Tex;              // GatherReconstructedPreviousDepthRQuad (textureGather)
layout (set=0, binding=6) uniform lowp    sampler2D _LumaDerivTm1Tex;          // WarpLumaDerivative (textureLod)
layout (set=0, binding=7) uniform lowp    sampler2D _NearestDepthCoordTm1Tex;  // LoadDepthNearestDepthOffsetTm1 (texelFetch)
| |
|
| | |
// Descriptor set 1: write-only outputs. Binding 2 is not declared in this file.
layout (set=1, binding=0)       uniform writeonly      tensorARM<int8_t, 4> _PreprocessTensor;       // WriteToTensor
layout (set=1, binding=1, rg8)  uniform writeonly lowp image2D              _PreProcessLumaDerivOut; // WriteLumaDerivative
layout (set=1, binding=3, r8)   uniform writeonly lowp image2D              _NearestDepthCoordOut;   // WriteNearestDepthOffset
| |
|
| | |
// Push-constant block. Every member has an explicit byte offset; members are
// grouped below by the stride between consecutive offsets.
layout(push_constant, std430) uniform PushConstants {
    // ---- members placed 16 bytes apart ----
    layout(offset = 0)   float4   _DeviceToViewDepth;   // [0],[1] used by GetViewSpaceDepth; [2],[3] by GetViewSpacePosition
    layout(offset = 16)  float4   _JitterOffset;        // .xy / .zw exposed as _JitterOffsetPix / _JitterOffsetUv
    layout(offset = 32)  float4   _JitterOffsetTm1;     // .xy / .zw exposed as _JitterOffsetTm1Pix / _JitterOffsetTm1Uv
    layout(offset = 48)  float4   _ScaleFactor;         // .xy / .zw exposed as _Scale / _InvScale

    // ---- members placed 8 bytes apart ----
    layout(offset = 64)  int32_t2 _OutputDims;
    layout(offset = 72)  int32_t2 _InputDims;           // image size used for bounds checks and RenderSize()
    layout(offset = 80)  float2   _InvOutputDims;
    layout(offset = 88)  float2   _InvInputDims;        // multiplied with pixel coordinates to obtain UVs
    layout(offset = 96)  half4    _QuantParams;         // .xy input params, .zw feedback params (unless overridden by macro)
    layout(offset = 104) half4    _MotionDisThreshPad;  // .x warp threshold, .y disocclusion threshold, .z disocclusion scale

    // ---- members placed 4 bytes apart ----
    layout(offset = 112) half2    _Exposure;            // accessed through the _Exposure / _InvExposure macros
    layout(offset = 116) half2    _HistoryPad;          // .x exposed as _NotHistoryReset

    // ---- trailing member; not read anywhere in this file ----
    layout(offset = 120) int32_t2 _Padding;
};
| |
|
| | |
// Named views onto the packed push-constant members.
//
// NOTE(review): `_Exposure` is both a push-constant member and a macro. A macro
// is not re-expanded inside its own replacement list, so `_Exposure` correctly
// becomes `_Exposure.x`. `_InvExposure`, however, first becomes `_Exposure.y`
// and is then rescanned into `_Exposure.x.y`. `_InvExposure` is not used in the
// visible part of this file; confirm the expansion before relying on it.
#define _Scale              _ScaleFactor.xy
#define _InvScale           _ScaleFactor.zw
#define _Exposure           _Exposure.x
#define _InvExposure        _Exposure.y
#define _JitterOffsetPix    _JitterOffset.xy
#define _JitterOffsetUv     _JitterOffset.zw
#define _JitterOffsetTm1Pix _JitterOffsetTm1.xy
#define _JitterOffsetTm1Uv  _JitterOffsetTm1.zw
#define _MotionWarpThresh   _MotionDisThreshPad.x
#define _MotionDisThresh    _MotionDisThreshPad.y
#define _DisocclusionScale  _MotionDisThreshPad.z
#define _NotHistoryReset    _HistoryPad.x
| |
|
| | |
| | |
| | |
| |
|
// Quantisation parameter pairs. Each falls back to half of the _QuantParams
// push constant unless the macro was already defined before this point.
#if !defined(_InputQuantParams)
    // Passed to Quantize() in WriteToTensor.
    #define _InputQuantParams _QuantParams.xy
#endif
#if !defined(_FeedbackQuantParams)
    // Passed to Dequantize() in WarpFeedback.
    #define _FeedbackQuantParams _QuantParams.zw
#endif
| |
|
| | |
| |
|
// MAX_DEPTH is 0 when INVERTED_DEPTH is defined and 1 otherwise.
// It is not referenced in the visible part of this file.
#if defined(INVERTED_DEPTH)
    #define MAX_DEPTH 0.f
#else
    #define MAX_DEPTH 1.f
#endif
| |
|
| |
|
| | |
| |
|
// Returns true when `pos` lies inside [0, size) on both axes (for a
// non-negative `size`).
// Both vectors are converted to unsigned first, so a negative coordinate wraps
// to a very large value and fails the same less-than test that rejects
// coordinates past the far edge.
bool IsOnScreen(int32_t2 pos, int32_t2 size)
{
    const uint32_t2 unsigned_pos  = uint32_t2(pos);
    const uint32_t2 unsigned_size = uint32_t2(size);
    return all(lessThan(unsigned_pos, unsigned_size));
}
| |
|
| |
|
// Fetches mip 0 of _MotionVectorTex at `pixel` and returns its red/green
// channels converted to half precision.
half2 LoadMotion(int32_t2 pixel)
{
    const half4 texel = half4(texelFetch(_MotionVectorTex, pixel, 0));
    return texel.rg;
}
| |
|
| |
|
// Fetches the RGB texel of _ColourTex at `pixel` (mip 0), scales it by the
// exposure push constant, then passes it through SafeColour() and Tonemap().
half3 LoadColour(int32_t2 pixel)
{
    const half3 raw_rgb     = half3(texelFetch(_ColourTex, pixel, 0).rgb);
    const half3 exposed_rgb = raw_rgb * _Exposure;
    return Tonemap(SafeColour(exposed_rgb));
}
| |
|
| |
|
// Reads the encoded nearest-depth offset stored in _NearestDepthCoordTm1Tex for
// `pixel` and decodes it with DecodeNearestDepthCoord().
// Returns a zero offset when `pixel` is outside the input image; the texture is
// still read in that case, at the coordinate clamped to the image.
int32_t2 LoadDepthNearestDepthOffsetTm1(int32_t2 pixel)
{
    // 1 on both lanes when `pixel` is inside the image, 0 otherwise.
    const int32_t2 inside_mask = int32_t2(IsOnScreen(pixel, _InputDims));

    const int32_t2 last_texel    = _InputDims - int32_t2(1);
    const int32_t2 clamped_pixel = clamp(pixel, int32_t2(0), last_texel);

    // Scale the normalised texel by 255 and add 0.5 before truncating, which
    // rounds it to the nearest integer code.
    const half    stored = half(texelFetch(_NearestDepthCoordTm1Tex, clamped_pixel, 0).r);
    const int32_t code   = int32_t(stored * 255.0 + 0.5);

    // Multiplying by the mask zeroes the offset for off-screen requests.
    return DecodeNearestDepthCoord(code) * inside_mask;
}
| |
|
// Gathers a 2x2 quad from component 0 of _DepthTm1Tex around `fUV`, after
// shifting the UV by the stored nearest-depth offset for that texel.
//
// `depthQuad` is overwritten and never read (although it is declared inout).
// textureGather returns the footprint as x=(i0,j1), y=(i1,j1), z=(i1,j0),
// w=(i0,j0); the .wzxy swizzle therefore yields (i0,j0), (i1,j0), (i0,j1),
// (i1,j1) in components 0..3.
void GatherReconstructedPreviousDepthRQuad(float2 fUV, inout float4 depthQuad)
{
    const int32_t2 texel      = int32_t2(fUV * _InputDims);
    const float2   shift_uv   = float2(LoadDepthNearestDepthOffsetTm1(texel)) * _InvInputDims;
    const float2   shifted_uv = fUV + shift_uv;

    depthQuad = textureGather(_DepthTm1Tex, shifted_uv, 0).wzxy;
}
| |
|
| |
|
// Samples _HistoryTex at `uv` (LOD 0), scales the RGB by the exposure push
// constant, then applies SafeColour() and Tonemap(), mirroring LoadColour().
half3 WarpHistory(float2 uv)
{
    const half3 history_rgb = half3(textureLod(_HistoryTex, uv, 0).rgb);
    const half3 exposed_rgb = history_rgb * _Exposure;
    return Tonemap(SafeColour(exposed_rgb));
}
| |
|
| |
|
// Samples all four channels of _FeedbackTensor at `uv` (LOD 0) and converts
// them with Dequantize() using _FeedbackQuantParams.
half4 WarpFeedback(float2 uv)
{
    const half4 stored = half4(textureLod(_FeedbackTensor, uv, 0));
    return Dequantize(stored, _FeedbackQuantParams);
}
| |
|
| |
|
// Samples _LumaDerivTm1Tex at `uv` (LOD 0) and returns its red/green channels.
// CalculateLumaDerivative() reads .x as the stored derivative and .y as the
// stored luma.
half2 WarpLumaDerivative(float2 uv)
{
    const half4 texel = half4(textureLod(_LumaDerivTm1Tex, uv, 0));
    return texel.rg;
}
| |
|
| |
|
// Updates the running luma-derivative value for one pixel.
//
//   reproj_uv         UV at which the stored (derivative, luma) pair is sampled
//   jittered_colour   colour whose Luminance() is the new luma
//   disocclusion_mask when above DIS_THRESH the returned derivative is zero
//
// Returns half2(new derivative, new luma), the same channel order that
// WarpLumaDerivative() reads back.
half2 CalculateLumaDerivative(float2 reproj_uv, half3 jittered_colour, half disocclusion_mask)
{
    const half DIS_THRESH      = 0.01HF;  // mask level above which the result is reset
    const half DERIV_MIN       = 0.05HF;  // luma changes below this count as zero
    const half DERIV_MAX       = 0.3HF;   // luma changes are clipped to this
    const half DERIV_POW       = 1.5HF;   // exponent of the response curve
    const half DERIV_ALPHA     = 0.1HF;   // largest blend factor towards the new value
    const half DERIV_MAX_R     = rcp(DERIV_MAX);
    const half DERIV_MAX_POW_R = rcp(pow(DERIV_MAX, DERIV_POW));

    // Stored state: x = derivative, y = luma.
    const half2 stored            = WarpLumaDerivative(reproj_uv);
    const half  stored_derivative = stored.x;
    const half  stored_luma       = stored.y;

    // Absolute luma change against the stored luma.
    const half luma_t     = Luminance(jittered_colour);
    const half luma_delta = abs(luma_t - stored_luma);

    // Clip to DERIV_MAX; step() zeroes the value when the change is below DERIV_MIN.
    const half gated = min(luma_delta, DERIV_MAX) * step(DERIV_MIN, luma_delta);

    // gated^1.5 written as x * sqrt(x), normalised by DERIV_MAX^DERIV_POW.
    const half shaped = gated * sqrt(gated) * DERIV_MAX_POW_R;

    // Blend factor falls from DERIV_ALPHA to a tenth of it as the stored
    // derivative rises from 0 to DERIV_MAX.
    const half stored_norm = clamp(stored_derivative, 0.HF, DERIV_MAX) * DERIV_MAX_R;
    const half blend       = mix(DERIV_ALPHA,
                                 DERIV_ALPHA * 0.1HF,
                                 stored_norm);

    const half blended = mix(stored_derivative, shaped, blend);

    // step(mask, DIS_THRESH) is 0 when DIS_THRESH < mask and 1 otherwise.
    const half keep = step(disocclusion_mask, DIS_THRESH);

    return half2(blended * keep, luma_t);
}
| |
|
| |
|
// Searches the 3x3 neighbourhood of `iPxPos` in _DepthTex for the nearest
// depth value.
//
//   iPxPos               centre pixel
//   iPxSize              image size used to reject off-image neighbours
//   fNearestDepth        (out) selected depth value
//   fNearestDepthOffset  (out) integer offset from iPxPos to the selected texel
//
// "Nearest" means the largest value when INVERTED_DEPTH is defined and the
// smallest otherwise. The centre texel is the initial candidate and is not
// bounds-checked. All nine texels are fetched up front, but values from
// off-image neighbours are never compared.
void FindNearestDepth(int32_t2 iPxPos, int32_t2 iPxSize, out float fNearestDepth, out int32_t2 fNearestDepthOffset)
{
    const int32_t kTapCount = 9;

    // Tap offsets; every entry is written with a .yx swizzle, i.e. with its
    // two literals swapped.
    const int32_t2 kTapOffsets[kTapCount] = {
        int32_t2(+0, +0).yx,
        int32_t2(+1, +0).yx,
        int32_t2(+0, +1).yx,
        int32_t2(+0, -1).yx,
        int32_t2(-1, +0).yx,
        int32_t2(-1, +1).yx,
        int32_t2(+1, +1).yx,
        int32_t2(-1, -1).yx,
        int32_t2(+1, -1).yx,
    };

    // The fetches are spelled out because texelFetchOffset needs a constant
    // offset; the order matches kTapOffsets.
    float fTapDepth[kTapCount];
    fTapDepth[0] = float(texelFetchOffset(_DepthTex, iPxPos, 0, int32_t2(+0, +0).yx).r);
    fTapDepth[1] = float(texelFetchOffset(_DepthTex, iPxPos, 0, int32_t2(+1, +0).yx).r);
    fTapDepth[2] = float(texelFetchOffset(_DepthTex, iPxPos, 0, int32_t2(+0, +1).yx).r);
    fTapDepth[3] = float(texelFetchOffset(_DepthTex, iPxPos, 0, int32_t2(+0, -1).yx).r);
    fTapDepth[4] = float(texelFetchOffset(_DepthTex, iPxPos, 0, int32_t2(-1, +0).yx).r);
    fTapDepth[5] = float(texelFetchOffset(_DepthTex, iPxPos, 0, int32_t2(-1, +1).yx).r);
    fTapDepth[6] = float(texelFetchOffset(_DepthTex, iPxPos, 0, int32_t2(+1, +1).yx).r);
    fTapDepth[7] = float(texelFetchOffset(_DepthTex, iPxPos, 0, int32_t2(-1, -1).yx).r);
    fTapDepth[8] = float(texelFetchOffset(_DepthTex, iPxPos, 0, int32_t2(+1, -1).yx).r);

    // Start from the centre tap, then let each on-image neighbour replace it.
    fNearestDepth       = fTapDepth[0];
    fNearestDepthOffset = kTapOffsets[0];
#pragma unroll
    for (int32_t iTap = 1; iTap < kTapCount; ++iTap) {
        const int32_t2 iTapOffset = kTapOffsets[iTap];
        if (IsOnScreen(iPxPos + iTapOffset, iPxSize)) {
            const float fCandidate = fTapDepth[iTap];
#ifdef INVERTED_DEPTH
            const bool bIsNearer = fCandidate > fNearestDepth;
#else
            const bool bIsNearer = fCandidate < fNearestDepth;
#endif
            if (bIsNearer) {
                fNearestDepth       = fCandidate;
                fNearestDepthOffset = iTapOffset;
            }
        }
    }
}
| |
|
| |
|
// Size used as the render resolution throughout this file: the _InputDims
// push constant.
int32_t2 RenderSize()
{
    return _InputDims;
}
| |
|
| |
|
// Maps a pixel position to the [-1, 1] range on both axes.
// Position and size are both swizzled .yx before the division, so the returned
// x is derived from fPxPos.y / iSize.y and the returned y from
// fPxPos.x / iSize.x. The second returned component is negated (scale -2,
// bias +1).
float2 ComputeNdc(float2 fPxPos, int32_t2 iSize)
{
    const float2 fNormalised = fPxPos.yx / float2(iSize.yx);
    return fNormalised * float2(2.0f, -2.0f) + float2(-1.0f, 1.0f);
}
| |
|
| |
|
// Converts a device depth value to view-space depth using the first two
// components of _DeviceToViewDepth:
//     view = _DeviceToViewDepth[1] / (device - _DeviceToViewDepth[0])
// There is no guard against a zero denominator.
float GetViewSpaceDepth(float fDeviceDepth)
{
    const float fBias  = _DeviceToViewDepth[0];
    const float fScale = _DeviceToViewDepth[1];
    return (fScale / (fDeviceDepth - fBias));
}
| |
|
| |
|
// Builds a view-space position from a viewport pixel and a device depth.
//
//   iViewportPos   pixel position inside the viewport
//   iViewportSize  viewport size in pixels
//   fDeviceDepth   device depth at that pixel
//
// Z is GetViewSpaceDepth(fDeviceDepth); X and Y are the ComputeNdc() result
// scaled by _DeviceToViewDepth[2] / [3] respectively and by Z.
float3 GetViewSpacePosition(int32_t2 iViewportPos, int32_t2 iViewportSize, float fDeviceDepth)
{
    const float  fViewZ = GetViewSpaceDepth(fDeviceDepth);
    const float2 fNdc   = ComputeNdc(iViewportPos, iViewportSize);

    const float fViewX = _DeviceToViewDepth[2] * fNdc.x * fViewZ;
    const float fViewY = _DeviceToViewDepth[3] * fNdc.y * fViewZ;

    return float3(fViewX, fViewY, fViewZ);
}
| |
|
| |
|
// Description of a 2x2 bilinear footprint, filled in by
// GetBilinearSamplingData().
struct BilinearSamplingData
{
    int32_t2 iOffsets[4];    // integer offset of each tap relative to iBasePos
    float    fWeights[4];    // bilinear weight of each tap
    int32_t2 iBasePos;       // floor of the sample position in texel space
    float2   fQuadCenterUv;  // (sample position + 0.5) divided by the image size
};
| |
|
| |
|
// Computes the 2x2 bilinear footprint for sampling an image of `iSize` texels
// at `fUv`.
//
// The sample position is moved by half a texel so that its floor is the base
// tap of the quad and its fractional part drives the four weights.
//
// NOTE(review): iOffsets[1] is (0,1) and iOffsets[2] is (1,0), whereas
// fWeights[1] is the weight for the (+x) tap and fWeights[2] for the (+y) tap.
// Offsets 1 and 2 are therefore in the opposite order to the weights.
// ComputeDepthClip() uses the offsets only for its on-screen test. Confirm this
// is intended before changing either side.
BilinearSamplingData GetBilinearSamplingData(float2 fUv, int32_t2 iSize)
{
    BilinearSamplingData data;

    const float2 fTexelPos = (fUv * iSize) - float2(0.5f, 0.5f);
    const float2 fFrac     = fract(fTexelPos);

    data.iBasePos      = int32_t2(floor(fTexelPos));
    data.fQuadCenterUv = (fTexelPos + 0.5f) / float2(iSize);

    data.iOffsets[0] = int32_t2(0, 0);
    data.iOffsets[1] = int32_t2(0, 1);
    data.iOffsets[2] = int32_t2(1, 0);
    data.iOffsets[3] = int32_t2(1, 1);

    data.fWeights[0] = (1.f - fFrac.x) * (1.f - fFrac.y);
    data.fWeights[1] = (fFrac.x)       * (1.f - fFrac.y);
    data.fWeights[2] = (1.f - fFrac.x) * (fFrac.y);
    data.fWeights[3] = (fFrac.x)       * (fFrac.y);

    return data;
}
| |
|
| |
|
// Compares the current depth with a 2x2 quad of depth values gathered from
// _DepthTm1Tex and returns a value in [0, 1]; main() stores it as the
// disocclusion mask.
//
//   fUvSample            UV at which the stored depth is examined
//   fCurrentDepthSample  device depth of the current pixel
//
// For each tap of the bilinear footprint:
//   * an off-screen tap adds its weight to the weight sum but nothing to the
//     depth term, which pushes the result towards 1;
//   * an on-screen tap is ignored unless its weight exceeds 0.1 and the
//     current view-space depth is greater than the stored one; it then adds
//     pow(saturate(required separation / depth difference), power) * weight.
// The result is saturate(1 - depth term / weight sum), or 0 when no weight was
// accumulated.
float ComputeDepthClip(float2 fUvSample, float fCurrentDepthSample)
{
    const float fReconstructedDepthBilinearWeightThreshold = 0.1f;
    const float Ksep = 1.37e-05f;

    const int32_t2 iRenderSize = RenderSize();
    const float fCurrentDepthViewSpace = GetViewSpaceDepth(fCurrentDepthSample);
    BilinearSamplingData bilinearInfo = GetBilinearSamplingData(fUvSample, iRenderSize);

    // These depend only on the render size, so they are computed once instead
    // of once per contributing tap. (The name fHalfViewportWidth is historical:
    // the value is the length of the render-size diagonal.)
    const float fHalfViewportWidth = length(float2(iRenderSize));
    const float fResolutionFactor  = saturate(fHalfViewportWidth / length(float2(1920.0f, 1080.0f)));
    const float fPower             = lerp(1.0f, 3.0f, fResolutionFactor);

    float fDepth = 0.0f;
    float fWeightSum = 0.0f;

    // The gather helper takes this as `inout`, so the value is copied in at the
    // call. Initialise it so that no undefined value is read.
    float4 fPrevDepthSamples = float4(0.0f);
    GatherReconstructedPreviousDepthRQuad(bilinearInfo.fQuadCenterUv, fPrevDepthSamples);

    for (int32_t iSampleIndex = 0; iSampleIndex < 4; iSampleIndex++)
    {
        const int32_t2 iSamplePos = bilinearInfo.iBasePos + bilinearInfo.iOffsets[iSampleIndex];
        const float fWeight = bilinearInfo.fWeights[iSampleIndex];

        if (!IsOnScreen(iSamplePos, iRenderSize))
        {
            // Off-screen tap: weight only, no depth contribution.
            fWeightSum += fWeight;
            continue;
        }

        if (!(fWeight > fReconstructedDepthBilinearWeightThreshold))
        {
            continue;
        }

        const float fPrevDepthSample = fPrevDepthSamples[iSampleIndex];
        const float fPrevNearestDepthViewSpace = GetViewSpaceDepth(fPrevDepthSample);
        const float fDepthDiff = fCurrentDepthViewSpace - fPrevNearestDepthViewSpace;

        if (fDepthDiff > 0.0f)
        {
            // min under inverted depth, max otherwise.
#ifdef INVERTED_DEPTH
            const float fPlaneDepth = min(fPrevDepthSample, fCurrentDepthSample);
#else
            const float fPlaneDepth = max(fPrevDepthSample, fCurrentDepthSample);
#endif

            const float3 fCenter = GetViewSpacePosition(int32_t2(iRenderSize * 0.5f), iRenderSize, fPlaneDepth);
            const float3 fCorner = GetViewSpacePosition(int32_t2(0, 0), iRenderSize, fPlaneDepth);

            const float fDepthThreshold = max(fCurrentDepthViewSpace, fPrevNearestDepthViewSpace);

            const float Kfov = length(fCorner) / length(fCenter);
            const float fRequiredDepthSeparation = Ksep * Kfov * fHalfViewportWidth * fDepthThreshold;

            fDepth += pow(saturate(float(fRequiredDepthSeparation / fDepthDiff)), fPower) * fWeight;
            fWeightSum += fWeight;
        }
    }

    return (fWeightSum > 0.0f) ? saturate(1.0f - fDepth / fWeightSum) : 0.0f;
}
| |
|
| |
|
// Stores `derivative` into the first two channels of _PreProcessLumaDerivOut at
// `pixel`. The remaining two components of the stored vector are 0 and 1.
void WriteLumaDerivative(int32_t2 pixel, half2 derivative)
{
    const half4 texel = half4(derivative, half2(0.f, 1.f));
    imageStore(_PreProcessLumaDerivOut, pixel, texel);
}
| |
|
| |
|
// Stores the 8-bit encoded offset into _NearestDepthCoordOut at `pixel`.
// The code is divided by 255 before the store; LoadDepthNearestDepthOffsetTm1()
// reverses this by multiplying by 255 and rounding.
void WriteNearestDepthOffset(int32_t2 pixel, uint8_t offset)
{
    const half  normalised = half(offset) / 255.HF;
    const half4 texel      = half4(normalised, 0.HF, 0.HF, 1.HF);
    imageStore(_NearestDepthCoordOut, pixel, texel);
}
| |
|
| |
|
// Quantises the twelve per-pixel channels with _InputQuantParams and writes
// them to _PreprocessTensor at coordinate (0, outputPixel.y, outputPixel.x, 0).
//
// Channel order in the written array:
//   0-2   history.rgb
//   3-5   input_colour.rgb
//   6     disocclusion_mask
//   7-10  temporal_feedback.rgba
//   11    luma_derivative
void WriteToTensor(int32_t2 outputPixel, half3 input_colour, half3 history, half disocclusion_mask, half luma_derivative, half4 temporal_feedback)
{
    // Quantise four channels at a time, in the order the struct declares them.
    TensorElement te = TensorElement(
        Quantize(half4(history.rgb, input_colour.r), _InputQuantParams),
        Quantize(half4(input_colour.gb, disocclusion_mask, temporal_feedback.r), _InputQuantParams),
        Quantize(half4(temporal_feedback.gba, luma_derivative), _InputQuantParams));

    // Flatten the three packed words into the array handed to the tensor write.
    int8_t t0[12] = int8_t[12](
        te.wh_rgb_col_r.x,      te.wh_rgb_col_r.y,      te.wh_rgb_col_r.z,      te.wh_rgb_col_r.w,
        te.col_gb_dm_fback_r.x, te.col_gb_dm_fback_r.y, te.col_gb_dm_fback_r.z, te.col_gb_dm_fback_r.w,
        te.fback_gba_ld.x,      te.fback_gba_ld.y,      te.fback_gba_ld.z,      te.fback_gba_ld.w);

    tensorWriteARM(_PreprocessTensor, uint[](0, outputPixel.y, outputPixel.x, 0), t0);
}
| |
|
| |
|
| | |
// Entry point: one invocation per input pixel, in 16x16 workgroups.
// Invocations outside _InputDims return before any read or write.
layout(local_size_x = 16, local_size_y = 16) in;
void main()
{
    const int32_t2 px = int32_t2(gl_GlobalInvocationID.xy);
    if (any(greaterThanEqual(px, _InputDims))) {
        return;
    }

    // UV of the pixel centre.
    const float2 uv = (float2(px) + 0.5f) * _InvInputDims;

    // 1. Nearest depth in the 3x3 neighbourhood, plus the offset to that texel.
    float    nearest_depth  = float(0.f);
    int32_t2 nearest_offset = int32_t2(0);
    FindNearestDepth(px, RenderSize(), nearest_depth, nearest_offset);

    // 2. Motion is read at the nearest-depth texel. It is zeroed unless its
    //    squared length in pixels exceeds _MotionWarpThresh.
    const half2 fetched_motion = LoadMotion(px + nearest_offset);
    const half2 motion_px      = fetched_motion * half2(RenderSize());
    const half  motion_px_sq   = dot(motion_px, motion_px);
    const half2 motion         = fetched_motion * half(motion_px_sq > _MotionWarpThresh);

    // 3. Reprojected UV, and the same UV with _JitterOffsetTm1Uv subtracted.
    const float2 reproj_uv       = uv - float2(motion);
    const float2 unjitter_tm1_uv = reproj_uv - _JitterOffsetTm1Uv;

    // 4. Depth-based mask. It is scaled by _DisocclusionScale unless the
    //    squared pixel motion exceeds _MotionDisThresh.
    const half depth_clip = half(ComputeDepthClip(unjitter_tm1_uv, nearest_depth));
    const half mask_scale = (motion_px_sq > _MotionDisThresh) ? half(1.0f) : _DisocclusionScale;
    const half disocclusion_mask = depth_clip * mask_scale;

    // 5. Stored frame data sampled at the reprojected UV, and the current colour.
    const half3 warped_history  = WarpHistory(reproj_uv);
    const half3 jittered_colour = LoadColour(px);

    // 6. Luma derivative: .x is the derivative, .y the new luma.
    const half2 luma_derivative = CalculateLumaDerivative(reproj_uv, jittered_colour, disocclusion_mask);

    // 7. Feedback channels sampled at the reprojected UV.
    const half4 temporal_feedback = WarpFeedback(reproj_uv);

    // 8. Encode the nearest-depth offset as an 8-bit code.
    const uint8_t encoded_offset = EncodeNearestDepthCoord(nearest_offset);

    // 9. Outputs: the tensor element, then the two side images.
    WriteToTensor(
        px,
        jittered_colour,
        warped_history,
        disocclusion_mask,
        luma_derivative.x,
        temporal_feedback
    );
    WriteNearestDepthOffset(px, encoded_offset);
    WriteLumaDerivative(px, luma_derivative);
}
| |
|