// GaussianDownsample.compute
  1. //
  2. // This is a modified version of the BlurCS compute shader from Microsoft's MiniEngine
  3. // library. The copyright notice from the original version is included below.
  4. //
  5. // The original source code of MiniEngine is available on GitHub.
  6. // https://github.com/Microsoft/DirectX-Graphics-Samples
  7. //
  8. //
  9. // Copyright (c) Microsoft. All rights reserved.
  10. // This code is licensed under the MIT License (MIT).
  11. // THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
  12. // ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
  13. // IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
  14. // PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
  15. //
  16. // Developed by Minigraph
  17. //
  18. // Author: Bob Brown
  19. //
  20. #pragma warning(disable : 3568)
  21. #pragma exclude_renderers gles gles3 d3d11_9x
  22. #include "../StdLib.hlsl"
  // Input texture. KMain reads it through SampleLevel at mip level 0.
  Texture2D<float4> _Source;
  // Output texture. BlurVertically writes one blurred pixel per thread into it.
  RWTexture2D<float4> _Result;
  // Sampler used for every read of _Source.
  // NOTE(review): presumably bilinear filtering with clamped addressing, per the name - confirm.
  SamplerState sampler_LinearClamp;
  CBUFFER_START(cb)
  // Only .zw is used in this file: KMain multiplies pixel-center coordinates by it to obtain UVs.
  // NOTE(review): presumably .zw = 1 / output size and .xy = output size - confirm against the caller.
  float4 _Size;
  CBUFFER_END
  // Group-shared cache for one thread group. It is used in two layouts:
  //  - After the downsample step it holds a 16x16 pixel tile (the 8x8 block the group writes
  //    out plus a 4-pixel border on each side), stored as 16 rows of 8 uints. Each uint packs
  //    the same channel of two horizontally adjacent pixels as two 16-bit floats.
  //  - After the horizontal blur it holds 16 rows of 8 pixels, one 32-bit float per uint.
  // The reason for separating channels is to reduce bank conflicts in the local data memory
  // controller. A large stride will cause more threads to collide on the same memory bank.
  groupshared uint gs_cacheR[128];
  groupshared uint gs_cacheG[128];
  groupshared uint gs_cacheB[128];
  groupshared uint gs_cacheA[128];
  // Applies a symmetric 9-tap blur to nine consecutive samples a..i, with e at the center.
  // The weights are the binomial coefficients 70, 56, 28, 8 and 1 divided by 256, so the
  // kernel sums to one.
  float4 BlurPixels(float4 a, float4 b, float4 c, float4 d, float4 e, float4 f, float4 g, float4 h, float4 i)
  {
      const float weightCenter = 0.27343750; // 70 / 256
      const float weightTap1   = 0.21875000; // 56 / 256
      const float weightTap2   = 0.10937500; // 28 / 256
      const float weightTap3   = 0.03125000; //  8 / 256
      const float weightTap4   = 0.00390625; //  1 / 256

      // Accumulate from the center outwards, pairing the taps that share a weight.
      float4 sum = weightCenter * (e);
      sum = sum + weightTap1 * (d + f);
      sum = sum + weightTap2 * (c + g);
      sum = sum + weightTap3 * (b + h);
      sum = sum + weightTap4 * (a + i);
      return sum;
  }
  // Packs two pixels into one cache slot: for each channel, pixel1 is converted to a 16-bit
  // float and placed in the low half of the uint, pixel2 in the high half.
  void Store2Pixels(uint index, float4 pixel1, float4 pixel2)
  {
      const uint lowR = f32tof16(pixel1.r);
      const uint lowG = f32tof16(pixel1.g);
      const uint lowB = f32tof16(pixel1.b);
      const uint lowA = f32tof16(pixel1.a);

      const uint highR = f32tof16(pixel2.r);
      const uint highG = f32tof16(pixel2.g);
      const uint highB = f32tof16(pixel2.b);
      const uint highA = f32tof16(pixel2.a);

      gs_cacheR[index] = lowR | (highR << 16);
      gs_cacheG[index] = lowG | (highG << 16);
      gs_cacheB[index] = lowB | (highB << 16);
      gs_cacheA[index] = lowA | (highA << 16);
  }
  // Reads one packed cache slot and expands it back into two pixels: pixel1 comes from the
  // low 16 bits of each channel's uint, pixel2 from the high 16 bits.
  void Load2Pixels(uint index, out float4 pixel1, out float4 pixel2)
  {
      const uint packedR = gs_cacheR[index];
      const uint packedG = gs_cacheG[index];
      const uint packedB = gs_cacheB[index];
      const uint packedA = gs_cacheA[index];

      pixel1.r = f16tof32(packedR);
      pixel1.g = f16tof32(packedG);
      pixel1.b = f16tof32(packedB);
      pixel1.a = f16tof32(packedA);

      pixel2.r = f16tof32(packedR >> 16);
      pixel2.g = f16tof32(packedG >> 16);
      pixel2.b = f16tof32(packedB >> 16);
      pixel2.a = f16tof32(packedA >> 16);
  }
  // Writes a single pixel to the cache at full 32-bit precision by storing the raw bit
  // pattern of each channel in the matching per-channel array.
  void Store1Pixel(uint index, float4 pixel)
  {
      const uint4 bits = asuint(pixel);
      gs_cacheR[index] = bits.r;
      gs_cacheG[index] = bits.g;
      gs_cacheB[index] = bits.b;
      gs_cacheA[index] = bits.a;
  }
  // Reads a single full-precision pixel from the cache, reinterpreting the stored bit
  // pattern of each channel as a float.
  void Load1Pixel(uint index, out float4 pixel)
  {
      pixel.r = asfloat(gs_cacheR[index]);
      pixel.g = asfloat(gs_cacheG[index]);
      pixel.b = asfloat(gs_cacheB[index]);
      pixel.a = asfloat(gs_cacheA[index]);
  }
  // Blur two pixels horizontally. This reduces LDS reads and pixel unpacking.
  //
  // outIndex      - cache slot that receives the first blurred pixel; the second one goes
  //                 to outIndex + 1. Both are stored at full precision (one pixel per slot).
  // leftMostIndex - first of the five packed cache slots (ten pixels) that are read.
  //
  // The results are written into the same groupshared arrays the inputs are read from, and
  // the slots one thread writes overlap the slots its neighbours in the same row read (KMain
  // hands thread x the inputs [x, x + 4] and the outputs [2x, 2x + 1] of a row). Without a
  // barrier this only works when those threads happen to execute in lockstep, so all loads
  // are separated from all stores by a group barrier.
  //
  // Because it contains a group barrier, this function must be called by every thread of
  // the group from uniform control flow.
  void BlurHorizontally(uint outIndex, uint leftMostIndex)
  {
      float4 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9;
      Load2Pixels(leftMostIndex + 0, s0, s1);
      Load2Pixels(leftMostIndex + 1, s2, s3);
      Load2Pixels(leftMostIndex + 2, s4, s5);
      Load2Pixels(leftMostIndex + 3, s6, s7);
      Load2Pixels(leftMostIndex + 4, s8, s9);

      // Two adjacent outputs share eight of their nine taps.
      float4 blurred0 = BlurPixels(s0, s1, s2, s3, s4, s5, s6, s7, s8);
      float4 blurred1 = BlurPixels(s1, s2, s3, s4, s5, s6, s7, s8, s9);

      // Every thread must have finished reading its packed inputs before any thread
      // overwrites the cache with unpacked results.
      GroupMemoryBarrierWithGroupSync();

      Store1Pixel(outIndex    , blurred0);
      Store1Pixel(outIndex + 1, blurred1);
  }
  // Blurs one column of nine cached pixels and writes the result to the output texture.
  //
  // pixelCoord   - destination texel in _Result.
  // topMostIndex - cache slot of the first (top) tap; the following taps are one cache row
  //                (8 slots) apart.
  void BlurVertically(uint2 pixelCoord, uint topMostIndex)
  {
      float4 taps[9];

      [unroll]
      for (uint tap = 0u; tap < 9u; ++tap)
      {
          // Step down one cache row per tap.
          Load1Pixel(topMostIndex + tap * 8u, taps[tap]);
      }

      // Write to the final target
      _Result[pixelCoord] = BlurPixels(taps[0], taps[1], taps[2], taps[3], taps[4], taps[5], taps[6], taps[7], taps[8]);
  }
  #pragma kernel KMain
  #ifdef DISABLE_COMPUTE_SHADERS
  TRIVIAL_COMPUTE_KERNEL(KMain)
  #else
  // Kernel entry point. Each 8x8 thread group produces an 8x8 block of _Result:
  //   1. every thread samples a 2x2 quad of _Source, filling a 16x16 tile in the cache,
  //   2. the tile is blurred horizontally in place,
  //   3. the tile is blurred vertically and one pixel per thread is written out.
  [numthreads(8, 8, 1)]
  void KMain(uint2 groupId : SV_GroupID, uint2 groupThreadId : SV_GroupThreadID, uint2 dispatchThreadId : SV_DispatchThreadID)
  {
      // Top-left pixel of the 2x2 quad this thread fetches. The group's 16x16 tile starts
      // 4 pixels above and to the left of its 8x8 output block.
      int2 quadOrigin = (groupThreadId * 2u) + (groupId * 8u) - 4;
      float2 quadBase = float2(quadOrigin);

      // UVs of the four pixel centers of the quad.
      float2 uvTopLeft     = (quadBase + 0.5) * _Size.zw;
      float2 uvTopRight    = (quadBase + float2(1.0, 0.0) + 0.5) * _Size.zw;
      float2 uvBottomLeft  = (quadBase + float2(0.0, 1.0) + 0.5) * _Size.zw;
      float2 uvBottomRight = (quadBase + float2(1.0, 1.0) + 0.5) * _Size.zw;

      // Downsample the block
      float4 topLeft     = _Source.SampleLevel(sampler_LinearClamp, uvTopLeft, 0.0);
      float4 topRight    = _Source.SampleLevel(sampler_LinearClamp, uvTopRight, 0.0);
      float4 bottomLeft  = _Source.SampleLevel(sampler_LinearClamp, uvBottomLeft, 0.0);
      float4 bottomRight = _Source.SampleLevel(sampler_LinearClamp, uvBottomRight, 0.0);

      // Each row of threads owns two consecutive 8-slot rows of the cache.
      uint rowPairStart = groupThreadId.y * 16u;

      // Store the 4 downsampled pixels in LDS: the upper pair in the first row, the lower
      // pair in the row below it.
      uint packedSlot = groupThreadId.x + rowPairStart;
      Store2Pixels(packedSlot, topLeft, topRight);
      Store2Pixels(packedSlot + 8u, bottomLeft, bottomRight);

      GroupMemoryBarrierWithGroupSync();

      // Horizontally blur the pixels in LDS. Threads 0-3 handle the first row of the pair,
      // threads 4-7 the second one (the "& 4u" term skips ahead to that row's inputs).
      uint horizontalOut = rowPairStart + groupThreadId.x * 2u;
      uint horizontalIn  = rowPairStart + groupThreadId.x + (groupThreadId.x & 4u);
      BlurHorizontally(horizontalOut, horizontalIn);

      GroupMemoryBarrierWithGroupSync();

      // Vertically blur the pixels in LDS and write the result to memory
      BlurVertically(dispatchThreadId, groupThreadId.y * 8u + groupThreadId.x);
  }
  #endif // DISABLE_COMPUTE_SHADERS