// Number of elements sorted by a single thread group.
#define SORT_SIZE 512

#if( SORT_SIZE>4096 )
// won't work for arrays>4096
#error due to LDS size SORT_SIZE must be 4096 or smaller
#else
#define ITEMS_PER_GROUP ( SORT_SIZE )
#endif

// Each compare-exchange touches two elements, so HALF_SIZE pairs exist per pass.
#define HALF_SIZE (SORT_SIZE/2)
// When there are more than 1024 pairs, every thread processes several pairs per pass.
#define ITERATIONS (HALF_SIZE > 1024 ? HALF_SIZE/1024 : 1)
#define NUM_THREADS (HALF_SIZE/ITERATIONS)
#define INVERSION (16*2 + 8*3)

//--------------------------------------------------------------------------------------
// Structured Buffers
//--------------------------------------------------------------------------------------
RWSTRUCTUREDBUFFER(counterBuffer, ParticleCounters, 4);
RWSTRUCTUREDBUFFER(indexBuffer, uint, 2);
RWSTRUCTUREDBUFFER(distanceBuffer, float, 6);

// Total number of elements to sort, read from the counter buffer.
#define NumElements counterBuffer[0].aliveCount_afterSimulation

//--------------------------------------------------------------------------------------
// Bitonic Sort Compute Shader
//--------------------------------------------------------------------------------------

// Per-group scratch storage: x holds the sort key read from distanceBuffer,
// y holds the matching indexBuffer entry converted to float.
// NOTE(review): indices above 2^24 cannot be represented exactly as float - confirm
// the element count stays below that.
groupshared float2 g_LDS[SORT_SIZE];

// Sorts one SORT_SIZE-sized slice of distanceBuffer/indexBuffer in place.
//
// Each thread group loads its slice (group Gid.x owns elements
// [Gid.x * SORT_SIZE, Gid.x * SORT_SIZE + SORT_SIZE)) into group-shared memory,
// runs a bitonic sorting network over it keyed on the distance value, and writes
// the reordered distance/index pairs back to the same positions.
// The last group may hold fewer than SORT_SIZE elements; slots beyond the valid
// count are never read or written.
[numthreads(NUM_THREADS, 1, 1)]
void main(uint3 Gid : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID, uint GI : SV_GroupIndex)
{
	const uint groupStart = Gid.x * SORT_SIZE;

	int GlobalBaseIndex = groupStart + GTid.x;
	int LocalBaseIndex = GI;

	// Guard the subtraction: if this group starts at or past the element count the
	// unsigned difference would wrap around and min() would wrongly yield SORT_SIZE.
	const uint totalElements = NumElements;
	uint numElementsInThreadGroup = 0;
	if (totalElements > groupStart)
	{
		numElementsInThreadGroup = min((uint)SORT_SIZE, totalElements - groupStart);
	}

	// Load shared data
	// There are half as many threads as elements, hence 2 * ITERATIONS loads per thread.
	uint i;
	[unroll]
	for (i = 0; i < 2 * ITERATIONS; ++i)
	{
		if (GI + i * NUM_THREADS < numElementsInThreadGroup)
		{
			uint loadIndex = GlobalBaseIndex + i * NUM_THREADS;
			g_LDS[LocalBaseIndex + i * NUM_THREADS] = float2(distanceBuffer[loadIndex], (float)indexBuffer[loadIndex]);
		}
	}
	GroupMemoryBarrierWithGroupSync();

	// Bitonic sort
	// NOTE(review): the source text of the innermost loop was truncated (its header,
	// the tmp_index/index_low/index_high/index computation, the bounds check and the
	// loads of a and b were missing). They are reconstructed here from the surviving
	// fragments and the standard bitonic compare-exchange formulation - verify
	// against the upstream shader.
	for (unsigned int nMergeSize = 2; nMergeSize <= SORT_SIZE; nMergeSize = nMergeSize * 2)
	{
		for (uint nMergeSubSize = nMergeSize >> 1; nMergeSubSize > 0; nMergeSubSize = nMergeSubSize >> 1)
		{
			[unroll]
			for (i = 0; i < ITERATIONS; ++i)
			{
				// Map the pair number onto the lower element of the pair.
				int tmp_index = GI + NUM_THREADS * i;
				int index_low = tmp_index & (nMergeSubSize - 1);
				int index_high = 2 * (tmp_index - index_low);
				int index = index_high + index_low;

				// The first pass of every merge pairs mirrored positions inside the
				// block; subsequent passes pair elements nMergeSubSize apart.
				unsigned int nSwapElem = nMergeSubSize == nMergeSize >> 1 ?
					index_high + (2 * nMergeSubSize - 1) - index_low :
					index_high + nMergeSubSize + index_low;

				// nSwapElem is always greater than index, so checking it alone keeps
				// both accesses inside the valid part of the slice.
				if (nSwapElem < numElementsInThreadGroup)
				{
					float2 a = g_LDS[index];
					float2 b = g_LDS[nSwapElem];

					// Keep the smaller key at the lower position.
					if (a.x > b.x)
					{
						g_LDS[index] = b;
						g_LDS[nSwapElem] = a;
					}
				}
				// Every thread must finish this pass before the next one reads g_LDS.
				GroupMemoryBarrierWithGroupSync();
			}
		}
	}

	// Store shared data
	[unroll]
	for (i = 0; i < 2 * ITERATIONS; ++i)
	{
		if (GI + i * NUM_THREADS < numElementsInThreadGroup)
		{
			uint loadIndex = LocalBaseIndex + i * NUM_THREADS;
			uint storeIndex = GlobalBaseIndex + i * NUM_THREADS;
			distanceBuffer[storeIndex] = g_LDS[loadIndex].x;
			indexBuffer[storeIndex] = (uint)g_LDS[loadIndex].y;
		}
	}
}