if (SORTING)
{
    device->EventBegin("SortEmittedParticles", threadID);

    // Initialize sorting arguments:
    device->BindCS(kickoffSortCS, threadID);
    device->Dispatch(1, 1, 1, threadID);

    // Initial sorting:
    // Calculate how many thread groups would be required. We sort 512 elements
    // per thread group (thread group size 256).
    // Maybe need to optimize this or make it changeable during init;
    // TGS=256 is a good intermediate value.
    const unsigned int initialThreadGroups = ((MAX_PARTICLES - 1) >> 9) + 1;

    // If everything fits into a single 512-element group, the initial pass
    // alone is enough and the merge loop below is skipped.
    bool bDone = (initialThreadGroups <= 1);

    // Sort all buffers of size 512 (and presort bigger ones):
    device->BindCS(sortCS, threadID);
    device->DispatchIndirect(indirectBuffers, ARGUMENTBUFFER_OFFSET_DISPATCHSORT, threadID);

    // Number of elements known to be sorted within each run so far.
    int presorted = 512;

    while (!bDone)
    {
        bDone = true;

        device->BindCS(sortStepCS, threadID);

        // Prepare thread group description data:
        unsigned int numThreadGroups = 0;

        if (MAX_PARTICLES > (uint32_t)presorted)
        {
            // Another pass will be needed after this one if doubling the
            // presorted run length still does not cover every particle.
            if (MAX_PARTICLES > (uint32_t)presorted * 2)
            {
                bDone = false;
            }

            // NOTE(review): this span was garbled in the source
            // ("while (pow2> 9; }"); reconstructed as rounding up to the next
            // power of two >= MAX_PARTICLES and dispatching one group per 512
            // elements -- confirm against the original file.
            unsigned int pow2 = presorted;
            while (pow2 < MAX_PARTICLES)
            {
                pow2 *= 2;
            }
            numThreadGroups = pow2 >> 9;
        }

        const unsigned int nMergeSize = presorted * 2;

        // Step passes for sub-sizes above 256; the remaining smaller sub-sizes
        // are left to the sortInnerCS dispatch below.
        // Alternative that would step all the way down:
        // for( int nMergeSubSize=nMergeSize>>1; nMergeSubSize>0; nMergeSubSize=nMergeSubSize>>1 )
        for (unsigned int nMergeSubSize = nMergeSize >> 1; nMergeSubSize > 256; nMergeSubSize = nMergeSubSize >> 1)
        {
            SortConstants sc;
            sc.job_params.x = nMergeSubSize;
            if (nMergeSubSize == nMergeSize >> 1)
            {
                // First step of this merge size uses a different y/z setup
                // than the subsequent steps.
                sc.job_params.y = (2 * nMergeSubSize - 1);
                sc.job_params.z = -1;
            }
            else
            {
                sc.job_params.y = nMergeSubSize;
                sc.job_params.z = 1;
            }
            sc.job_params.w = 0;

            device->UpdateBuffer(sortCB, &sc, threadID);
            device->Dispatch(numThreadGroups, 1, 1, threadID);
        }

        device->BindCS(sortInnerCS, threadID);
        device->Dispatch(numThreadGroups, 1, 1, threadID);

        presorted *= 2;
    }