Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 148 additions & 0 deletions source/EngineGpuKernels/EditKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,154 @@ __global__ void cudaRolloutSelectionStep(SimulationData data, int* result)
}
}

__global__ void cudaInitUnwrapSelection(SimulationData data)
{
auto const cellPartition = calcAllThreadsPartition(data.objects.cells.getNumEntries());

for (int index = cellPartition.startIndex; index <= cellPartition.endIndex; ++index) {
auto const& cell = data.objects.cells.at(index);

// Initialize all selected cells for cluster-based unwrapping
// clusterIndex: used to identify connected components (initialized to self)
// tempValue.as_uint64: 0 = not yet unwrapped, 1 = unwrapped
// shared1: will hold the unwrapped position
// shared2: stores (distance, nearestCellIndex) for cluster heads
if (cell->selected == 1 || cell->selected == 2) {
cell->clusterIndex = index;
cell->tempValue.as_uint64 = 0; // Not unwrapped yet
cell->shared1 = cell->pos;
// Use reinterpret_cast to store uint64 in shared2 (two floats = 8 bytes = 64 bits)
*reinterpret_cast<unsigned long long int*>(&cell->shared2) = 0xffffffffffffffffull; // Max distance initially
}
}
}

__global__ void cudaFindUnwrapClusters(SimulationData data, int* result)
{
auto const cellPartition = calcAllThreadsPartition(data.objects.cells.getNumEntries());

for (int index = cellPartition.startIndex; index <= cellPartition.endIndex; ++index) {
auto currentCell = data.objects.cells.at(index);

// Only process selected cells
if (currentCell->selected != 1 && currentCell->selected != 2) {
continue;
}

// Propagate minimum clusterIndex through connected selected cells (similar to ClusterProcessor)
for (int i = 0; i < 30; ++i) {
bool found = false;
for (int j = 0; j < currentCell->numConnections; ++j) {
auto candidateCell = currentCell->connections[j].cell;
// Only consider connected selected cells
if (candidateCell->selected != 1 && candidateCell->selected != 2) {
continue;
}
auto cellTag = currentCell->clusterIndex;
auto origTag = atomicMin(&candidateCell->clusterIndex, cellTag);
if (cellTag < origTag) {
currentCell = candidateCell;
found = true;
atomicExch(result, 1);
break;
}
}
if (!found) {
break;
}
}
}
}

__global__ void cudaFindNearestInCluster(SimulationData data, float2 refPos)
{
auto const cellPartition = calcAllThreadsPartition(data.objects.cells.getNumEntries());

for (int index = cellPartition.startIndex; index <= cellPartition.endIndex; ++index) {
auto const& cell = data.objects.cells.at(index);

// Only process selected cells
if (cell->selected != 1 && cell->selected != 2) {
continue;
}

// Calculate distance to refPos (using periodic boundary)
auto distance = data.cellMap.getDistance(refPos, cell->pos);

// Encode distance and index into 64-bit value (upper 32 bits: distance * 1000, lower 32 bits: index)
auto distanceInt = static_cast<unsigned long long int>(distance * 1000.0f);
auto encodedValue = (distanceInt << 32) | static_cast<unsigned long long int>(index);

// Get the cluster head cell and atomically update if this cell is closer
// We store the encoded value in shared2 (reinterpreted as uint64)
auto clusterHead = data.objects.cells.at(cell->clusterIndex);
atomicMin(reinterpret_cast<unsigned long long int*>(&clusterHead->shared2), encodedValue);
}
}

__global__ void cudaMarkUnwrapStartCells(SimulationData data, float2 refPos)
{
auto const cellPartition = calcAllThreadsPartition(data.objects.cells.getNumEntries());

for (int index = cellPartition.startIndex; index <= cellPartition.endIndex; ++index) {
auto const& cell = data.objects.cells.at(index);

// Only process selected cells that are cluster heads (clusterIndex == self)
if ((cell->selected != 1 && cell->selected != 2) || cell->clusterIndex != static_cast<uint32_t>(index)) {
continue;
}

// This is a cluster head - get the nearest cell index from shared2
auto encodedValue = *reinterpret_cast<unsigned long long int*>(&cell->shared2);
auto nearestCellIndex = static_cast<int>(encodedValue & 0xffffffff);

// Mark the nearest cell as unwrapped with position relative to refPos
auto nearestCell = data.objects.cells.at(nearestCellIndex);
nearestCell->tempValue.as_uint64 = 1; // Mark as unwrapped
nearestCell->shared1 = nearestCell->pos + data.cellMap.getCorrectionIncrement(refPos, nearestCell->pos);
}
}

__global__ void cudaUnwrapSelectionStep(SimulationData data, int* result)
{
auto const cellPartition = calcAllThreadsPartition(data.objects.cells.getNumEntries());

for (int index = cellPartition.startIndex; index <= cellPartition.endIndex; ++index) {
auto const& cell = data.objects.cells.at(index);

// Only process selected cells that have already been unwrapped
if ((cell->selected == 1 || cell->selected == 2) && cell->tempValue.as_uint64 == 1) {
auto currentCell = cell;

// Propagate unwrapping to connected cells (heuristics similar to cudaRolloutSelectionStep)
for (int i = 0; i < 30; ++i) {
bool found = false;
for (int j = 0; j < currentCell->numConnections; ++j) {
auto connectedCell = currentCell->connections[j].cell;
// Check if connected cell is selected and not yet unwrapped
if ((connectedCell->selected == 1 || connectedCell->selected == 2) && connectedCell->tempValue.as_uint64 == 0) {
// Atomically mark as unwrapped to prevent race conditions
auto oldValue = atomicCAS(reinterpret_cast<unsigned long long int*>(&connectedCell->tempValue.as_uint64), 0ull, 1ull);
if (oldValue == 0) {
// Calculate the unwrapped position based on the current cell's unwrapped position
auto delta = connectedCell->pos - currentCell->pos;
data.cellMap.correctDirection(delta);
connectedCell->shared1 = currentCell->shared1 + delta;
currentCell = connectedCell;
found = true;
atomicExch(result, 1);
break;
}
}
}
if (!found) {
break;
}
}
}
}
}

__global__ void cudaApplyForce(SimulationData data, ApplyForceData applyData)
{
{
Expand Down
5 changes: 5 additions & 0 deletions source/EngineGpuKernels/EditKernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ __global__ void cudaSetSelection(AreaSelectionData selectionData, SimulationData
__global__ void cudaRemoveSelection(SimulationData data, bool onlyClusterSelection);
__global__ void cudaSwapSelection(float2 pos, float radius, SimulationData data);
__global__ void cudaRolloutSelectionStep(SimulationData data, int* result);
__global__ void cudaInitUnwrapSelection(SimulationData data);
__global__ void cudaFindUnwrapClusters(SimulationData data, int* result);
__global__ void cudaFindNearestInCluster(SimulationData data, float2 refPos);
__global__ void cudaMarkUnwrapStartCells(SimulationData data, float2 refPos);
__global__ void cudaUnwrapSelectionStep(SimulationData data, int* result);
__global__ void cudaApplyForce(SimulationData data, ApplyForceData applyData);
__global__ void cudaResetSelectionResult(SelectionResult result);
__global__ void cudaCalcCellWithMinimalPosY(SimulationData data, unsigned long long int* minCellPosYAndIndex);
Expand Down
31 changes: 31 additions & 0 deletions source/EngineGpuKernels/EditKernelsService.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ _EditKernelsService::_EditKernelsService()
{
auto& memoryManager = CudaMemoryManager::getInstance();
memoryManager.acquireMemory(1, _cudaRolloutResult);
memoryManager.acquireMemory(1, _cudaUnwrapResult);
memoryManager.acquireMemory(1, _cudaSwitchResult);
memoryManager.acquireMemory(1, _cudaUpdateResult);
memoryManager.acquireMemory(1, _cudaRemoveResult);
Expand All @@ -26,6 +27,7 @@ _EditKernelsService::~_EditKernelsService()
{
auto& memoryManager = CudaMemoryManager::getInstance();
memoryManager.freeMemory(_cudaRolloutResult);
memoryManager.freeMemory(_cudaUnwrapResult);
memoryManager.freeMemory(_cudaSwitchResult);
memoryManager.freeMemory(_cudaUpdateResult);
memoryManager.freeMemory(_cudaRemoveResult);
Expand Down Expand Up @@ -294,6 +296,35 @@ void _EditKernelsService::rolloutSelection(CudaSettings const& gpuSettings, Simu
} while (1 == copyToHost(_cudaRolloutResult));
}

void _EditKernelsService::unwrapSelection(CudaSettings const& gpuSettings, SimulationData const& data, float2 const& refPos)
{
// Step 1: Initialize all selected cells (clusterIndex, tempValue, shared1, shared2)
KERNEL_CALL(cudaInitUnwrapSelection, data);
cudaDeviceSynchronize();

// Step 2: Find connected components among selected cells using cluster propagation
do {
setValueToDevice(_cudaUnwrapResult, 0);
KERNEL_CALL(cudaFindUnwrapClusters, data, _cudaUnwrapResult);
cudaDeviceSynchronize();
} while (1 == copyToHost(_cudaUnwrapResult));

// Step 3: Find the nearest cell to refPos in each cluster
KERNEL_CALL(cudaFindNearestInCluster, data, refPos);
cudaDeviceSynchronize();

// Step 4: Mark the starting cells (nearest in each cluster) as unwrapped
KERNEL_CALL(cudaMarkUnwrapStartCells, data, refPos);
cudaDeviceSynchronize();

// Step 5: Propagate unwrapping through connected cells in all components
do {
setValueToDevice(_cudaUnwrapResult, 0);
KERNEL_CALL(cudaUnwrapSelectionStep, data, _cudaUnwrapResult);
cudaDeviceSynchronize();
} while (1 == copyToHost(_cudaUnwrapResult));
}

void _EditKernelsService::applyCataclysm(CudaSettings const& gpuSettings, SimulationData const& data)
{
KERNEL_CALL(cudaApplyCataclysm, data);
Expand Down
2 changes: 2 additions & 0 deletions source/EngineGpuKernels/EditKernelsService.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ public:
void applyForce(CudaSettings const& gpuSettings, SimulationData const& data, ApplyForceData const& applyData);

void rolloutSelection(CudaSettings const& gpuSettings, SimulationData const& data);
void unwrapSelection(CudaSettings const& gpuSettings, SimulationData const& data, float2 const& refPos);

void applyCataclysm(CudaSettings const& gpuSettings, SimulationData const& data);

Expand All @@ -43,6 +44,7 @@ private:

// Gpu memory
int* _cudaRolloutResult;
int* _cudaUnwrapResult;
int* _cudaSwitchResult;
int* _cudaUpdateResult;
int* _cudaRemoveResult;
Expand Down