chrxh · Copilot · Dec 13, 2025 · Dec 13, 2025 · Dec 13, 2025 · Dec 14, 2025
diff --git a/source/EngineGpuKernels/EditKernels.cu b/source/EngineGpuKernels/EditKernels.cu
@@ -553,6 +553,154 @@ __global__ void cudaRolloutSelectionStep(SimulationData data, int* result)
     }
 }
 
+__global__ void cudaInitUnwrapSelection(SimulationData data)
+{
+    auto const cellPartition = calcAllThreadsPartition(data.objects.cells.getNumEntries());
+
+    for (int index = cellPartition.startIndex; index <= cellPartition.endIndex; ++index) {
+        auto const& cell = data.objects.cells.at(index);
+
+        // Initialize all selected cells for cluster-based unwrapping
+        // clusterIndex: used to identify connected components (initialized to self)
+        // tempValue.as_uint64: 0 = not yet unwrapped, 1 = unwrapped
+        // shared1: will hold the unwrapped position
+        // shared2: stores (distance, nearestCellIndex) for cluster heads
+        if (cell->selected == 1 || cell->selected == 2) {
+            cell->clusterIndex = index;
+            cell->tempValue.as_uint64 = 0;  // Not unwrapped yet
+            cell->shared1 = cell->pos;
+            // Use reinterpret_cast to store uint64 in shared2 (two floats = 8 bytes = 64 bits)
+            *reinterpret_cast<unsigned long long int*>(&cell->shared2) = 0xffffffffffffffffull;  // Max distance initially
+        }
+    }
+}
+
+__global__ void cudaFindUnwrapClusters(SimulationData data, int* result)
+{
+    auto const cellPartition = calcAllThreadsPartition(data.objects.cells.getNumEntries());
+
+    for (int index = cellPartition.startIndex; index <= cellPartition.endIndex; ++index) {
+        auto currentCell = data.objects.cells.at(index);
+
+        // Only process selected cells
+        if (currentCell->selected != 1 && currentCell->selected != 2) {
+            continue;
+        }
+
+        // Propagate minimum clusterIndex through connected selected cells (similar to ClusterProcessor)
+        for (int i = 0; i < 30; ++i) {
+            bool found = false;
+            for (int j = 0; j < currentCell->numConnections; ++j) {
+                auto candidateCell = currentCell->connections[j].cell;
+                // Only consider connected selected cells
+                if (candidateCell->selected != 1 && candidateCell->selected != 2) {
+                    continue;
+                }
+                auto cellTag = currentCell->clusterIndex;
+                auto origTag = atomicMin(&candidateCell->clusterIndex, cellTag);
+                if (cellTag < origTag) {
+                    currentCell = candidateCell;
+                    found = true;
+                    atomicExch(result, 1);
+                    break;
+                }
+            }
+            if (!found) {
+                break;
+            }
+        }
+    }
+}
+
+__global__ void cudaFindNearestInCluster(SimulationData data, float2 refPos)
+{
+    auto const cellPartition = calcAllThreadsPartition(data.objects.cells.getNumEntries());
+
+    for (int index = cellPartition.startIndex; index <= cellPartition.endIndex; ++index) {
+        auto const& cell = data.objects.cells.at(index);
+
+        // Only process selected cells
+        if (cell->selected != 1 && cell->selected != 2) {
+            continue;
+        }
+
+        // Calculate distance to refPos (using periodic boundary)
+        auto distance = data.cellMap.getDistance(refPos, cell->pos);
+
+        // Encode distance and index into 64-bit value (upper 32 bits: distance * 1000, lower 32 bits: index)
+        auto distanceInt = static_cast<unsigned long long int>(distance * 1000.0f);
+        auto encodedValue = (distanceInt << 32) | static_cast<unsigned long long int>(index);
+
+        // Get the cluster head cell and atomically update if this cell is closer
+        // We store the encoded value in shared2 (reinterpreted as uint64)
+        auto clusterHead = data.objects.cells.at(cell->clusterIndex);
+        atomicMin(reinterpret_cast<unsigned long long int*>(&clusterHead->shared2), encodedValue);
+    }
+}
+
+__global__ void cudaMarkUnwrapStartCells(SimulationData data, float2 refPos)
+{
+    auto const cellPartition = calcAllThreadsPartition(data.objects.cells.getNumEntries());
+
+    for (int index = cellPartition.startIndex; index <= cellPartition.endIndex; ++index) {
+        auto const& cell = data.objects.cells.at(index);
+
+        // Only process selected cells that are cluster heads (clusterIndex == self)
+        if ((cell->selected != 1 && cell->selected != 2) || cell->clusterIndex != static_cast<uint32_t>(index)) {
+            continue;
+        }
+
+        // This is a cluster head - get the nearest cell index from shared2
+        auto encodedValue = *reinterpret_cast<unsigned long long int*>(&cell->shared2);
+        auto nearestCellIndex = static_cast<int>(encodedValue & 0xffffffff);
+
+        // Mark the nearest cell as unwrapped with position relative to refPos
+        auto nearestCell = data.objects.cells.at(nearestCellIndex);
+        nearestCell->tempValue.as_uint64 = 1;  // Mark as unwrapped
+        nearestCell->shared1 = nearestCell->pos + data.cellMap.getCorrectionIncrement(refPos, nearestCell->pos);
+    }
+}
+
+__global__ void cudaUnwrapSelectionStep(SimulationData data, int* result)
+{
+    auto const cellPartition = calcAllThreadsPartition(data.objects.cells.getNumEntries());
+
+    for (int index = cellPartition.startIndex; index <= cellPartition.endIndex; ++index) {
+        auto const& cell = data.objects.cells.at(index);
+
+        // Only process selected cells that have already been unwrapped
+        if ((cell->selected == 1 || cell->selected == 2) && cell->tempValue.as_uint64 == 1) {
+            auto currentCell = cell;
+
+            // Propagate unwrapping to connected cells (heuristics similar to cudaRolloutSelectionStep)
+            for (int i = 0; i < 30; ++i) {
+                bool found = false;
+                for (int j = 0; j < currentCell->numConnections; ++j) {
+                    auto connectedCell = currentCell->connections[j].cell;
+                    // Check if connected cell is selected and not yet unwrapped
+                    if ((connectedCell->selected == 1 || connectedCell->selected == 2) && connectedCell->tempValue.as_uint64 == 0) {
+                        // Atomically mark as unwrapped to prevent race conditions
+                        auto oldValue = atomicCAS(reinterpret_cast<unsigned long long int*>(&connectedCell->tempValue.as_uint64), 0ull, 1ull);
+                        if (oldValue == 0) {
+                            // Calculate the unwrapped position based on the current cell's unwrapped position
+                            auto delta = connectedCell->pos - currentCell->pos;
+                            data.cellMap.correctDirection(delta);
+                            connectedCell->shared1 = currentCell->shared1 + delta;
+                            currentCell = connectedCell;
+                            found = true;
+                            atomicExch(result, 1);
+                            break;
+                        }
+                    }
+                }
+                if (!found) {
+                    break;
+                }
+            }
+        }
+    }
+}
+
 __global__ void cudaApplyForce(SimulationData data, ApplyForceData applyData)
 {
     {

diff --git a/source/EngineGpuKernels/EditKernels.cuh b/source/EngineGpuKernels/EditKernels.cuh
@@ -48,6 +48,11 @@ __global__ void cudaSetSelection(AreaSelectionData selectionData, SimulationData
 __global__ void cudaRemoveSelection(SimulationData data, bool onlyClusterSelection);
 __global__ void cudaSwapSelection(float2 pos, float radius, SimulationData data);
 __global__ void cudaRolloutSelectionStep(SimulationData data, int* result);
+__global__ void cudaInitUnwrapSelection(SimulationData data);
+__global__ void cudaFindUnwrapClusters(SimulationData data, int* result);
+__global__ void cudaFindNearestInCluster(SimulationData data, float2 refPos);
+__global__ void cudaMarkUnwrapStartCells(SimulationData data, float2 refPos);
+__global__ void cudaUnwrapSelectionStep(SimulationData data, int* result);
 __global__ void cudaApplyForce(SimulationData data, ApplyForceData applyData);
 __global__ void cudaResetSelectionResult(SelectionResult result);
 __global__ void cudaCalcCellWithMinimalPosY(SimulationData data, unsigned long long int* minCellPosYAndIndex);

diff --git a/source/EngineGpuKernels/EditKernelsService.cu b/source/EngineGpuKernels/EditKernelsService.cu
@@ -8,6 +8,7 @@ _EditKernelsService::_EditKernelsService()
 {
     auto& memoryManager = CudaMemoryManager::getInstance();
     memoryManager.acquireMemory(1, _cudaRolloutResult);
+    memoryManager.acquireMemory(1, _cudaUnwrapResult);
     memoryManager.acquireMemory(1, _cudaSwitchResult);
     memoryManager.acquireMemory(1, _cudaUpdateResult);
     memoryManager.acquireMemory(1, _cudaRemoveResult);
@@ -26,6 +27,7 @@ _EditKernelsService::~_EditKernelsService()
 {
     auto& memoryManager = CudaMemoryManager::getInstance();
     memoryManager.freeMemory(_cudaRolloutResult);
+    memoryManager.freeMemory(_cudaUnwrapResult);
     memoryManager.freeMemory(_cudaSwitchResult);
     memoryManager.freeMemory(_cudaUpdateResult);
     memoryManager.freeMemory(_cudaRemoveResult);
@@ -294,6 +296,35 @@ void _EditKernelsService::rolloutSelection(CudaSettings const& gpuSettings, Simu
     } while (1 == copyToHost(_cudaRolloutResult));
 }
 
+void _EditKernelsService::unwrapSelection(CudaSettings const& gpuSettings, SimulationData const& data, float2 const& refPos)
+{
+    // Step 1: Initialize all selected cells (clusterIndex, tempValue, shared1, shared2)
+    KERNEL_CALL(cudaInitUnwrapSelection, data);
+    cudaDeviceSynchronize();
+
+    // Step 2: Find connected components among selected cells using cluster propagation
+    do {
+        setValueToDevice(_cudaUnwrapResult, 0);
+        KERNEL_CALL(cudaFindUnwrapClusters, data, _cudaUnwrapResult);
+        cudaDeviceSynchronize();
+    } while (1 == copyToHost(_cudaUnwrapResult));
+
+    // Step 3: Find the nearest cell to refPos in each cluster
+    KERNEL_CALL(cudaFindNearestInCluster, data, refPos);
+    cudaDeviceSynchronize();
+
+    // Step 4: Mark the starting cells (nearest in each cluster) as unwrapped
+    KERNEL_CALL(cudaMarkUnwrapStartCells, data, refPos);
+    cudaDeviceSynchronize();
+
+    // Step 5: Propagate unwrapping through connected cells in all components
+    do {
+        setValueToDevice(_cudaUnwrapResult, 0);
+        KERNEL_CALL(cudaUnwrapSelectionStep, data, _cudaUnwrapResult);
+        cudaDeviceSynchronize();
+    } while (1 == copyToHost(_cudaUnwrapResult));
+}
+
 void _EditKernelsService::applyCataclysm(CudaSettings const& gpuSettings, SimulationData const& data)
 {
     KERNEL_CALL(cudaApplyCataclysm, data);

diff --git a/source/EngineGpuKernels/EditKernelsService.cuh b/source/EngineGpuKernels/EditKernelsService.cuh
@@ -35,6 +35,7 @@ public:
     void applyForce(CudaSettings const& gpuSettings, SimulationData const& data, ApplyForceData const& applyData);
 
     void rolloutSelection(CudaSettings const& gpuSettings, SimulationData const& data);
+    void unwrapSelection(CudaSettings const& gpuSettings, SimulationData const& data, float2 const& refPos);
 
     void applyCataclysm(CudaSettings const& gpuSettings, SimulationData const& data);
 
@@ -43,6 +44,7 @@ private:
 
     // Gpu memory
     int* _cudaRolloutResult;
+    int* _cudaUnwrapResult;
     int* _cudaSwitchResult;
     int* _cudaUpdateResult;
     int* _cudaRemoveResult;