SSL-A-Team · barulicm · Sep 1, 2025 · Aug 31, 2025 · Aug 31, 2025 · Aug 31, 2025
diff --git a/ateam_spatial/CMakeLists.txt b/ateam_spatial/CMakeLists.txt
@@ -36,6 +36,9 @@ set_target_properties(${PROJECT_NAME} PROPERTIES
   CUDA_ARCHITECTURES native
 )
 target_compile_options(${PROJECT_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--maxrregcount 62>)
+if(ATEAM_SPATIAL_CUDA_DEBUG)
+  target_compile_options(${PROJECT_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-g -G>)
+endif()
 if(ATEAM_SPATIAL_CUDA_COMPILE_STATS)
   target_compile_options(${PROJECT_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xptxas="-v">)
 endif()

diff --git a/ateam_spatial/include/ateam_spatial/min_max_loc_kernel.hpp b/ateam_spatial/include/ateam_spatial/min_max_loc_kernel.hpp
@@ -72,28 +72,30 @@ __global__ void max_loc_kernel(const float * buffer, const uint32_t buffer_size,
 {
   const auto idx = blockDim.x * blockIdx.x + threadIdx.x;
   __shared__ uint32_t shared_index_data[blockSize];
-  if (idx < buffer_size){
+  shared_index_data[threadIdx.x] = idx;
+  __syncthreads();
 
-    /*copy to shared memory*/
-    shared_index_data[threadIdx.x] = idx;
-    __syncthreads();
-
-    for(int stride=1; stride < blockDim.x; stride *= 2) {
+  for(int stride=1; stride < blockDim.x; stride *= 2) {
+    if (idx < buffer_size) {
       if (threadIdx.x % (2*stride) == 0) {
         const auto lhs_block_index = threadIdx.x;
         const auto rhs_block_index = threadIdx.x + stride;
         const auto lhs_index = shared_index_data[lhs_block_index];
         const auto rhs_index = shared_index_data[rhs_block_index];
-        const float lhs_value = buffer[lhs_index];
-        const float rhs_value = buffer[rhs_index];
-        if(lhs_value > rhs_value) {
-          shared_index_data[lhs_block_index] = lhs_index;
+        if (rhs_block_index < buffer_size) {
+          const float lhs_value = buffer[lhs_index];
+          const float rhs_value = buffer[rhs_index];
+          if(lhs_value > rhs_value) {
+            shared_index_data[lhs_block_index] = lhs_index;
+          } else {
+            shared_index_data[lhs_block_index] = rhs_index;
+          }
         } else {
-          shared_index_data[lhs_block_index] = rhs_index;
+          shared_index_data[lhs_block_index] = lhs_index;
         }
       }
-      __syncthreads();
     }
+    __syncthreads();
   }
   if (threadIdx.x == 0) {
     atomicMax(loc_out, shared_index_data[0], buffer);
@@ -105,28 +107,30 @@ __global__ void min_loc_kernel(const float * buffer, const uint32_t buffer_size,
 {
   const auto idx = blockDim.x * blockIdx.x + threadIdx.x;
   __shared__ uint32_t shared_index_data[blockSize];
-  if (idx < buffer_size){
+  shared_index_data[threadIdx.x] = idx;
+  __syncthreads();
 
-    /*copy to shared memory*/
-    shared_index_data[threadIdx.x] = idx;
-    __syncthreads();
-
-    for(int stride=1; stride < blockDim.x; stride *= 2) {
+  for(int stride=1; stride < blockDim.x; stride *= 2) {
+    if (idx < buffer_size) {
       if (threadIdx.x % (2*stride) == 0) {
         const auto lhs_block_index = threadIdx.x;
         const auto rhs_block_index = threadIdx.x + stride;
         const auto lhs_index = shared_index_data[lhs_block_index];
         const auto rhs_index = shared_index_data[rhs_block_index];
-        const float lhs_value = buffer[lhs_index];
-        const float rhs_value = buffer[rhs_index];
-        if(lhs_value < rhs_value) {
-          shared_index_data[lhs_block_index] = lhs_index;
+        if (rhs_block_index < buffer_size) {
+          const float lhs_value = buffer[lhs_index];
+          const float rhs_value = buffer[rhs_index];
+          if(lhs_value < rhs_value) {
+            shared_index_data[lhs_block_index] = lhs_index;
+          } else {
+            shared_index_data[lhs_block_index] = rhs_index;
+          }
         } else {
-          shared_index_data[lhs_block_index] = rhs_index;
+          shared_index_data[lhs_block_index] = lhs_index;
         }
       }
-      __syncthreads();
     }
+    __syncthreads();
   }
   if (threadIdx.x == 0) {
     atomicMin(loc_out, shared_index_data[0], buffer);

diff --git a/ateam_spatial/src/layers/distance_down_field.cu b/ateam_spatial/src/layers/distance_down_field.cu
@@ -5,7 +5,7 @@ namespace ateam_spatial::layers
 {
 
 CUDA_HOSTDEV float DistanceDownField(const float x, const FieldDimensions & field_dims, const SpatialSettings & settings) {
-  return x + (WorldWidthReal(field_dims) / 2.0);
+  return x + (WorldWidthReal(field_dims) / 2.0f);
 }
 
 } // namespace ateam_spatial::layers
diff --git a/ateam_spatial/src/layers/distance_from_field_edge.cu b/ateam_spatial/src/layers/distance_from_field_edge.cu
@@ -7,8 +7,8 @@ namespace ateam_spatial::layers
 CUDA_HOSTDEV float DistanceFromFieldEdge(const float x, const float y, const FieldDimensions & field_dims, const SpatialSettings & settings) {
   const auto x_real = x;
   const auto y_real = y;
-  const auto half_world_width = WorldWidthReal(field_dims) / 2.0;
-  const auto half_world_height = WorldHeightReal(field_dims) / 2.0;
+  const auto half_world_width = WorldWidthReal(field_dims) / 2.0f;
+  const auto half_world_height = WorldHeightReal(field_dims) / 2.0f;
   const auto half_world_diff = half_world_width - half_world_height;
 
   if(fabs(x_real) < half_world_diff || (fabs(x_real) - half_world_diff) < fabs(y_real)) {

diff --git a/ateam_spatial/src/layers/in_field.cu b/ateam_spatial/src/layers/in_field.cu
@@ -5,10 +5,10 @@ namespace ateam_spatial::layers
 {
 
 CUDA_HOSTDEV float InField(const float x, const float y, const FieldDimensions & field_dims, const SpatialSettings & settings) {
-  const auto pos_x_thresh = field_dims.field_length / 2.0;
-  const auto neg_x_thresh = -field_dims.field_length / 2.0;
-  const auto pos_y_thresh = field_dims.field_width / 2.0;
-  const auto neg_y_thresh = -field_dims.field_width / 2.0;
+  const auto pos_x_thresh = field_dims.field_length / 2.0f;
+  const auto neg_x_thresh = -field_dims.field_length / 2.0f;
+  const auto pos_y_thresh = field_dims.field_width / 2.0f;
+  const auto neg_y_thresh = -field_dims.field_width / 2.0f;
 
   return (x <= pos_x_thresh) && (x >= neg_x_thresh) && (y <= pos_y_thresh) && (y >= neg_y_thresh);
 }

diff --git a/ateam_spatial/src/layers/line_of_sight.cu b/ateam_spatial/src/layers/line_of_sight.cu
@@ -11,18 +11,18 @@ CUDA_HOSTDEV float PointDistance(const float x_1, const float y_1, const float x
 CUDA_HOSTDEV float DistanceToSegment(const float line_1_x, const float line_1_y, const float line_2_x, const float line_2_y, const float point_x, const float point_y)
 {
   const auto l2 = PointDistance(line_1_x, line_1_y, line_2_x, line_2_y);
-  if (fabsf(l2) < 1e-8) {
+  if (fabsf(l2) < 1e-8f) {
     return PointDistance(line_1_x, line_1_y, point_x, point_y);
   }
   auto t = (((point_x - line_1_x) * (line_2_x - line_1_x)) + ((point_y - line_1_y)*(line_2_y - line_1_y))) / (l2*l2);
-  t = max(0.0, min(1.0, t));
+  t = max(0.0f, min(1.0f, t));
   const auto projected_x = line_1_x + (t * (line_2_x - line_1_x));
   const auto projected_y = line_1_y + (t * (line_2_y - line_1_y));
   return PointDistance(point_x, point_y, projected_x, projected_y);
 }
 
 CUDA_HOSTDEV float LineOfSight(const float src_x, const float src_y, const float dst_x, const float dst_y, const Robot * their_bots, const FieldDimensions & field_dims, const SpatialSettings & settings) {
-  const auto robot_radius = 0.09;
+  const auto robot_radius = 0.09f;
 
   bool result = 1;
   for(auto i = 0; i < 16; ++i) {

diff --git a/ateam_spatial/src/layers/outside_their_defense_area.cu b/ateam_spatial/src/layers/outside_their_defense_area.cu
@@ -5,10 +5,11 @@ namespace ateam_spatial::layers
 {
 
 CUDA_HOSTDEV float OutsideTheirDefenseArea(const float x, const float y, const FieldDimensions & field_dims, const SpatialSettings & settings) {
-  const auto pos_x_thresh = field_dims.field_length / 2.0;
-  const auto neg_x_thresh = (field_dims.field_length / 2.0) - field_dims.defense_area_depth;
-  const auto pos_y_thresh = field_dims.defense_area_width / 2.0;
-  const auto neg_y_thresh = -field_dims.defense_area_width / 2.0;
+  const auto robot_diameter = 0.180f;
+  const auto pos_x_thresh = field_dims.field_length / 2.0f;
+  const auto neg_x_thresh = (field_dims.field_length / 2.0f) - (field_dims.defense_area_depth + (robot_diameter * 3.0f));
+  const auto pos_y_thresh = (field_dims.defense_area_width / 2.0f) + (robot_diameter * 3.0f);
+  const auto neg_y_thresh = -pos_y_thresh;
 
   return !((x <= pos_x_thresh) && (x >= neg_x_thresh) && (y <= pos_y_thresh) && (y >= neg_y_thresh));
 }

diff --git a/ateam_spatial/src/layers/width_of_shot_on_goal.cu b/ateam_spatial/src/layers/width_of_shot_on_goal.cu
@@ -11,8 +11,8 @@ CUDA_HOSTDEV float WidthOfShotOnGoal(const float x, const float y, const Robot *
   const auto real_x = x;
   const auto real_y = y;
 
-  const auto goal_x = field_dims.field_length / 2.0;
-  const auto goal_y_start = -field_dims.goal_width / 2.0;
+  const auto goal_x = field_dims.field_length / 2.0f;
+  const auto goal_y_start = -field_dims.goal_width / 2.0f;
 
   const auto step_size = field_dims.goal_width / kNumSteps;
 
@@ -30,6 +30,12 @@ CUDA_HOSTDEV float WidthOfShotOnGoal(const float x, const float y, const Robot *
   }
   result = max(result, counter);
 
+  // Avoid shots at shallow angles
+  const auto angle_to_goal = atan2(fabs(goal_x - real_x), fabs(real_y));
+  if (angle_to_goal < 0.7f) {
+    return 0.0f;
+  }
+
   return result * step_size;
 }
 

diff --git a/ateam_spatial/src/maps/receiver_position_quality.cu b/ateam_spatial/src/maps/receiver_position_quality.cu
@@ -15,10 +15,10 @@ CUDA_HOSTDEV float ReceiverPositionQuality(const int spatial_x, const int spatia
 {
   const auto x = SpatialToRealX(spatial_x, field_dims, settings);
   const auto y = SpatialToRealY(spatial_y, field_dims, settings);
-  const auto max_edge_dist = 0.75;
+  const auto max_edge_dist = 1.25f;
   const auto edge_dist_multiplier = min(layers::DistanceFromFieldEdge(x, y, field_dims, settings), max_edge_dist) / max_edge_dist;
   const auto shot_width_multiplier = layers::WidthOfShotOnGoal(x, y, their_bots, field_dims, settings) / field_dims.goal_width;
-  const auto robot_distance_multiplier = min(layers::DistanceToTheirBots(x, y, their_bots, field_dims, settings), 1.0);
+  const auto robot_distance_multiplier = min(layers::DistanceToTheirBots(x, y, their_bots, field_dims, settings), 1.0f);
   return layers::InField(x, y, field_dims, settings)
          * layers::OutsideTheirDefenseArea(x, y, field_dims, settings)
          * layers::LineOfSightBall(x, y, ball, their_bots, field_dims, settings)

diff --git a/ateam_spatial/src/spatial_evaluator.cu b/ateam_spatial/src/spatial_evaluator.cu
@@ -33,7 +33,7 @@ namespace ateam_spatial
 
 SpatialEvaluator::SpatialEvaluator()
 {
-  settings_.resolution = 1e-2; // TODO(barulicm): parameterize this
+  settings_.resolution = 1e-1; // TODO(barulicm): parameterize this
   cuda_device_available_ = IsCudaDeviceAvailable();
   if(!cuda_device_available_) {
     std::cerr << "Could not detect CUDA device. GPU-accelerated spatial code is disabled." << std::endl;

diff --git a/ateam_spatial/test/CMakeLists.txt b/ateam_spatial/test/CMakeLists.txt
@@ -19,3 +19,9 @@ target_include_directories(ateam_spatial_tests PUBLIC
 target_link_libraries(ateam_spatial_tests
   ${PROJECT_NAME}
 )
+
+set_target_properties(ateam_spatial_tests PROPERTIES
+  CUDA_SEPARABLE_COMPILATION ON
+  CUDA_STANDARD 17
+  CUDA_ARCHITECTURES native
+)
diff --git a/ateam_spatial/test/layers_tests.cpp b/ateam_spatial/test/layers_tests.cpp
@@ -199,7 +199,7 @@ TEST(LayersTests, OutsideTheirDefenseArea)
   EXPECT_FLOAT_EQ(OutsideTheirDefenseArea(-8.3, -4.8, field, settings), 1.0);
   EXPECT_FLOAT_EQ(OutsideTheirDefenseArea(7.199, 0.0, field, settings), 0.0);
   EXPECT_FLOAT_EQ(OutsideTheirDefenseArea(7.199, -3.8, field, settings), 1.0);
-  EXPECT_FLOAT_EQ(OutsideTheirDefenseArea(7.199, 1.2, field, settings), 1.0);
+  EXPECT_FLOAT_EQ(OutsideTheirDefenseArea(7.199, 1.6, field, settings), 1.0);
   EXPECT_FLOAT_EQ(OutsideTheirDefenseArea(-7.8, 0.0, field, settings), 1.0);
 }
 
@@ -241,7 +241,7 @@ TEST(LayersTests, WidthOfShotOnGoal)
   EXPECT_NEAR(WidthOfShotOnGoal(0.0, 0.0, their_robots.data(), field, settings), 0.5, kAcceptableError);
   their_robots[0].x = 0.2;
   their_robots[0].y = -0.09;
-  EXPECT_NEAR(WidthOfShotOnGoal(0.0, 0.0, their_robots.data(), field, settings), 0.5, kAcceptableError);
+  EXPECT_NEAR(WidthOfShotOnGoal(0.0, 0.0, their_robots.data(), field, settings), 0.49, kAcceptableError);
   their_robots[0].x = 4.5;
   their_robots[0].y = 0.18;
   EXPECT_NEAR(WidthOfShotOnGoal(0.0, 0.0, their_robots.data(), field, settings), 0.65, kAcceptableError);

diff --git a/ateam_spatial/test/maps_tests.cpp b/ateam_spatial/test/maps_tests.cpp
@@ -57,7 +57,7 @@ TEST(MapsTests, ReceiverPositionQuality)
 
   const auto kAcceptableError = 1e-4;
   EXPECT_FLOAT_EQ(ReceiverPositionQuality(0, 4800, ball, their_robots.data(), field, settings), 0.0);
-  EXPECT_NEAR(ReceiverPositionQuality(600, 4800, ball, their_robots.data(), field, settings), 0.48, kAcceptableError);
+  EXPECT_NEAR(ReceiverPositionQuality(600, 4800, ball, their_robots.data(), field, settings), 0.288, kAcceptableError);
   EXPECT_NEAR(ReceiverPositionQuality(8300, 4800, ball, their_robots.data(), field, settings), 8.3, kAcceptableError);
   EXPECT_NEAR(ReceiverPositionQuality(14500, 4800, ball, their_robots.data(), field, settings), 14.5, kAcceptableError);
   EXPECT_FLOAT_EQ(ReceiverPositionQuality(16600, 4800, ball, their_robots.data(), field, settings), 0.0);

diff --git a/ateam_spatial/test/min_max_loc_kernel_tests.cu b/ateam_spatial/test/min_max_loc_kernel_tests.cu
@@ -34,10 +34,12 @@ TEST(MinMaxLocKernelTests, MinLoc)
   const dim3 grid_size(std::ceil(static_cast<float>(data.size()) / block_size.x));
   ateam_spatial::GpuObject<uint32_t> gpu_min_index;
   ateam_spatial::min_loc_kernel<threads_per_block><<<grid_size, block_size>>>(gpu_data.Get(), gpu_data.Size(), gpu_min_index.Get());
-  if(const auto ret = cudaDeviceSynchronize();ret != cudaSuccess) {
+  if (const auto ret = cudaDeviceSynchronize(); ret != cudaSuccess)
+  {
     FAIL() << "Failed to launch min_loc_kernel: " << cudaGetErrorString(ret);
   }
-  if(const auto ret = cudaGetLastError(); ret != cudaSuccess) {
+  if (const auto ret = cudaGetLastError(); ret != cudaSuccess)
+  {
     FAIL() << "Failed to run min_loc_kernel: " << cudaGetErrorString(ret);
   }
   uint32_t min_index = 0;
@@ -55,10 +57,12 @@ TEST(MinMaxLocKernelTests, MaxLoc)
   const dim3 grid_size(std::ceil(static_cast<float>(data.size()) / block_size.x));
   ateam_spatial::GpuObject<uint32_t> gpu_max_index;
   ateam_spatial::max_loc_kernel<threads_per_block><<<grid_size, block_size>>>(gpu_data.Get(), gpu_data.Size(), gpu_max_index.Get());
-  if(const auto ret = cudaDeviceSynchronize();ret != cudaSuccess) {
+  if (const auto ret = cudaDeviceSynchronize(); ret != cudaSuccess)
+  {
     FAIL() << "Failed to launch max_loc_kernel: " << cudaGetErrorString(ret);
   }
-  if(const auto ret = cudaGetLastError(); ret != cudaSuccess) {
+  if (const auto ret = cudaGetLastError(); ret != cudaSuccess)
+  {
     FAIL() << "Failed to run max_loc_kernel: " << cudaGetErrorString(ret);
   }
   uint32_t max_index = 0;
@@ -72,9 +76,8 @@ TEST(MinMaxLocKernelTests, MaxLoc_MultipleBlocks)
   std::random_device rdev;
   std::default_random_engine randeng(rdev());
   std::uniform_real_distribution<float> randist(0.0, 200.0);
-  std::generate(data.begin(), data.end(), [&](){
-    return randist(randeng);
-  });
+  std::generate(data.begin(), data.end(), [&]()
+                { return randist(randeng); });
   // Fill the first half with zeros to ensure the max is not in the first block
   std::fill_n(data.begin(), data.size() / 2, 0.f);
   ateam_spatial::GpuArray<float, 1440000> gpu_data;
@@ -84,10 +87,46 @@ TEST(MinMaxLocKernelTests, MaxLoc_MultipleBlocks)
   const dim3 grid_size(std::ceil(static_cast<float>(data.size()) / block_size.x));
   ateam_spatial::GpuObject<uint32_t> gpu_max_index;
   ateam_spatial::max_loc_kernel<threads_per_block><<<grid_size, block_size>>>(gpu_data.Get(), gpu_data.Size(), gpu_max_index.Get());
-  if(const auto ret = cudaDeviceSynchronize();ret != cudaSuccess) {
+  if (const auto ret = cudaDeviceSynchronize(); ret != cudaSuccess)
+  {
     FAIL() << "Failed to launch max_loc_kernel: " << cudaGetErrorString(ret);
   }
-  if(const auto ret = cudaGetLastError(); ret != cudaSuccess) {
+  if (const auto ret = cudaGetLastError(); ret != cudaSuccess)
+  {
+    FAIL() << "Failed to run max_loc_kernel: " << cudaGetErrorString(ret);
+  }
+
+  const auto max_iter = std::max_element(data.begin(), data.end());
+
+  ASSERT_NE(max_iter, data.end()) << "Failed to find a max value.";
+  const auto expected_max_index = std::distance(data.begin(), max_iter);
+
+  uint32_t max_index = 0;
+  gpu_max_index.CopyFromGpu(max_index);
+  EXPECT_EQ(max_index, expected_max_index);
+}
+
+TEST(MinMaxLocKernelTests, MaxLoc_UnalignedDataSize)
+{
+  std::array<float, 72> data{};
+  std::random_device rdev;
+  std::default_random_engine randeng(rdev());
+  std::uniform_real_distribution<float> randist(0.0, 200.0);
+  std::generate(data.begin(), data.end(), [&]()
+                { return randist(randeng); });
+  ateam_spatial::GpuArray<float, 72> gpu_data;
+  gpu_data.CopyToGpu(data);
+  constexpr int threads_per_block = 64;
+  const dim3 block_size(threads_per_block);
+  const dim3 grid_size(std::ceil(static_cast<float>(data.size()) / block_size.x));
+  ateam_spatial::GpuObject<uint32_t> gpu_max_index;
+  ateam_spatial::max_loc_kernel<threads_per_block><<<grid_size, block_size>>>(gpu_data.Get(), gpu_data.Size(), gpu_max_index.Get());
+  if (const auto ret = cudaDeviceSynchronize(); ret != cudaSuccess)
+  {
+    FAIL() << "Failed to launch max_loc_kernel: " << cudaGetErrorString(ret);
+  }
+  if (const auto ret = cudaGetLastError(); ret != cudaSuccess)
+  {
     FAIL() << "Failed to run max_loc_kernel: " << cudaGetErrorString(ret);
   }
 

diff --git a/ateam_spatial/test/spatial_evaluator_tests.cpp b/ateam_spatial/test/spatial_evaluator_tests.cpp
@@ -50,10 +50,15 @@ TEST(SpatialEvaluatorTests, Basic)
 
   eval.CopyMapBuffer(MapId::ReceiverPositionQuality, buffer_out);
 
-  EXPECT_EQ(buffer_out.size(), 1'593'600);
+  const auto resolution = eval.GetSettings().resolution;
+  const auto expected_width = (field.field_width + (2 * field.boundary_width)) / resolution;
+  const auto expected_length = (field.field_length + (2 * field.boundary_width)) / resolution;
+  const auto expected_size = expected_length * expected_width;
+
+  EXPECT_EQ(buffer_out.size(), expected_size);
 
   std::vector<uint8_t> rendered_buffer;
   eval.RenderMapBuffer(MapId::ReceiverPositionQuality, rendered_buffer);
 
-  EXPECT_EQ(rendered_buffer.size(), 1'593'600);
+  EXPECT_EQ(rendered_buffer.size(), expected_size);
 }