umiswing · baoqiwen · May 26, 2026
diff --git a/benchmark_flashmask.py b/benchmark_flashmask.py
@@ -11,6 +11,8 @@
     from paddle.nn.functional.flash_attention import flashmask_attention
 import random
 import os
+import gc
+
 from datetime import datetime
 
 np.random.seed(0)
@@ -89,7 +91,11 @@ def do_bench(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, fast_flu
     # Warm-up
     for _ in range(n_warmup):
         fn()
+
     # Benchmark
+    # Run a full collection up front, then turn off Python's automatic GC so collector pauses don't skew the timed iterations
+    gc.collect()
+    gc.disable()
     for i in range(n_repeat):
         # we don't want `fn` to accumulate gradient values
         # if it contains a backward pass. So we clear the
@@ -103,6 +109,8 @@ def do_bench(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, fast_flu
         start_event[i].record()
         fn()
         end_event[i].record()
+    gc.enable()
+
     # Record clocks
     paddle.device.synchronize()
     times = paddle.to_tensor([s.elapsed_time(e) for s, e in zip(start_event, end_event)], dtype=paddle.float32)