NotShrirang · NotShrirang · Dec 19, 2025 · Dec 15, 2025
diff --git a/README.md b/README.md
@@ -218,13 +218,27 @@ tensorax/
 
 ## ⚡ Performance
 
-Tensorax uses hand-optimized CUDA kernels for maximum performance:
+Tensorax uses hand-optimized CUDA kernels for maximum performance. Here are some benchmark results for matrix multiplication (fp32, 3x1024×1024):
 
-| Operation       | Matrix Size | CPU Time   | CUDA Time | Speedup    |
-| --------------- | ----------- | ---------- | --------- | ---------- |
-| Matrix Multiply | 64×64       | 0.09 ms    | 0.02 ms   | **3.8x**   |
-| Matrix Multiply | 128×128     | 0.83 ms    | 0.05 ms   | **17.1x**  |
-| Matrix Multiply | 1024×1024   | 2382.89 ms | 5.31 ms   | **448.7x** |
+### Matrix Multiplication Benchmark (100 runs)
+
+Comparison of different CUDA kernel implementations vs NumPy and PyTorch:
+
+| Implementation               | Time (seconds) | Relative Performance |
+| ---------------------------- | -------------- | -------------------- |
+| **1D Block Tiling (Best)**   | 0.95           | **2.31x faster**     |
+| Tiled Matrix Multiply        | 1.22           | **1.80x faster**     |
+| NumPy (CPU)                  | 1.85           | Baseline (CPU)       |
+| Shared Memory Cache Blocking | 2.18           | 0.85x                |
+| Default CUDA                 | 3.37           | 0.55x                |
+| Shared Memory Coalescing     | 3.44           | 0.54x                |
+| **PyTorch CUDA (Reference)** | **0.41**       | **4.51x faster**     |
+
+**Key Insights:**
+
+- Our 1D block tiling implementation achieves **2.31x faster** performance than NumPy
+- Performance is **43% of PyTorch's highly optimized CUDA kernels** (room for improvement)
+- Tiled approaches consistently outperform naive implementations by **1.5-3x**
 
 ### Optimization Techniques
 

diff --git a/benchmarks/matmul_benchmark.py b/benchmarks/matmul_benchmark.py
@@ -1,5 +1,6 @@
 import timeit
 
+import numpy as np
 import torch
 import tensorax as ts
 
@@ -11,35 +12,53 @@
 a_t = ts.Tensor(a_torch.cpu().numpy(), dtype='float32', device='cuda')
 b_t = ts.Tensor(b_torch.cpu().numpy(), dtype='float32', device='cuda')
 
+a_np = a_torch.cpu().numpy()
+b_np = b_torch.cpu().numpy()
 
 # Benchmarking matmul with shared memory coalescing
 def matmul_shared_memory_coalesced():
+    torch.cuda.synchronize()
     c = a_t.matmul(b_t, method="shared_memory_coalesced")
+    torch.cuda.synchronize()
     return c
 
 # Benchmarking default matmul
 def matmul_default():
+    torch.cuda.synchronize()
     c = a_t.matmul(b_t, method="default")
+    torch.cuda.synchronize()
     return c
 
 # Benchmarking tiled matmul
 def matmul_tiled():
+    torch.cuda.synchronize()
     c = a_t.matmul(b_t, method="tiled")
+    torch.cuda.synchronize()
     return c
 
 # Benchmarking matmul with shared memory cache blocking
 def matmul_cache_blocking():
+    torch.cuda.synchronize()
     c = a_t.matmul(b_t, method="shared_memory_cache_blocking")
+    torch.cuda.synchronize()
     return c
 
 # Benchmarking matmul with 1D block tiling
 def matmul_1d_block_tiling():
+    torch.cuda.synchronize()
     c = a_t.matmul(b_t, method="block_tiling_1d")
+    torch.cuda.synchronize()
     return c
 
+def matmul_numpy():
+    c_np = np.matmul(a_np, b_np)
+    return c_np
+
 # Benchmarking PyTorch matmul
 def matmul_pytorch():
+    torch.cuda.synchronize()
     c = torch.matmul(a_torch, b_torch)
+    torch.cuda.synchronize()
     return c
 
 # Warm-up run
@@ -49,6 +68,7 @@ def matmul_pytorch():
 matmul_tiled()
 matmul_cache_blocking()
 matmul_1d_block_tiling()
+matmul_numpy()
 matmul_pytorch()
 print("Warm-up done.")
 
@@ -69,5 +89,8 @@ def matmul_pytorch():
 time_1d_block_tiling = timeit.timeit(matmul_1d_block_tiling, number=times)
 print(f"Matmul with 1D block tiling time over {times} runs: {time_1d_block_tiling} seconds")
 
+time_numpy = timeit.timeit(matmul_numpy, number=times)
+print(f"Numpy matmul time over {times} runs: {time_numpy} seconds")
+
 time_pytorch = timeit.timeit(matmul_pytorch, number=times)
 print(f"PyTorch matmul time over {times} runs: {time_pytorch} seconds")