diff --git a/Project/phase4_testing/notebooks/hw_implementation (2).ipynb b/Project/phase4_testing/notebooks/hw_implementation (2).ipynb
new file mode 100644
index 0000000..dbf767f
--- /dev/null
+++ b/Project/phase4_testing/notebooks/hw_implementation (2).ipynb	
@@ -0,0 +1,558 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "8efd56d1",
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Imports successful\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Import needed files\n",
+    "from pynq import Overlay, allocate\n",
+    "import numpy as np\n",
+    "import time\n",
+    "from pathlib import Path\n",
+    "\n",
+    "print(\"Imports successful\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "1af395d4",
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Bitstream loaded successfully!\n",
+      "\n",
+      "Overlay contains: dict_keys(['bnn_top_0', 'axi_dma', 'zynq_ultra_ps_e_0'])\n",
+      "   BNN IP found: <pynq.overlay.DefaultIP object at 0xffff56cb5c90>\n",
+      "      DMA found: <pynq.lib.dma.DMA object at 0xffff57e36650>\n",
+      "   Send channel: <pynq.lib.dma._SDMAChannel object at 0xffff5728bd60>\n",
+      "Receive channel: <pynq.lib.dma._SDMAChannel object at 0xffff5728be20>\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Check that the Vivado bitstream loads and exposes the expected IP blocks\n",
+    "try:\n",
+    "    bnn_overlay = Overlay('/home/xilinx/jupyter_notebooks/MNIST/vivado/bnn_top.bit')\n",
+    "    print(\"Bitstream loaded successfully!\")\n",
+    "    print(f\"\\nOverlay contains: {bnn_overlay.ip_dict.keys()}\")\n",
+    "except Exception as e:\n",
+    "    print(f\"Error loading bitstream: {e}\")\n",
+    "    # Without an overlay nothing below can run; re-raise so the cell stops here\n",
+    "    # instead of failing later with an unrelated NameError on bnn_overlay.\n",
+    "    raise\n",
+    "\n",
+    "# DMA and IP Setup\n",
+    "try:\n",
+    "    # Get BNN IP\n",
+    "    bnn_ip = bnn_overlay.bnn_top_0\n",
+    "    print(f\"   BNN IP found: {bnn_ip}\")\n",
+    "    # Get DMA and its two channels\n",
+    "    dma = bnn_overlay.axi_dma\n",
+    "    dma_send = dma.sendchannel\n",
+    "    dma_recv = dma.recvchannel\n",
+    "    print(f\"      DMA found: {dma}\")\n",
+    "    print(f\"   Send channel: {dma_send}\")\n",
+    "    print(f\"Receive channel: {dma_recv}\")\n",
+    "except AttributeError as e:\n",
+    "    print(f\"✗ Error accessing IP blocks: {e}\")\n",
+    "    print(\"\\nAvailable IPs:\", list(bnn_overlay.ip_dict.keys()))\n",
+    "    print(\"\\nMake sure your Vivado block design includes:\")\n",
+    "    print(\"  - bnn_top_0 (your HLS IP)\")\n",
+    "    # The attribute accessed above is axi_dma, so the hint must name the same instance.\n",
+    "    print(\"  - axi_dma (AXI DMA)\")\n",
+    "    raise"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "39f8ba6b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "L1: 32768 bytes (was 25088, now 32768)\n",
+      "L2: 8192 bytes\n",
+      "L3: 320 bytes\n",
+      "L1: 32768 bytes at 0x780b0000\n",
+      "L2: 8192 bytes at 0x7808c000\n",
+      "L3: 320 bytes at 0x7809c000\n",
+      "Weights loaded.\n",
+      "Send running: True\n",
+      "Recv running: True\n",
+      "Expected: [6, -88, 14, 12, -104, -12, 8, -30, 98, 58]  -> class 8\n",
+      "Got:      [6, -88, 14, 12, -104, -12, 8, -30, 98, 58]  -> class 8\n",
+      "\n",
+      "GOLDEN TEST PASSED\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load weights into PL BRAM\n",
+    "# Directory the fc1/fc2/fc3 weight .npy files are read from.\n",
+    "WEIGHT_DIR = \"/home/xilinx/jupyter_notebooks/MNIST/weights\"\n",
+    "\n",
+    "# Register offsets from xbnn_top_hw.h\n",
+    "# This notebook writes 0x01 to AP_CTRL to start the kernel and polls bit 1 (0x02) for completion.\n",
+    "AP_CTRL            = 0x00\n",
+    "L1_WEIGHTS_ADDR    = 0x10   # l1_weights[31:0]  (gmem0 phys addr, low)\n",
+    "L1_WEIGHTS_ADDR_HI = 0x14   # l1_weights[63:32] (gmem0 phys addr, high)\n",
+    "L2_WEIGHTS_ADDR    = 0x1C   # l2_weights[31:0]  (gmem1 phys addr, low)\n",
+    "L2_WEIGHTS_ADDR_HI = 0x20   # l2_weights[63:32] (gmem1 phys addr, high)\n",
+    "L3_WEIGHTS_ADDR    = 0x28   # l3_weights[31:0]  (gmem2 phys addr, low)\n",
+    "L3_WEIGHTS_ADDR_HI = 0x2C   # l3_weights[63:32] (gmem2 phys addr, high)\n",
+    "LOAD_WEIGHTS_OFF   = 0x34   # load_weights[0], bit 0\n",
+    "\n",
+    "# Number of 32-bit words sent over DMA per image.\n",
+    "INPUT_STREAM_WORDS = 25     # ceil(784 / 32)\n",
+    "\n",
+    "def write_64bit_reg(ip, lo_off, hi_off, value):\n",
+    "    \"\"\"Write a 64-bit value as two 32-bit register writes, low word first.\n",
+    "\n",
+    "    Args:\n",
+    "        ip: object exposing write(offset, value).\n",
+    "        lo_off: register offset that receives bits [31:0] of value.\n",
+    "        hi_off: register offset that receives bits [63:32] of value.\n",
+    "        value: integer to split across the two registers.\n",
+    "    \"\"\"\n",
+    "    word_mask = 0xFFFFFFFF\n",
+    "    low_word = value & word_mask\n",
+    "    high_word = (value >> 32) & word_mask\n",
+    "    ip.write(lo_off, low_word)\n",
+    "    ip.write(hi_off, high_word)\n",
+    "\n",
+    "def pack_weights_to_bytes_aligned(weight_array, row_width_bits, bus_width_bits):\n",
+    "    \"\"\"Pack signed weights to one bit each, one zero-padded row per neuron.\n",
+    "\n",
+    "    A weight below zero is stored as bit 1, anything else as bit 0. Bits are\n",
+    "    packed LSB-first within each byte, and each neuron's row starts at a\n",
+    "    multiple of the stride; the unused tail of every row stays zero.\n",
+    "\n",
+    "    Args:\n",
+    "        weight_array: 2-D array-like, one row per neuron, with at least\n",
+    "            row_width_bits entries per row.\n",
+    "        row_width_bits: number of weights taken from each row.\n",
+    "        bus_width_bits: m_axi bus width in bits, used as the minimum row stride.\n",
+    "\n",
+    "    Returns:\n",
+    "        1-D np.uint8 array of length n_neurons * stride, where stride is\n",
+    "        max(ceil(row_width_bits / 8), ceil(bus_width_bits / 8)).\n",
+    "\n",
+    "    Raises:\n",
+    "        IndexError: if the rows hold fewer than row_width_bits weights.\n",
+    "    \"\"\"\n",
+    "    weights = np.asarray(weight_array)\n",
+    "    n_neurons = weights.shape[0]\n",
+    "    row_bytes = (row_width_bits + 7) // 8\n",
+    "    aligned_bytes = (bus_width_bits + 7) // 8\n",
+    "    # Use the larger of the two for stride\n",
+    "    stride = max(row_bytes, aligned_bytes)\n",
+    "\n",
+    "    padded_rows = np.zeros((n_neurons, stride), dtype=np.uint8)\n",
+    "    if n_neurons == 0:\n",
+    "        return padded_rows.ravel()\n",
+    "    if weights.ndim != 2 or weights.shape[1] < row_width_bits:\n",
+    "        raise IndexError(\n",
+    "            f\"weight_array must be 2-D with at least {row_width_bits} columns, \"\n",
+    "            f\"got shape {weights.shape}\"\n",
+    "        )\n",
+    "\n",
+    "    # bitorder='little' puts weight i of a row at bit (i % 8) of byte (i // 8),\n",
+    "    # which is the layout the per-bit loop used to build by hand.\n",
+    "    sign_bits = weights[:, :row_width_bits] < 0\n",
+    "    padded_rows[:, :row_bytes] = np.packbits(sign_bits, axis=1, bitorder='little')\n",
+    "    return padded_rows.ravel()\n",
+    "\n",
+    "# Load the trained weights first: the packing calls below read l1_w/l2_w/l3_w,\n",
+    "# so loading them any later raises NameError on a fresh kernel.\n",
+    "l1_w = np.load(f\"{WEIGHT_DIR}/fc1_weights.npy\")\n",
+    "l2_w = np.load(f\"{WEIGHT_DIR}/fc2_weights.npy\")\n",
+    "l3_w = np.load(f\"{WEIGHT_DIR}/fc3_weights.npy\")\n",
+    "\n",
+    "# L1: 784-bit rows, 1024-bit bus (from synthesis: 784 -> 1024)\n",
+    "l1_packed = pack_weights_to_bytes_aligned(l1_w, 784, 1024)\n",
+    "# L2: 256-bit rows, 256-bit bus (from synthesis: 256 -> 256, no widening)\n",
+    "l2_packed = pack_weights_to_bytes_aligned(l2_w, 256, 256)\n",
+    "# L3: 256-bit rows, 256-bit bus (from synthesis: 256 -> 256, no widening)\n",
+    "l3_packed = pack_weights_to_bytes_aligned(l3_w, 256, 256)\n",
+    "\n",
+    "print(f\"L1: {len(l1_packed)} bytes (was 25088, now {256*128})\")\n",
+    "print(f\"L2: {len(l2_packed)} bytes\")\n",
+    "print(f\"L3: {len(l3_packed)} bytes\")\n",
+    "\n",
+    "# Load overlay (programs the PL)\n",
+    "bnn_overlay = Overlay('/home/xilinx/jupyter_notebooks/MNIST/vivado/bnn_top.bit')\n",
+    "bnn_ip = bnn_overlay.bnn_top_0\n",
+    "dma = bnn_overlay.axi_dma\n",
+    "\n",
+    "# Copy the packed weights into buffers allocated with pynq.allocate; their\n",
+    "# physical addresses are handed to the IP below.\n",
+    "l1_buf = allocate(shape=(len(l1_packed),), dtype=np.uint8)\n",
+    "l2_buf = allocate(shape=(len(l2_packed),), dtype=np.uint8)\n",
+    "l3_buf = allocate(shape=(len(l3_packed),), dtype=np.uint8)\n",
+    "np.copyto(l1_buf, l1_packed)\n",
+    "np.copyto(l2_buf, l2_packed)\n",
+    "np.copyto(l3_buf, l3_packed)\n",
+    "\n",
+    "print(f\"L1: {len(l1_packed)} bytes at 0x{l1_buf.physical_address:08x}\")\n",
+    "print(f\"L2: {len(l2_packed)} bytes at 0x{l2_buf.physical_address:08x}\")\n",
+    "print(f\"L3: {len(l3_packed)} bytes at 0x{l3_buf.physical_address:08x}\")\n",
+    "\n",
+    "# Load weights into BRAM: set load_weights, pass the three buffer addresses, start the kernel\n",
+    "bnn_ip.write(LOAD_WEIGHTS_OFF, 1)\n",
+    "write_64bit_reg(bnn_ip, L1_WEIGHTS_ADDR, L1_WEIGHTS_ADDR_HI, l1_buf.physical_address)\n",
+    "write_64bit_reg(bnn_ip, L2_WEIGHTS_ADDR, L2_WEIGHTS_ADDR_HI, l2_buf.physical_address)\n",
+    "write_64bit_reg(bnn_ip, L3_WEIGHTS_ADDR, L3_WEIGHTS_ADDR_HI, l3_buf.physical_address)\n",
+    "bnn_ip.write(AP_CTRL, 0x01)\n",
+    "\n",
+    "# Poll AP_CTRL bit 1 for completion, but give up after a few seconds so a\n",
+    "# wedged design raises an error instead of hanging the kernel forever.\n",
+    "WEIGHT_LOAD_TIMEOUT_S = 5.0\n",
+    "weight_load_deadline = time.monotonic() + WEIGHT_LOAD_TIMEOUT_S\n",
+    "while (bnn_ip.read(AP_CTRL) & 0x02) == 0:\n",
+    "    if time.monotonic() > weight_load_deadline:\n",
+    "        raise TimeoutError(\n",
+    "            f\"BNN IP did not signal completion (AP_CTRL bit 1) within \"\n",
+    "            f\"{WEIGHT_LOAD_TIMEOUT_S} s while loading weights\"\n",
+    "        )\n",
+    "print(\"Weights loaded.\")\n",
+    "\n",
+    "# Refresh DMA without reprogramming\n",
+    "bnn_overlay = Overlay('/home/xilinx/jupyter_notebooks/MNIST/vivado/bnn_top.bit', download=False)\n",
+    "bnn_ip = bnn_overlay.bnn_top_0\n",
+    "dma = bnn_overlay.axi_dma\n",
+    "dma_send = dma.sendchannel\n",
+    "dma_recv = dma.recvchannel\n",
+    "print(f\"Send running: {dma_send.running}\")\n",
+    "print(f\"Recv running: {dma_recv.running}\")\n",
+    "\n",
+    "# Inference function\n",
+    "def run_inference(image_flat):\n",
+    "    \"\"\"Send one image through the accelerator and return (predicted, scores).\n",
+    "\n",
+    "    The first 784 entries of image_flat are packed one bit per pixel into\n",
+    "    INPUT_STREAM_WORDS 32-bit words (pixel i -> bit i % 32 of word i // 32).\n",
+    "    A pixel equal to 0 is sent as bit 1; bits past pixel 783 stay 0.\n",
+    "\n",
+    "    NOTE(review): the stored run of this notebook reports 15.68% accuracy while\n",
+    "    the software baselines are listed at 98.50%. The golden test uses an\n",
+    "    all-ones image, i.e. every input word is 0, so it does not exercise this\n",
+    "    bit polarity or ordering. Confirm the encoding against the HLS kernel.\n",
+    "\n",
+    "    Args:\n",
+    "        image_flat: 1-D array-like with at least 784 pixel values.\n",
+    "\n",
+    "    Returns:\n",
+    "        predicted: int index of the largest score.\n",
+    "        scores: np.ndarray of 10 int16 values copied out of the DMA buffer.\n",
+    "\n",
+    "    Raises:\n",
+    "        IndexError: if image_flat is not 1-D or has fewer than 784 entries.\n",
+    "    \"\"\"\n",
+    "    n_pixels = 784\n",
+    "    pixels = np.asarray(image_flat)\n",
+    "    if pixels.ndim != 1 or pixels.shape[0] < n_pixels:\n",
+    "        raise IndexError(\n",
+    "            f\"image_flat must be 1-D with at least {n_pixels} pixels, got shape {pixels.shape}\"\n",
+    "        )\n",
+    "\n",
+    "    # Vectorised equivalent of the per-bit loop: little bit order inside each\n",
+    "    # byte plus a little-endian uint32 view puts pixel i at bit i % 32 of word i // 32.\n",
+    "    stream_bits = np.zeros(INPUT_STREAM_WORDS * 32, dtype=np.uint8)\n",
+    "    stream_bits[:n_pixels] = pixels[:n_pixels] == 0\n",
+    "    stream_words = np.packbits(stream_bits, bitorder='little').view('<u4')\n",
+    "\n",
+    "    input_buf  = allocate(shape=(INPUT_STREAM_WORDS,), dtype=np.uint32)\n",
+    "    output_buf = allocate(shape=(10,), dtype=np.int16)\n",
+    "    try:\n",
+    "        input_buf[:] = stream_words\n",
+    "\n",
+    "        # Same order as before: clear load_weights, arm the receive channel,\n",
+    "        # start the kernel, then send the image.\n",
+    "        bnn_ip.write(LOAD_WEIGHTS_OFF, 0)\n",
+    "        dma_recv.transfer(output_buf)\n",
+    "        bnn_ip.write(AP_CTRL, 0x01)\n",
+    "        dma_send.transfer(input_buf)\n",
+    "\n",
+    "        dma_send.wait()\n",
+    "        dma_recv.wait()\n",
+    "\n",
+    "        # Copy into an ordinary ndarray so the result outlives the DMA buffer.\n",
+    "        scores = np.array(output_buf)\n",
+    "    finally:\n",
+    "        # Release both buffers explicitly, including when a transfer raises.\n",
+    "        input_buf.freebuffer()\n",
+    "        output_buf.freebuffer()\n",
+    "\n",
+    "    predicted = int(np.argmax(scores))\n",
+    "    return predicted, scores\n",
+    "\n",
+    "\n",
+    "# Golden test: an all-ones image must reproduce the expected scores below\n",
+    "test_image = np.ones(784, dtype=np.uint8)\n",
+    "predicted, scores = run_inference(test_image)\n",
+    "\n",
+    "expected = [6, -88, 14, 12, -104, -12, 8, -30, 98, 58]\n",
+    "print(f\"Expected: {expected}  -> class 8\")\n",
+    "print(f\"Got:      {list(scores)}  -> class {predicted}\")\n",
+    "\n",
+    "if list(scores) == expected and predicted == 8:\n",
+    "    # Was \"\\TEST PASSED\": an invalid escape that printed a literal backslash.\n",
+    "    print(\"\\nTEST PASSED\")\n",
+    "else:\n",
+    "    print(\"\\nFAILED\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "9cca4a09",
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# HW Implementation setup\n",
+    "def fpga_hardware_testbench(test_inputs, test_labels):\n",
+    "    \"\"\"\n",
+    "    Processes all MNIST test images one at a time through run_inference().\n",
+    "    Measures timing, accuracy, and throughput, prints a report and returns it.\n",
+    "\n",
+    "    Args:\n",
+    "        test_inputs: (N, 784) uint8 array, values 0 or 1\n",
+    "        test_labels: (N,) int array, values 0-9 (at least N entries)\n",
+    "\n",
+    "    Returns:\n",
+    "        results: Dictionary with 'accuracy', 'timing', 'per_image_stats'\n",
+    "                 and 'computational' sub-dictionaries\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: if test_inputs is empty or test_labels has fewer entries\n",
+    "                    than test_inputs\n",
+    "    \"\"\"\n",
+    "\n",
+    "    N = len(test_inputs)\n",
+    "\n",
+    "    # Validate before touching the hardware: an empty set would otherwise end in\n",
+    "    # a ZeroDivisionError after warm-up, and short labels in an IndexError mid-run.\n",
+    "    if N == 0:\n",
+    "        raise ValueError(\"test_inputs is empty; nothing to benchmark\")\n",
+    "    if len(test_labels) < N:\n",
+    "        raise ValueError(\n",
+    "            f\"test_labels has {len(test_labels)} entries but test_inputs has {N}\"\n",
+    "        )\n",
+    "\n",
+    "    # Warm-up: run a few inferences to stabilize DMA/cache behavior\n",
+    "    dummy_img = np.ones(784, dtype=np.uint8)\n",
+    "    for _ in range(10):\n",
+    "        run_inference(dummy_img)\n",
+    "    print(\"Warm-up complete\")\n",
+    "\n",
+    "    # TESTBENCH\n",
+    "    print(f\"\\nProcessing {N:,} test images...\")\n",
+    "    print(\"-\" * 70)\n",
+    "\n",
+    "    correct = 0\n",
+    "    individual_times = []\n",
+    "\n",
+    "    start_total = time.perf_counter()\n",
+    "\n",
+    "    for i in range(N):\n",
+    "        start_img = time.perf_counter()\n",
+    "        pred, _ = run_inference(test_inputs[i])\n",
+    "        end_img = time.perf_counter()\n",
+    "\n",
+    "        individual_times.append(end_img - start_img)\n",
+    "        if pred == test_labels[i]:\n",
+    "            correct += 1\n",
+    "\n",
+    "        # Progress update every 1000 images\n",
+    "        if (i + 1) % 1000 == 0:\n",
+    "            elapsed = time.perf_counter() - start_total\n",
+    "            print(f\"  [{i+1:>5}/{N}]  acc so far: {100*correct/(i+1):.2f}%  \"\n",
+    "                  f\"elapsed: {elapsed:.1f}s\")\n",
+    "\n",
+    "    end_total = time.perf_counter()\n",
+    "\n",
+    "    # METRICS\n",
+    "    total = N  # every image is processed exactly once\n",
+    "    total_time = end_total - start_total\n",
+    "    accuracy = 100.0 * correct / total\n",
+    "    # Wall-clock averages include the loop bookkeeping and progress prints ...\n",
+    "    avg_time_per_image_ms = (total_time / total) * 1000\n",
+    "    throughput_images_per_sec = total / total_time\n",
+    "\n",
+    "    # ... while the per-image statistics only time the run_inference() calls.\n",
+    "    individual_times = np.array(individual_times)\n",
+    "    avg_img_time_ms = np.mean(individual_times) * 1000\n",
+    "    min_img_time_ms = np.min(individual_times) * 1000\n",
+    "    max_img_time_ms = np.max(individual_times) * 1000\n",
+    "\n",
+    "    # Computational metrics (same op count as SW baseline)\n",
+    "    ops_fc1 = 784 * 256\n",
+    "    ops_fc2 = 256 * 256\n",
+    "    ops_fc3 = 256 * 10\n",
+    "    total_ops_per_inference = ops_fc1 + ops_fc2 + ops_fc3  # 268,800\n",
+    "    total_ops_executed = total_ops_per_inference * total\n",
+    "    ops_per_second = total_ops_per_inference * throughput_images_per_sec\n",
+    "    gops_per_second = ops_per_second / 1e9\n",
+    "\n",
+    "    print(\"\\n\" + \"=\" * 70)\n",
+    "    print(\"RESULTS\")\n",
+    "    print(\"=\" * 70)\n",
+    "\n",
+    "    print(\"\\n1. ACCURACY RESULTS:\")\n",
+    "    print(f\"   Total images processed:  {total:,}\")\n",
+    "    print(f\"   Correct predictions:     {correct:,}\")\n",
+    "    print(f\"   Incorrect predictions:   {total - correct:,}\")\n",
+    "    print(f\"   Accuracy:                {accuracy:.2f}%\")\n",
+    "\n",
+    "    print(\"\\n2. TIMING RESULTS:\")\n",
+    "    print(f\"   Total execution time:    {total_time:.4f} seconds\")\n",
+    "    print(f\"   Average time per image:  {avg_time_per_image_ms:.4f} ms\")\n",
+    "    print(f\"   Throughput:              {throughput_images_per_sec:.2f} images/second\")\n",
+    "\n",
+    "    print(\"\\n3. PER-IMAGE TIMING STATISTICS:\")\n",
+    "    print(f\"   Total images:            {total:,}\")\n",
+    "    print(f\"   Average image time:      {avg_img_time_ms:.4f} ms\")\n",
+    "    print(f\"   Min image time:          {min_img_time_ms:.4f} ms\")\n",
+    "    print(f\"   Max image time:          {max_img_time_ms:.4f} ms\")\n",
+    "\n",
+    "    print(\"\\n4. COMPUTATIONAL ANALYSIS:\")\n",
+    "    print(f\"   Operations per image:    {total_ops_per_inference:,}\")\n",
+    "    print(f\"   Total operations:        {total_ops_executed:,}\")\n",
+    "    print(f\"   Operations per second:   {ops_per_second:.2e}\")\n",
+    "    print(f\"   Throughput:              {gops_per_second:.4f} GOPS\")\n",
+    "\n",
+    "    results = {\n",
+    "        'accuracy': {\n",
+    "            'total_images': total,\n",
+    "            'correct': correct,\n",
+    "            'incorrect': total - correct,\n",
+    "            'accuracy_percent': float(accuracy)\n",
+    "        },\n",
+    "        'timing': {\n",
+    "            'total_time_sec': float(total_time),\n",
+    "            'avg_time_per_image_ms': float(avg_time_per_image_ms),\n",
+    "            'throughput_images_per_sec': float(throughput_images_per_sec)\n",
+    "        },\n",
+    "        'per_image_stats': {\n",
+    "            'avg_time_ms': float(avg_img_time_ms),\n",
+    "            'min_time_ms': float(min_img_time_ms),\n",
+    "            'max_time_ms': float(max_img_time_ms)\n",
+    "        },\n",
+    "        'computational': {\n",
+    "            'ops_per_image': total_ops_per_inference,\n",
+    "            'total_ops_executed': total_ops_executed,\n",
+    "            'ops_per_second': float(ops_per_second),\n",
+    "            'gops_per_second': float(gops_per_second)\n",
+    "        }\n",
+    "    }\n",
+    "\n",
+    "    return results\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "9d9742d6",
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Test data: (10000, 784), dtype=uint8\n",
+      "Labels:    (10000,), dtype=int64\n",
+      "Unique input values: [0 1]\n",
+      "Warm-up complete\n",
+      "\n",
+      "Processing 10,000 test images...\n",
+      "----------------------------------------------------------------------\n",
+      "  [ 1000/10000]  acc so far: 19.00%  elapsed: 14.3s\n",
+      "  [ 2000/10000]  acc so far: 18.40%  elapsed: 28.5s\n",
+      "  [ 3000/10000]  acc so far: 17.73%  elapsed: 42.8s\n",
+      "  [ 4000/10000]  acc so far: 17.62%  elapsed: 57.1s\n",
+      "  [ 5000/10000]  acc so far: 17.36%  elapsed: 71.3s\n",
+      "  [ 6000/10000]  acc so far: 17.02%  elapsed: 85.5s\n",
+      "  [ 7000/10000]  acc so far: 16.73%  elapsed: 99.7s\n",
+      "  [ 8000/10000]  acc so far: 16.16%  elapsed: 113.9s\n",
+      "  [ 9000/10000]  acc so far: 16.00%  elapsed: 128.1s\n",
+      "  [10000/10000]  acc so far: 15.68%  elapsed: 142.4s\n",
+      "\n",
+      "======================================================================\n",
+      "RESULTS\n",
+      "======================================================================\n",
+      "\n",
+      "1. ACCURACY RESULTS:\n",
+      "   Total images processed:  10,000\n",
+      "   Correct predictions:     1,568\n",
+      "   Incorrect predictions:   8,432\n",
+      "   Accuracy:                15.68%\n",
+      "\n",
+      "2. TIMING RESULTS:\n",
+      "   Total execution time:    142.3650 seconds\n",
+      "   Average time per image:  14.2365 ms\n",
+      "   Throughput:              70.24 images/second\n",
+      "\n",
+      "3. PER-IMAGE TIMING STATISTICS:\n",
+      "   Total images:            10,000\n",
+      "   Average image time:      14.2231 ms\n",
+      "   Min image time:          13.9881 ms\n",
+      "   Max image time:          19.3802 ms\n",
+      "\n",
+      "4. COMPUTATIONAL ANALYSIS:\n",
+      "   Operations per image:    268,800\n",
+      "   Total operations:        2,688,000,000\n",
+      "   Operations per second:   1.89e+07\n",
+      "   Throughput:              0.0189 GOPS\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load binarized test data (exported from PC)\n",
+    "DATA_DIR = \"/home/xilinx/jupyter_notebooks/MNIST/database\"\n",
+    "test_inputs = np.load(f\"{DATA_DIR}/test_inputs_binarized.npy\")  # (10000, 784) uint8\n",
+    "test_labels = np.load(f\"{DATA_DIR}/test_labels_full.npy\")       # (10000,) int64\n",
+    "\n",
+    "print(f\"Test data: {test_inputs.shape}, dtype={test_inputs.dtype}\")\n",
+    "print(f\"Labels:    {test_labels.shape}, dtype={test_labels.dtype}\")\n",
+    "print(f\"Unique input values: {np.unique(test_inputs)}\")\n",
+    "\n",
+    "# Catch a bad export here rather than partway through the long hardware run\n",
+    "assert test_inputs.ndim == 2 and test_inputs.shape[1] == 784, \"expected (N, 784) inputs\"\n",
+    "assert len(test_labels) == len(test_inputs), \"inputs and labels differ in length\"\n",
+    "assert np.isin(test_inputs, (0, 1)).all(), \"inputs must be binarized to 0/1\"\n",
+    "\n",
+    "# Run the testbench\n",
+    "fpga_results = fpga_hardware_testbench(test_inputs, test_labels)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "857fb949",
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Compare results\n",
+    "# Software baseline figures are hard-coded here; they are not measured in this notebook.\n",
+    "sw_cpu_latency_ms  = 0.1209\n",
+    "sw_cpu_throughput   = 8269.89\n",
+    "sw_cpu_gops         = 2.2229\n",
+    "\n",
+    "sw_gpu_latency_ms  = 0.1086\n",
+    "sw_gpu_throughput   = 9209.52\n",
+    "sw_gpu_gops         = 2.4755\n",
+    "\n",
+    "# FPGA figures come from the testbench results dictionary\n",
+    "fpga_timing        = fpga_results['timing']\n",
+    "fpga_latency_ms    = fpga_timing['avg_time_per_image_ms']\n",
+    "fpga_throughput    = fpga_timing['throughput_images_per_sec']\n",
+    "fpga_gops          = fpga_results['computational']['gops_per_second']\n",
+    "fpga_accuracy      = fpga_results['accuracy']['accuracy_percent']\n",
+    "\n",
+    "banner = \"=\" * 70\n",
+    "print(banner)\n",
+    "print(\"FPGA vs SOFTWARE BASELINE COMPARISON\")\n",
+    "print(banner)\n",
+    "\n",
+    "print(f\"\\n{'Platform':<35} {'Latency(ms)':<14} {'Throughput':<16} {'GOPS':<10} {'Accuracy'}\")\n",
+    "print(\"-\" * 90)\n",
+    "\n",
+    "# One table row per platform: (name, latency ms, images/s, GOPS, accuracy text)\n",
+    "comparison_rows = [\n",
+    "    ('PC CPU (PyTorch, x86)', sw_cpu_latency_ms, sw_cpu_throughput, sw_cpu_gops, '98.50%'),\n",
+    "    ('PC GPU (PyTorch, RTX 5070 Ti)', sw_gpu_latency_ms, sw_gpu_throughput, sw_gpu_gops, '98.50%'),\n",
+    "    ('FPGA (XCZU3EG, this design)', fpga_latency_ms, fpga_throughput, fpga_gops, f\"{fpga_accuracy:.2f}%\"),\n",
+    "]\n",
+    "for platform, latency_ms, throughput, gops, accuracy_text in comparison_rows:\n",
+    "    print(f\"{platform:<35} {latency_ms:<14.4f} {throughput:<16.2f} {gops:<10.4f} {accuracy_text}\")\n",
+    "\n",
+    "# Ratio of software latency to FPGA latency; a value below 1 means the FPGA path is slower\n",
+    "speedup_vs_cpu = sw_cpu_latency_ms / fpga_latency_ms\n",
+    "speedup_vs_gpu = sw_gpu_latency_ms / fpga_latency_ms\n",
+    "print(f\"\\nSpeedup vs PC CPU:  {speedup_vs_cpu:.2f}x\")\n",
+    "print(f\"Speedup vs PC GPU:  {speedup_vs_gpu:.2f}x\")\n",
+    "\n",
+    "# NOTE(review): 322 cycles at 300 MHz is about 1.07 us, not 0.97 us. Confirm\n",
+    "# which of the three numbers in the message below is the stale one.\n",
+    "print(\"\\nNote: FPGA latency includes Python overhead + DMA transfers + kernel.\")\n",
+    "print(\"      Kernel-only latency: ~0.97 us (322 cycles @ 300 MHz)\")\n",
+    "print(banner)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "08c60a03",
+   "metadata": {
+    "vscode": {
+     "languageId": "python"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Free the three weight buffers, in the same order as before\n",
+    "for weight_buf in (l1_buf, l2_buf, l3_buf):\n",
+    "    weight_buf.freebuffer()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ed7a1493",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "69f1a273",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "026bf6f2",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}