diff --git a/Project/phase4_testing/notebooks/hw_implementation (2).ipynb b/Project/phase4_testing/notebooks/hw_implementation (2).ipynb new file mode 100644 index 0000000..dbf767f --- /dev/null +++ b/Project/phase4_testing/notebooks/hw_implementation (2).ipynb @@ -0,0 +1,558 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "id": "8efd56d1", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Imports successful\n" + ] + } + ], + "source": [ + "# Import needed files\n", + "from pynq import Overlay, allocate\n", + "import numpy as np\n", + "import time\n", + "from pathlib import Path\n", + "\n", + "print(\"Imports successful\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1af395d4", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bitstream loaded successfully!\n", + "\n", + "Overlay contains: dict_keys(['bnn_top_0', 'axi_dma', 'zynq_ultra_ps_e_0'])\n", + " BNN IP found: \n", + " DMA found: \n", + " Send channel: \n", + "Receive channel: \n" + ] + } + ], + "source": [ + "# Check to see if vivado import is working\n", + "try: \n", + " bnn_overlay = Overlay('/home/xilinx/jupyter_notebooks/MNIST/vivado/bnn_top.bit')\n", + " print(\"Bitstream loaded successfully!\")\n", + " print(f\"\\nOverlay contains: {bnn_overlay.ip_dict.keys()}\")\n", + "except Exception as e:\n", + " print(f\"Error loading bitstream: {e}\")\n", + "# DMA and IP Setup\n", + "try:\n", + " # Get BNN IP\n", + " bnn_ip = bnn_overlay.bnn_top_0\n", + " print(f\" BNN IP found: {bnn_ip}\")\n", + " # Get DMA\n", + " dma = bnn_overlay.axi_dma\n", + " dma_send = dma.sendchannel\n", + " dma_recv = dma.recvchannel\n", + " print(f\" DMA found: {dma}\")\n", + " print(f\" Send channel: {dma_send}\")\n", + " print(f\"Receive channel: {dma_recv}\")\n", + "except AttributeError as 
e:\n", + " print(f\"✗ Error accessing IP blocks: {e}\")\n", + " print(\"\\nAvailable IPs:\", list(bnn_overlay.ip_dict.keys()))\n", + " print(\"\\nMake sure your Vivado block design includes:\")\n", + " print(\" - bnn_top_0 (your HLS IP)\")\n", + " print(\" - axi_dma (AXI DMA)\")\n", + " raise" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "39f8ba6b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "L1: 32768 bytes (was 25088, now 32768)\n", + "L2: 8192 bytes\n", + "L3: 320 bytes\n", + "L1: 32768 bytes at 0x780b0000\n", + "L2: 8192 bytes at 0x7808c000\n", + "L3: 320 bytes at 0x7809c000\n", + "Weights loaded.\n", + "Send running: True\n", + "Recv running: True\n", + "Expected: [6, -88, 14, 12, -104, -12, 8, -30, 98, 58] -> class 8\n", + "Got: [6, -88, 14, 12, -104, -12, 8, -30, 98, 58] -> class 8\n", + "\n", + "GOLDEN TEST PASSED\n" + ] + } + ], + "source": [ + "# Load weights into PL BRAM\n", + "WEIGHT_DIR = \"/home/xilinx/jupyter_notebooks/MNIST/weights\"\n", + "\n", + "# Register offsets from xbnn_top_hw.h\n", + "AP_CTRL = 0x00\n", + "L1_WEIGHTS_ADDR = 0x10 # l1_weights[31:0] (gmem0 phys addr, low)\n", + "L1_WEIGHTS_ADDR_HI = 0x14 # l1_weights[63:32] (gmem0 phys addr, high)\n", + "L2_WEIGHTS_ADDR = 0x1C # l2_weights[31:0] (gmem1 phys addr, low)\n", + "L2_WEIGHTS_ADDR_HI = 0x20 # l2_weights[63:32] (gmem1 phys addr, high)\n", + "L3_WEIGHTS_ADDR = 0x28 # l3_weights[31:0] (gmem2 phys addr, low)\n", + "L3_WEIGHTS_ADDR_HI = 0x2C # l3_weights[63:32] (gmem2 phys addr, high)\n", + "LOAD_WEIGHTS_OFF = 0x34 # load_weights[0], bit 0\n", + "\n", + "INPUT_STREAM_WORDS = 25 # ceil(784 / 32)\n", + "\n", + "def write_64bit_reg(ip, lo_off, hi_off, value):\n", + " ip.write(lo_off, value & 0xFFFFFFFF)\n", + " ip.write(hi_off, (value >> 32) & 0xFFFFFFFF)\n", + "\n", + "def pack_weights_to_bytes_aligned(weight_array, row_width_bits, bus_width_bits):\n", + " \"\"\"Pack weights with alignment padding for m_axi bus 
width.\"\"\"\n", + " n_neurons = weight_array.shape[0]\n", + " row_bytes = (row_width_bits + 7) // 8 # 98 for 784 bits\n", + " aligned_bytes = (bus_width_bits + 7) // 8 # 128 for 1024 bits\n", + " # Use the larger of the two for stride\n", + " stride = max(row_bytes, aligned_bytes)\n", + " \n", + " buf = np.zeros(n_neurons * stride, dtype=np.uint8)\n", + " for n in range(n_neurons):\n", + " for i in range(row_width_bits):\n", + " if weight_array[n][i] < 0:\n", + " byte_idx = n * stride + (i // 8)\n", + " bit_idx = i % 8\n", + " buf[byte_idx] |= (1 << bit_idx)\n", + " return buf\n", + "\n", + "# L1: 784-bit rows, 1024-bit bus (from synthesis: 784 -> 1024)\n", + "l1_packed = pack_weights_to_bytes_aligned(l1_w, 784, 1024)\n", + "# L2: 256-bit rows, 256-bit bus (from synthesis: 256 -> 256, no widening)\n", + "l2_packed = pack_weights_to_bytes_aligned(l2_w, 256, 256)\n", + "# L3: 256-bit rows, 256-bit bus (from synthesis: 256 -> 256, no widening)\n", + "l3_packed = pack_weights_to_bytes_aligned(l3_w, 256, 256)\n", + "\n", + "print(f\"L1: {len(l1_packed)} bytes (was 25088, now {256*128})\")\n", + "print(f\"L2: {len(l2_packed)} bytes\")\n", + "print(f\"L3: {len(l3_packed)} bytes\")\n", + "\n", + "# Load overlay\n", + "bnn_overlay = Overlay('/home/xilinx/jupyter_notebooks/MNIST/vivado/bnn_top.bit')\n", + "bnn_ip = bnn_overlay.bnn_top_0\n", + "dma = bnn_overlay.axi_dma\n", + "\n", + "# Load weights FRESH\n", + "l1_w = np.load(f\"{WEIGHT_DIR}/fc1_weights.npy\")\n", + "l2_w = np.load(f\"{WEIGHT_DIR}/fc2_weights.npy\")\n", + "l3_w = np.load(f\"{WEIGHT_DIR}/fc3_weights.npy\")\n", + "\n", + "l1_buf = allocate(shape=(len(l1_packed),), dtype=np.uint8)\n", + "l2_buf = allocate(shape=(len(l2_packed),), dtype=np.uint8)\n", + "l3_buf = allocate(shape=(len(l3_packed),), dtype=np.uint8)\n", + "np.copyto(l1_buf, l1_packed)\n", + "np.copyto(l2_buf, l2_packed)\n", + "np.copyto(l3_buf, l3_packed)\n", + "\n", + "print(f\"L1: {len(l1_packed)} bytes at 0x{l1_buf.physical_address:08x}\")\n", + 
"print(f\"L2: {len(l2_packed)} bytes at 0x{l2_buf.physical_address:08x}\")\n", + "print(f\"L3: {len(l3_packed)} bytes at 0x{l3_buf.physical_address:08x}\")\n", + "\n", + "# Load weights into BRAM\n", + "bnn_ip.write(LOAD_WEIGHTS_OFF, 1)\n", + "write_64bit_reg(bnn_ip, L1_WEIGHTS_ADDR, L1_WEIGHTS_ADDR_HI, l1_buf.physical_address)\n", + "write_64bit_reg(bnn_ip, L2_WEIGHTS_ADDR, L2_WEIGHTS_ADDR_HI, l2_buf.physical_address)\n", + "write_64bit_reg(bnn_ip, L3_WEIGHTS_ADDR, L3_WEIGHTS_ADDR_HI, l3_buf.physical_address)\n", + "bnn_ip.write(AP_CTRL, 0x01)\n", + "while (bnn_ip.read(AP_CTRL) & 0x02) == 0:\n", + " pass\n", + "print(\"Weights loaded.\")\n", + "\n", + "# Refresh DMA without reprogramming\n", + "bnn_overlay = Overlay('/home/xilinx/jupyter_notebooks/MNIST/vivado/bnn_top.bit', download=False)\n", + "bnn_ip = bnn_overlay.bnn_top_0\n", + "dma = bnn_overlay.axi_dma\n", + "dma_send = dma.sendchannel\n", + "dma_recv = dma.recvchannel\n", + "print(f\"Send running: {dma_send.running}\")\n", + "print(f\"Recv running: {dma_recv.running}\")\n", + "\n", + "# Inference function\n", + "def run_inference(image_flat):\n", + " input_buf = allocate(shape=(INPUT_STREAM_WORDS,), dtype=np.uint32)\n", + " output_buf = allocate(shape=(10,), dtype=np.int16)\n", + "\n", + " for w in range(INPUT_STREAM_WORDS):\n", + " val = 0\n", + " for b in range(32):\n", + " idx = w * 32 + b\n", + " if idx < 784 and image_flat[idx] == 0:\n", + " val |= (1 << b)\n", + " input_buf[w] = val\n", + "\n", + " bnn_ip.write(LOAD_WEIGHTS_OFF, 0)\n", + " dma_recv.transfer(output_buf)\n", + " bnn_ip.write(AP_CTRL, 0x01)\n", + " dma_send.transfer(input_buf)\n", + "\n", + " dma_send.wait()\n", + " dma_recv.wait()\n", + "\n", + " scores = output_buf.copy()\n", + " predicted = int(np.argmax(scores))\n", + "\n", + " del input_buf, output_buf\n", + " return predicted, scores\n", + "\n", + "\n", + "test_image = np.ones(784, dtype=np.uint8)\n", + "predicted, scores = run_inference(test_image)\n", + "\n", + "expected = [6, 
-88, 14, 12, -104, -12, 8, -30, 98, 58]\n", + "print(f\"Expected: {expected} -> class 8\")\n", + "print(f\"Got: {list(scores)} -> class {predicted}\")\n", + "\n", + "if list(scores) == expected and predicted == 8:\n", + " print(\"\\nTEST PASSED\")\n", + "else:\n", + " print(\"\\nFAILED\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "9cca4a09", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# HW Implementation setup\n", + "def fpga_hardware_testbench(test_inputs, test_labels):\n", + " \"\"\"\n", + " Processes all MNIST test images one at a time.\n", + " Measures timing, accuracy, and throughput.\n", + "\n", + " Args:\n", + " test_inputs: (N, 784) uint8 array, values 0 or 1\n", + " test_labels: (N,) int array, values 0-9\n", + "\n", + " Returns:\n", + " results: Dictionary with all metrics\n", + " \"\"\"\n", + "\n", + " N = len(test_inputs)\n", + "\n", + " # Warm-up: run a few inferences to stabilize DMA/cache behavior\n", + " dummy_img = np.ones(784, dtype=np.uint8)\n", + " for _ in range(10):\n", + " run_inference(dummy_img)\n", + " print(\"Warm-up complete\")\n", + "\n", + " # TESTBENCH\n", + " print(f\"\\nProcessing {N:,} test images...\")\n", + " print(\"-\" * 70)\n", + "\n", + " correct = 0\n", + " total = 0\n", + " individual_times = []\n", + "\n", + " start_total = time.perf_counter()\n", + "\n", + " for i in range(N):\n", + " start_img = time.perf_counter()\n", + " pred, _ = run_inference(test_inputs[i])\n", + " end_img = time.perf_counter()\n", + "\n", + " individual_times.append(end_img - start_img)\n", + " total += 1\n", + " if pred == test_labels[i]:\n", + " correct += 1\n", + "\n", + " # Progress update (matches batch reporting cadence)\n", + " if (i + 1) % 1000 == 0:\n", + " elapsed = time.perf_counter() - start_total\n", + " print(f\" [{i+1:>5}/{N}] acc so far: {100*correct/(i+1):.2f}% \"\n", + " f\"elapsed: {elapsed:.1f}s\")\n", + "\n", + " end_total = 
time.perf_counter()\n", + "\n", + " # METRICS\n", + " total_time = end_total - start_total\n", + " accuracy = 100.0 * correct / total\n", + " avg_time_per_image_ms = (total_time / total) * 1000\n", + " throughput_images_per_sec = total / total_time\n", + "\n", + " individual_times = np.array(individual_times)\n", + " avg_img_time_ms = np.mean(individual_times) * 1000\n", + " min_img_time_ms = np.min(individual_times) * 1000\n", + " max_img_time_ms = np.max(individual_times) * 1000\n", + "\n", + " # Computational metrics (same op count as SW baseline)\n", + " ops_fc1 = 784 * 256\n", + " ops_fc2 = 256 * 256\n", + " ops_fc3 = 256 * 10\n", + " total_ops_per_inference = ops_fc1 + ops_fc2 + ops_fc3 # 268,800\n", + " total_ops_executed = total_ops_per_inference * total\n", + " ops_per_second = total_ops_per_inference * throughput_images_per_sec\n", + " gops_per_second = ops_per_second / 1e9\n", + "\n", + " print(\"\\n\" + \"=\" * 70)\n", + " print(\"RESULTS\")\n", + " print(\"=\" * 70)\n", + "\n", + " print(\"\\n1. ACCURACY RESULTS:\")\n", + " print(f\" Total images processed: {total:,}\")\n", + " print(f\" Correct predictions: {correct:,}\")\n", + " print(f\" Incorrect predictions: {total - correct:,}\")\n", + " print(f\" Accuracy: {accuracy:.2f}%\")\n", + "\n", + " print(\"\\n2. TIMING RESULTS:\")\n", + " print(f\" Total execution time: {total_time:.4f} seconds\")\n", + " print(f\" Average time per image: {avg_time_per_image_ms:.4f} ms\")\n", + " print(f\" Throughput: {throughput_images_per_sec:.2f} images/second\")\n", + "\n", + " print(\"\\n3. PER-IMAGE TIMING STATISTICS:\")\n", + " print(f\" Total images: {total:,}\")\n", + " print(f\" Average image time: {avg_img_time_ms:.4f} ms\")\n", + " print(f\" Min image time: {min_img_time_ms:.4f} ms\")\n", + " print(f\" Max image time: {max_img_time_ms:.4f} ms\")\n", + "\n", + " print(\"\\n4. 
COMPUTATIONAL ANALYSIS:\")\n", + " print(f\" Operations per image: {total_ops_per_inference:,}\")\n", + " print(f\" Total operations: {total_ops_executed:,}\")\n", + " print(f\" Operations per second: {ops_per_second:.2e}\")\n", + " print(f\" Throughput: {gops_per_second:.4f} GOPS\")\n", + "\n", + " results = {\n", + " 'accuracy': {\n", + " 'total_images': total,\n", + " 'correct': correct,\n", + " 'incorrect': total - correct,\n", + " 'accuracy_percent': float(accuracy)\n", + " },\n", + " 'timing': {\n", + " 'total_time_sec': float(total_time),\n", + " 'avg_time_per_image_ms': float(avg_time_per_image_ms),\n", + " 'throughput_images_per_sec': float(throughput_images_per_sec)\n", + " },\n", + " 'per_image_stats': {\n", + " 'avg_time_ms': float(avg_img_time_ms),\n", + " 'min_time_ms': float(min_img_time_ms),\n", + " 'max_time_ms': float(max_img_time_ms)\n", + " },\n", + " 'computational': {\n", + " 'ops_per_image': total_ops_per_inference,\n", + " 'total_ops_executed': total_ops_executed,\n", + " 'ops_per_second': float(ops_per_second),\n", + " 'gops_per_second': float(gops_per_second)\n", + " }\n", + " }\n", + "\n", + " return results\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "9d9742d6", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test data: (10000, 784), dtype=uint8\n", + "Labels: (10000,), dtype=int64\n", + "Unique input values: [0 1]\n", + "Warm-up complete\n", + "\n", + "Processing 10,000 test images...\n", + "----------------------------------------------------------------------\n", + " [ 1000/10000] acc so far: 19.00% elapsed: 14.3s\n", + " [ 2000/10000] acc so far: 18.40% elapsed: 28.5s\n", + " [ 3000/10000] acc so far: 17.73% elapsed: 42.8s\n", + " [ 4000/10000] acc so far: 17.62% elapsed: 57.1s\n", + " [ 5000/10000] acc so far: 17.36% elapsed: 71.3s\n", + " [ 6000/10000] acc so far: 17.02% elapsed: 85.5s\n", + " [ 7000/10000] acc so 
far: 16.73% elapsed: 99.7s\n", + " [ 8000/10000] acc so far: 16.16% elapsed: 113.9s\n", + " [ 9000/10000] acc so far: 16.00% elapsed: 128.1s\n", + " [10000/10000] acc so far: 15.68% elapsed: 142.4s\n", + "\n", + "======================================================================\n", + "RESULTS\n", + "======================================================================\n", + "\n", + "1. ACCURACY RESULTS:\n", + " Total images processed: 10,000\n", + " Correct predictions: 1,568\n", + " Incorrect predictions: 8,432\n", + " Accuracy: 15.68%\n", + "\n", + "2. TIMING RESULTS:\n", + " Total execution time: 142.3650 seconds\n", + " Average time per image: 14.2365 ms\n", + " Throughput: 70.24 images/second\n", + "\n", + "3. PER-IMAGE TIMING STATISTICS:\n", + " Total images: 10,000\n", + " Average image time: 14.2231 ms\n", + " Min image time: 13.9881 ms\n", + " Max image time: 19.3802 ms\n", + "\n", + "4. COMPUTATIONAL ANALYSIS:\n", + " Operations per image: 268,800\n", + " Total operations: 2,688,000,000\n", + " Operations per second: 1.89e+07\n", + " Throughput: 0.0189 GOPS\n" + ] + } + ], + "source": [ + "# Load binarized test data (exported from PC)\n", + "test_inputs = np.load(f\"/home/xilinx/jupyter_notebooks/MNIST/database/test_inputs_binarized.npy\") # (10000, 784) uint8\n", + "test_labels = np.load(f\"/home/xilinx/jupyter_notebooks/MNIST/database/test_labels_full.npy\") # (10000,) int64\n", + "\n", + "print(f\"Test data: {test_inputs.shape}, dtype={test_inputs.dtype}\")\n", + "print(f\"Labels: {test_labels.shape}, dtype={test_labels.dtype}\")\n", + "print(f\"Unique input values: {np.unique(test_inputs)}\")\n", + "\n", + "# Run the testbench\n", + "fpga_results = fpga_hardware_testbench(test_inputs, test_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "857fb949", + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [], + "source": [ + "# Compare results\n", + "sw_cpu_latency_ms = 0.1209\n", + 
"sw_cpu_throughput = 8269.89\n", + "sw_cpu_gops = 2.2229\n", + "\n", + "sw_gpu_latency_ms = 0.1086\n", + "sw_gpu_throughput = 9209.52\n", + "sw_gpu_gops = 2.4755\n", + "\n", + "fpga_latency_ms = fpga_results['timing']['avg_time_per_image_ms']\n", + "fpga_throughput = fpga_results['timing']['throughput_images_per_sec']\n", + "fpga_gops = fpga_results['computational']['gops_per_second']\n", + "fpga_accuracy = fpga_results['accuracy']['accuracy_percent']\n", + "\n", + "print(\"=\" * 70)\n", + "print(\"FPGA vs SOFTWARE BASELINE COMPARISON\")\n", + "print(\"=\" * 70)\n", + "\n", + "print(f\"\\n{'Platform':<35} {'Latency(ms)':<14} {'Throughput':<16} {'GOPS':<10} {'Accuracy'}\")\n", + "print(\"-\" * 90)\n", + "print(f\"{'PC CPU (PyTorch, x86)':<35} {sw_cpu_latency_ms:<14.4f} {sw_cpu_throughput:<16.2f} {sw_cpu_gops:<10.4f} {'98.50%'}\")\n", + "print(f\"{'PC GPU (PyTorch, RTX 5070 Ti)':<35} {sw_gpu_latency_ms:<14.4f} {sw_gpu_throughput:<16.2f} {sw_gpu_gops:<10.4f} {'98.50%'}\")\n", + "print(f\"{'FPGA (XCZU3EG, this design)':<35} {fpga_latency_ms:<14.4f} {fpga_throughput:<16.2f} {fpga_gops:<10.4f} {fpga_accuracy:.2f}%\")\n", + "\n", + "print(f\"\\nSpeedup vs PC CPU: {sw_cpu_latency_ms / fpga_latency_ms:.2f}x\")\n", + "print(f\"Speedup vs PC GPU: {sw_gpu_latency_ms / fpga_latency_ms:.2f}x\")\n", + "\n", + "print(f\"\\nNote: FPGA latency includes Python overhead + DMA transfers + kernel.\")\n", + "print(f\" Kernel-only latency: ~0.97 us (322 cycles @ 300 MHz)\")\n", + "print(\"=\" * 70)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08c60a03", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# Free buffers\n", + "l1_buf.freebuffer()\n", + "l2_buf.freebuffer()\n", + "l3_buf.freebuffer()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed7a1493", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"69f1a273", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "026bf6f2", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}