From 98e55897525e7f59c9b622659f9a7c2600e4ce9f Mon Sep 17 00:00:00 2001
From: GuoningHuang <2985186238@qq.com>
Date: Wed, 25 Feb 2026 20:17:39 +0800
Subject: [PATCH 1/2] [Examples] add Python AOT runner for BuddyDeepSeekR1
 subgraph end-to-end inference

---
 examples/BuddyDeepSeekR1/README.md          |  21 +-
 .../BuddyDeepSeekR1/run-subgraphs-python.py | 391 ++++++++++++++++++
 2 files changed, 411 insertions(+), 1 deletion(-)
 create mode 100644 examples/BuddyDeepSeekR1/run-subgraphs-python.py

diff --git a/examples/BuddyDeepSeekR1/README.md b/examples/BuddyDeepSeekR1/README.md
index b3937cfb74..aa18cac7d4 100644
--- a/examples/BuddyDeepSeekR1/README.md
+++ b/examples/BuddyDeepSeekR1/README.md
@@ -116,7 +116,26 @@ $ ./bin/buddy-deepseek-r1-cli --interactive --no-stats
 - Use options like `--max-tokens` and `--eos-id` to constrain the generation length/termination. Add `--no-stats` when you want pure text output without the performance summary.
 - `--interactive` starts a REPL similar to `buddy-deepseek-r1-main.cpp`, handling one prompt and one response at a time. `--prompt` can act as a system prefix prepended to every user entry.
 
-6. Enjoy it!
+6. Python end-to-end inference by executing exported subgraphs
+
+This path keeps graph export in Buddy import flow, then uses Python to run prefill/decode MLIR subgraphs end to end.
+
+```bash
+# from buddy-mlir/
+python3 examples/BuddyDeepSeekR1/run-subgraphs-python.py \
+    --prompt "Hello, who are you?" \
+    --export-subgraphs
+```
+
+Useful options:
+
+- `--artifact-dir`: directory containing `forward_prefill.mlir`, `forward_decode.mlir`, `arg0.data` (default: `build/examples/BuddyDeepSeekR1`).
+- `--llvm-build-dir`: LLVM build directory that provides `libmlir_runner_utils.so` and related runtime libs (default: `llvm/build`).
+- `--model-path`: HF model id or local model path for tokenizer.
+- `--max-new-tokens`: generated token upper bound.
+- `--omp-num-threads`, `--omp-proc-bind`, `--omp-places`, `--kmp-affinity`: optional OpenMP tuning knobs.
+
+7. Enjoy it!
 
 ## How to run on RISC-V machine
 
diff --git a/examples/BuddyDeepSeekR1/run-subgraphs-python.py b/examples/BuddyDeepSeekR1/run-subgraphs-python.py
new file mode 100644
index 0000000000..9cbe8d13cf
--- /dev/null
+++ b/examples/BuddyDeepSeekR1/run-subgraphs-python.py
@@ -0,0 +1,391 @@
+#!/usr/bin/env python3
+# ===- run-subgraphs-python.py -----------------------------------------------
+#
+# Licensed under the Apache License, Version 2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ===----------------------------------------------------------------------===
+
+import argparse
+import ctypes
+import os
+import subprocess
+import time
+from pathlib import Path
+
+import numpy as np
+
+KV_COUNT = 56
+MAX_TOKEN_LENGTH = 1024
+EOS_TOKEN_ID = 151643
+PARAM_SIZE = 1777088064
+
+rt = None
+
+
+def _init_mlir_bindings():
+    global rt
+    from mlir import runtime as _rt
+
+    rt = _rt
+
+
+def _build_forward_output_descriptor():
+    class ForwardOutputDescriptor(ctypes.Structure):
+        _fields_ = [
+            (str(i), rt.make_nd_memref_descriptor(4, rt.as_ctype(np.float32)))
+            for i in range(KV_COUNT)
+        ] + [
+            (
+                str(KV_COUNT),
+                rt.make_nd_memref_descriptor(3, rt.as_ctype(np.float32)),
+            )
+        ]
+
+    return ForwardOutputDescriptor
+
+
+def _memref_desc(array: np.ndarray):
+    return rt.get_ranked_memref_descriptor(array)
+
+
+def _memref_to_numpy(desc):
+    return rt.ranked_memref_to_numpy(ctypes.pointer(desc))
+
+
+def _sample_next_token(
+    logits: np.ndarray,
+    temperature: float,
+    top_p: float,
+    rng: np.random.Generator,
+) -> int:
+    if temperature <= 0:
+        return int(np.argmax(logits))
+
+    scaled = logits.astype(np.float64) / temperature
+    scaled -= np.max(scaled)
+    probs = np.exp(scaled)
+    probs_sum = probs.sum()
+    if probs_sum <= 0:
+        return int(np.argmax(logits))
+    probs /= probs_sum
+
+    if 0 < top_p < 1.0:
+        sorted_idx = np.argsort(-probs)
+        sorted_probs = probs[sorted_idx]
+        cum = np.cumsum(sorted_probs)
+        keep_count = int(np.searchsorted(cum, top_p, side="left")) + 1
+        keep_count = max(1, keep_count)
+        keep_idx = sorted_idx[:keep_count]
+        keep_probs = probs[keep_idx]
+        keep_probs /= keep_probs.sum()
+        return int(rng.choice(keep_idx, p=keep_probs))
+
+    return int(rng.choice(len(probs), p=probs))
+
+
+def _export_artifacts(example_dir: Path, output_dir: Path):
+    cmd = [
+        "python3",
+        str(example_dir / "import-deepseek-r1.py"),
+        "--output-dir",
+        str(output_dir),
+    ]
+    subprocess.check_call(cmd)
+
+
+def _resolve_path(path_arg: Path, repo_root: Path) -> Path:
+    path_arg = path_arg.expanduser()
+    if path_arg.is_absolute():
+        return path_arg.resolve()
+    cwd_path = (Path.cwd() / path_arg).resolve()
+    if cwd_path.exists():
+        return cwd_path
+    return (repo_root / path_arg).resolve()
+
+
+def _apply_omp_env(args):
+    if args.omp_num_threads is not None:
+        os.environ["OMP_NUM_THREADS"] = str(args.omp_num_threads)
+    if args.omp_proc_bind is not None:
+        os.environ["OMP_PROC_BIND"] = args.omp_proc_bind
+    if args.omp_places is not None:
+        os.environ["OMP_PLACES"] = args.omp_places
+    if args.kmp_affinity is not None:
+        os.environ["KMP_AFFINITY"] = args.kmp_affinity
+
+
+def _ensure_aot_runtime_lib(artifact_dir: Path) -> Path:
+    so_path = artifact_dir / "libdeepseek_forward_runtime.so"
+    if so_path.exists():
+        return so_path
+
+    objs = [
+        artifact_dir / "forward_prefill.o",
+        artifact_dir / "forward_decode.o",
+        artifact_dir / "subgraph_prefill.o",
+        artifact_dir / "subgraph_decode.o",
+    ]
+    missing = [str(p) for p in objs if not p.exists()]
+    if missing:
+        raise FileNotFoundError(
+            "Missing object files required for AOT runtime:\n  "
+            + "\n  ".join(missing)
+        )
+
+    cmd = ["clang++", "-shared", *[str(p) for p in objs], "-o", str(so_path)]
+    subprocess.check_call(cmd)
+    return so_path
+
+
+def _run_aot(
+    artifact_dir: Path,
+    llvm_build_dir: Path,
+    tokenizer,
+    params: np.ndarray,
+    prefill_input: np.ndarray,
+    token_cnt: int,
+    eos_id: int,
+    args,
+    rng: np.random.Generator,
+):
+    runtime_so = _ensure_aot_runtime_lib(artifact_dir)
+
+    libdir = llvm_build_dir / "lib"
+    rtld_global = getattr(os, "RTLD_GLOBAL", 0)
+    for so in [
+        libdir / "libomp.so",
+        libdir / "libmlir_c_runner_utils.so",
+        libdir / "libmlir_runner_utils.so",
+    ]:
+        ctypes.CDLL(str(so), mode=rtld_global)
+    runtime_lib = ctypes.CDLL(str(runtime_so), mode=rtld_global)
+
+    prefill_func = runtime_lib._mlir_ciface_forward_prefill
+    decode_func = runtime_lib._mlir_ciface_forward_decode
+
+    ForwardOutputDescriptor = _build_forward_output_descriptor()
+
+    params_desc = _memref_desc(params)
+    prefill_desc = _memref_desc(prefill_input)
+    decode_input = np.zeros((1, 1), dtype=np.int64)
+    decode_input_desc = _memref_desc(decode_input)
+    cache_pos = np.zeros((1,), dtype=np.int64)
+    cache_pos_desc = _memref_desc(cache_pos)
+
+    state_in = ForwardOutputDescriptor()
+    state_out = ForwardOutputDescriptor()
+
+    print("[Python] prefill...")
+    t0 = time.time()
+    prefill_func(
+        ctypes.byref(state_in),
+        ctypes.byref(params_desc),
+        ctypes.byref(prefill_desc),
+    )
+    prefill_s = time.time() - t0
+
+    prefill_logits = _memref_to_numpy(getattr(state_in, str(KV_COUNT)))
+    next_token = _sample_next_token(
+        prefill_logits[0, token_cnt - 1, :], args.temperature, args.top_p, rng
+    )
+
+    generated = [next_token]
+    print(
+        tokenizer.decode(
+            [next_token],
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        ),
+        end="",
+        flush=True,
+    )
+
+    decode_time = 0.0
+    for _ in range(args.max_new_tokens - 1):
+        if token_cnt >= MAX_TOKEN_LENGTH or next_token == eos_id:
+            break
+
+        decode_input[0, 0] = next_token
+        cache_pos[0] = token_cnt
+
+        decode_args = [
+            ctypes.byref(state_out),
+            ctypes.byref(params_desc),
+            ctypes.byref(decode_input_desc),
+            ctypes.byref(cache_pos_desc),
+        ]
+        for i in range(KV_COUNT):
+            decode_args.append(ctypes.byref(getattr(state_in, str(i))))
+
+        t0 = time.time()
+        decode_func(*decode_args)
+        decode_time += time.time() - t0
+
+        logits = _memref_to_numpy(getattr(state_out, str(KV_COUNT)))
+        next_token = _sample_next_token(
+            logits[0, 0, :], args.temperature, args.top_p, rng
+        )
+        generated.append(next_token)
+        token_cnt += 1
+
+        if next_token == eos_id:
+            break
+        print(
+            tokenizer.decode(
+                [next_token],
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=False,
+            ),
+            end="",
+            flush=True,
+        )
+
+        state_in, state_out = state_out, state_in
+
+    print("\n")
+    print(f"[Python] prefill time: {prefill_s:.3f}s")
+    if len(generated) > 1:
+        print(
+            f"[Python] decode speed: {(len(generated)-1)/max(decode_time,1e-9):.2f} tok/s"
+        )
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Python end-to-end inference by executing exported Buddy subgraphs"
+    )
+    parser.add_argument("--prompt", type=str, required=True)
+    parser.add_argument(
+        "--artifact-dir",
+        type=Path,
+        default=Path("build/examples/BuddyDeepSeekR1"),
+        help="Directory containing forward_prefill.mlir/forward_decode.mlir/arg0.data",
+    )
+    parser.add_argument(
+        "--llvm-build-dir",
+        type=Path,
+        default=Path("llvm/build"),
+        help="LLVM build dir that provides MLIR runtime shared libs",
+    )
+    parser.add_argument(
+        "--model-path",
+        type=str,
+        default=os.environ.get(
+            "DEEPSEEKR1_MODEL_PATH", "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+        ),
+        help="HF model id or local model path (for tokenizer)",
+    )
+    parser.add_argument("--max-new-tokens", type=int, default=64)
+    parser.add_argument("--eos-id", type=int, default=None)
+    parser.add_argument("--temperature", type=float, default=0.7)
+    parser.add_argument("--top-p", type=float, default=0.9)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument(
+        "--system-prompt", type=str, default="You are a helpful assistant."
+    )
+    parser.add_argument(
+        "--no-chat-template",
+        action="store_true",
+        help="Disable tokenizer chat template and use raw --prompt encoding.",
+    )
+    parser.add_argument(
+        "--export-subgraphs",
+        action="store_true",
+        help="Regenerate forward/subgraph MLIR and arg0.data before running",
+    )
+    parser.add_argument("--omp-num-threads", type=int, default=None)
+    parser.add_argument("--omp-proc-bind", type=str, default=None)
+    parser.add_argument("--omp-places", type=str, default=None)
+    parser.add_argument("--kmp-affinity", type=str, default=None)
+    args = parser.parse_args()
+
+    from transformers import AutoTokenizer
+
+    _init_mlir_bindings()
+    _apply_omp_env(args)
+
+    repo_root = Path(__file__).resolve().parents[2]
+    example_dir = Path(__file__).resolve().parent
+    artifact_dir = _resolve_path(args.artifact_dir, repo_root)
+    llvm_build_dir = _resolve_path(args.llvm_build_dir, repo_root)
+
+    artifact_dir.mkdir(parents=True, exist_ok=True)
+    if args.export_subgraphs:
+        _export_artifacts(example_dir, artifact_dir)
+
+    param_file = artifact_dir / "arg0.data"
+
+    missing = [
+        str(p)
+        for p in [
+            artifact_dir / "forward_prefill.mlir",
+            artifact_dir / "forward_decode.mlir",
+            param_file,
+        ]
+        if not p.exists()
+    ]
+    if missing:
+        raise FileNotFoundError(
+            "Missing artifacts:\n  "
+            + "\n  ".join(missing)
+            + "\nRun with --export-subgraphs first."
+        )
+
+    params = np.fromfile(param_file, dtype=np.float32)
+    if params.size != PARAM_SIZE:
+        raise ValueError(
+            f"Unexpected param size: {params.size}, expect {PARAM_SIZE}"
+        )
+    params = params.reshape((PARAM_SIZE,))
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
+    if not args.no_chat_template and hasattr(tokenizer, "apply_chat_template"):
+        messages = []
+        if args.system_prompt:
+            messages.append({"role": "system", "content": args.system_prompt})
+        messages.append({"role": "user", "content": args.prompt})
+        input_ids = tokenizer.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+        )
+    else:
+        input_ids = tokenizer.encode(args.prompt, add_special_tokens=True)
+
+    if not input_ids:
+        raise ValueError("Prompt tokenization produced empty input.")
+    input_ids = input_ids[:MAX_TOKEN_LENGTH]
+    token_cnt = len(input_ids)
+
+    eos_id = (
+        args.eos_id
+        if args.eos_id is not None
+        else (
+            int(tokenizer.eos_token_id)
+            if tokenizer.eos_token_id is not None
+            else EOS_TOKEN_ID
+        )
+    )
+    rng = np.random.default_rng(args.seed)
+
+    prefill_input = np.zeros((1, MAX_TOKEN_LENGTH), dtype=np.int64)
+    prefill_input[0, :token_cnt] = np.asarray(input_ids, dtype=np.int64)
+
+    print("[Python] runtime: aot")
+    _run_aot(
+        artifact_dir,
+        llvm_build_dir,
+        tokenizer,
+        params,
+        prefill_input,
+        token_cnt,
+        eos_id,
+        args,
+        rng,
+    )
+
+
+if __name__ == "__main__":
+    main()

From 1d074375f79861e782b6e778022e55fef4a8535f Mon Sep 17 00:00:00 2001
From: GuoningHuang <2985186238@qq.com>
Date: Sun, 1 Mar 2026 17:35:51 +0800
Subject: [PATCH 2/2] [Examples] add Python AOT runner for BuddyDeepSeekR1
 subgraph end-to-end inference

---
 examples/BuddyDeepSeekR1/README.md          | 16 ++++++++++++----
 .../BuddyDeepSeekR1/run-subgraphs-python.py | 20 +-------------------
 2 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/examples/BuddyDeepSeekR1/README.md b/examples/BuddyDeepSeekR1/README.md
index aa18cac7d4..4033e5bec1 100644
--- a/examples/BuddyDeepSeekR1/README.md
+++ b/examples/BuddyDeepSeekR1/README.md
@@ -118,13 +118,21 @@ $ ./bin/buddy-deepseek-r1-cli --interactive --no-stats
 
 6. Python end-to-end inference by executing exported subgraphs
 
-This path keeps graph export in Buddy import flow, then uses Python to run prefill/decode MLIR subgraphs end to end.
+This path reuses artifacts already generated under `build/examples/BuddyDeepSeekR1`, then uses Python to run prefill/decode MLIR subgraphs end to end.
 
 ```bash
-# from buddy-mlir/
-python3 examples/BuddyDeepSeekR1/run-subgraphs-python.py \
+# from buddy-mlir/examples/BuddyDeepSeekR1
+python3 run-subgraphs-python.py \
     --prompt "Hello, who are you?" \
-    --export-subgraphs
+    --artifact-dir path_to_buddyDeepSeekR1_build \
+    --llvm-build-dir path_to_llvm_build
+```
+
+Example:
+
+```bash
+# from buddy-mlir/examples/BuddyDeepSeekR1
+python3 run-subgraphs-python.py --prompt "hello" --artifact-dir ../../build/examples/BuddyDeepSeekR1 --llvm-build-dir ../../llvm/build --omp-num-threads 48 --omp-proc-bind close
 ```
 
 Useful options:
diff --git a/examples/BuddyDeepSeekR1/run-subgraphs-python.py b/examples/BuddyDeepSeekR1/run-subgraphs-python.py
index 9cbe8d13cf..9fdc1786d3 100644
--- a/examples/BuddyDeepSeekR1/run-subgraphs-python.py
+++ b/examples/BuddyDeepSeekR1/run-subgraphs-python.py
@@ -85,16 +85,6 @@ def _sample_next_token(
     return int(rng.choice(len(probs), p=probs))
 
 
-def _export_artifacts(example_dir: Path, output_dir: Path):
-    cmd = [
-        "python3",
-        str(example_dir / "import-deepseek-r1.py"),
-        "--output-dir",
-        str(output_dir),
-    ]
-    subprocess.check_call(cmd)
-
-
 def _resolve_path(path_arg: Path, repo_root: Path) -> Path:
     path_arg = path_arg.expanduser()
     if path_arg.is_absolute():
@@ -290,11 +280,6 @@ def main():
         action="store_true",
         help="Disable tokenizer chat template and use raw --prompt encoding.",
     )
-    parser.add_argument(
-        "--export-subgraphs",
-        action="store_true",
-        help="Regenerate forward/subgraph MLIR and arg0.data before running",
-    )
     parser.add_argument("--omp-num-threads", type=int, default=None)
    parser.add_argument("--omp-proc-bind", type=str, default=None)
     parser.add_argument("--omp-places", type=str, default=None)
@@ -307,13 +292,10 @@ def main():
     _apply_omp_env(args)
 
     repo_root = Path(__file__).resolve().parents[2]
-    example_dir = Path(__file__).resolve().parent
     artifact_dir = _resolve_path(args.artifact_dir, repo_root)
     llvm_build_dir = _resolve_path(args.llvm_build_dir, repo_root)
 
     artifact_dir.mkdir(parents=True, exist_ok=True)
-    if args.export_subgraphs:
-        _export_artifacts(example_dir, artifact_dir)
 
     param_file = artifact_dir / "arg0.data"
 
@@ -330,7 +312,7 @@ def main():
         raise FileNotFoundError(
             "Missing artifacts:\n  "
             + "\n  ".join(missing)
-            + "\nRun with --export-subgraphs first."
+            + "\nPlease generate them under --artifact-dir before running."
         )
 
     params = np.fromfile(param_file, dtype=np.float32)
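
The runner above marshals NumPy arrays to and from MLIR memref descriptors through the upstream `mlir.runtime` helpers (`get_ranked_memref_descriptor`, `ranked_memref_to_numpy`, `make_nd_memref_descriptor`), which are the same bindings `_init_mlir_bindings()` imports. A minimal round-trip sketch, assuming those MLIR Python bindings from the LLVM build are importable, can be used to sanity-check the environment before attempting full inference:

```python
# Sanity check for the mlir.runtime helpers that run-subgraphs-python.py relies on.
# Assumes the MLIR Python bindings built alongside LLVM are on PYTHONPATH.
import ctypes

import numpy as np
from mlir import runtime as rt

# NumPy array -> ranked memref descriptor, the form passed to the _mlir_ciface_* entry points.
arr = np.arange(6, dtype=np.float32).reshape(2, 3)
desc = rt.get_ranked_memref_descriptor(arr)

# Ranked memref descriptor -> NumPy view, the form used to read the logits and KV caches back.
back = rt.ranked_memref_to_numpy(ctypes.pointer(desc))
assert np.array_equal(arr, back), "memref round-trip mismatch"
print("mlir.runtime memref round-trip OK:", back.shape, back.dtype)
```

If the import fails here, the runner fails in `_init_mlir_bindings()` for the same reason, so fixing `PYTHONPATH` (pointing it at the Python packages produced by an MLIR build with the bindings enabled) fixes both.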