From af5a21d691f231b2b2f98935dbb46b326e6c868c Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Tue, 17 Jun 2025 00:52:48 +0100
Subject: [PATCH 01/17] change clang -march flag to -mcpu with fp16 disassembly
 test

---
 test/test_disassembly.py    | 20 ++++++++++++++++++++
 tinygrad/runtime/ops_cpu.py |  2 +-
 2 files changed, 21 insertions(+), 1 deletion(-)
 create mode 100644 test/test_disassembly.py

diff --git a/test/test_disassembly.py b/test/test_disassembly.py
new file mode 100644
index 0000000000000..2a0b99f2721c6
--- /dev/null
+++ b/test/test_disassembly.py
@@ -0,0 +1,20 @@
+import unittest, io, re
+from tinygrad import Tensor, dtypes
+from contextlib import redirect_stdout
+from tinygrad.device import Device
+from tinygrad.helpers import OSX
+from tinygrad.engine.realize import get_program
+
+class TestDisassembly(unittest.TestCase):
+  @unittest.skipUnless(Device.DEFAULT in ("CPU", "LLVM") and OSX, "m series cpus support fp16 arithmetic")
+  def test_float16_alu(self):
+    c = Tensor([1], dtype=dtypes.float16) + Tensor([1], dtype=dtypes.float16)
+    s = c.schedule()[-1]
+    p = get_program(Device[Device.DEFAULT].renderer, s.ast)
+    lib = Device[Device.DEFAULT].compiler.compile(p.src)
+    out = io.StringIO()
+    with redirect_stdout(out): Device[Device.DEFAULT].compiler.disassemble(lib)
+    assert re.search(r'\bfadd\s+h\d', out.getvalue())
+
+if __name__ == "__main__":
+  unittest.main()
\ No newline at end of file
diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py
index c5a15afb52b75..517d620c3d508 100644
--- a/tinygrad/runtime/ops_cpu.py
+++ b/tinygrad/runtime/ops_cpu.py
@@ -11,7 +11,7 @@ def compile(self, src:str) -> bytes:
     # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
     # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it
     target = 'x86_64' if sys.platform == 'win32' else platform.machine()
-    args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
+    args = ['-mcpu=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
     arch_args = ['-ffixed-x18'] if target == 'arm64' else []
     obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
     return jit_loader(obj)

From 641baa55d0dc63f61dc8c5c59c191bb097eb5df2 Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Tue, 17 Jun 2025 01:09:44 +0100
Subject: [PATCH 02/17] fix: use -march=native on x86 and -mcpu=native elsewhere

---
 tinygrad/runtime/ops_cpu.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py
index 517d620c3d508..745bf028747ea 100644
--- a/tinygrad/runtime/ops_cpu.py
+++ b/tinygrad/runtime/ops_cpu.py
@@ -5,13 +5,14 @@
 from tinygrad.renderer.cstyle import ClangRenderer
 
 class ClangJITCompiler(Compiler):
-  def __init__(self, cachekey="compile_clang_jit"): super().__init__(cachekey)
+  def __init__(self, cachekey="compile_clang_jit"): super().__init__(None)
 
   def compile(self, src:str) -> bytes:
     # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
     # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it
     target = 'x86_64' if sys.platform == 'win32' else platform.machine()
-    args = ['-mcpu=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
+    arch = '-march=native' if platform.machine() in ('x86_64', 'AMD64') else '-mcpu=native'
+    args = [arch, f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
     arch_args = ['-ffixed-x18'] if target == 'arm64' else []
     obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
     return jit_loader(obj)

From 38bbfde91f05becec695dfe60b6392ad247408e4 Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Tue, 17 Jun 2025 01:25:36 +0100
Subject: [PATCH 03/17] add capstone to macos dependencies

---
 .github/workflows/test.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 20c2a13161095..de561824875fe 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -916,6 +916,7 @@ jobs:
         with:
           key: macos-${{ matrix.backend }}-minimal
           deps: testing_minimal
+          pydeps: "capstone"
           llvm: ${{ matrix.backend == 'llvm' && 'true' }}
       - name: Set env
         run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'cpu' && 'CPU=1' || matrix.backend == 'metal' && 'METAL=1'}}" >> $GITHUB_ENV

From d18eb63a6a070a4107ed1b24719b57fabba670ef Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Tue, 17 Jun 2025 02:29:15 +0100
Subject: [PATCH 04/17] just check no cast in test

---
 test/test_disassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_disassembly.py b/test/test_disassembly.py
index 2a0b99f2721c6..17aad04f22f23 100644
--- a/test/test_disassembly.py
+++ b/test/test_disassembly.py
@@ -14,7 +14,7 @@ def test_float16_alu(self):
     lib = Device[Device.DEFAULT].compiler.compile(p.src)
     out = io.StringIO()
     with redirect_stdout(out): Device[Device.DEFAULT].compiler.disassemble(lib)
-    assert re.search(r'\bfadd\s+h\d', out.getvalue())
+    assert "fcvt" not in out.getvalue()
 
 if __name__ == "__main__":
   unittest.main()
\ No newline at end of file

From eccfbf895e2691b9a0e436c504c7699a5e7737a5 Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Tue, 17 Jun 2025 02:31:28 +0100
Subject: [PATCH 05/17] rm import

---
 test/test_disassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_disassembly.py b/test/test_disassembly.py
index 17aad04f22f23..b93269ee4613d 100644
--- a/test/test_disassembly.py
+++ b/test/test_disassembly.py
@@ -1,4 +1,4 @@
-import unittest, io, re
+import unittest, io
 from tinygrad import Tensor, dtypes
 from contextlib import redirect_stdout
 from tinygrad.device import Device

From 7681c05ef17511701f383cb20f76b1a547f053b3 Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Tue, 17 Jun 2025 02:52:02 +0100
Subject: [PATCH 06/17] whoops: restore cachekey in ClangJITCompiler

---
 tinygrad/runtime/ops_cpu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py
index 745bf028747ea..161a3d1e4b106 100644
--- a/tinygrad/runtime/ops_cpu.py
+++ b/tinygrad/runtime/ops_cpu.py
@@ -5,7 +5,7 @@
 from tinygrad.renderer.cstyle import ClangRenderer
 
 class ClangJITCompiler(Compiler):
-  def __init__(self, cachekey="compile_clang_jit"): super().__init__(None)
+  def __init__(self, cachekey="compile_clang_jit"): super().__init__(cachekey)
 
   def compile(self, src:str) -> bytes:
     # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call

From bbc36f0dc16f0e269761a3c289e8cab1bc699b83 Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Tue, 17 Jun 2025 18:21:11 +0100
Subject: [PATCH 07/17] let's check: run disassembly test with DEBUG=7 in llvm ci

---
 .github/workflows/test.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index de561824875fe..35c34ac4f5063 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -931,6 +931,9 @@ jobs:
       - name: Run macOS-specific unit test
         if: matrix.backend == 'cpu'
         run: python3 -m pytest test/unit/test_disk_tensor.py::TestDiskTensor::test_copy_to_cpu_not_truncated
+      - name: print llvm
+        if: matrix.backend == 'llvm'
+        run: DEBUG=7 python3 test/test_disassembly.py
 
 # ****** Windows Tests ******
 

From 04e487969d93c905b9754ea53dd4bef1b9adbd1d Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Tue, 17 Jun 2025 18:26:52 +0100
Subject: [PATCH 08/17] move check

---
 .github/workflows/test.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 35c34ac4f5063..95cd005a215a9 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -924,6 +924,9 @@ jobs:
         run: |
           python -c "from tinygrad import Device; assert Device.DEFAULT == '${{ matrix.backend }}'.upper(), Device.DEFAULT"
           DEBUG=4 python3 test/test_tiny.py TestTiny.test_plus
+      - name: print llvm
+        if: matrix.backend == 'llvm'
+        run: DEBUG=7 python3 test/test_disassembly.py
       - name: Run pytest (${{ matrix.backend }})
         run: python3 -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --durations=20
       - name: Run process replay tests
@@ -931,9 +934,6 @@ jobs:
       - name: Run macOS-specific unit test
         if: matrix.backend == 'cpu'
         run: python3 -m pytest test/unit/test_disk_tensor.py::TestDiskTensor::test_copy_to_cpu_not_truncated
-      - name: print llvm
-        if: matrix.backend == 'llvm'
-        run: DEBUG=7 python3 test/test_disassembly.py
 
 # ****** Windows Tests ******
 

From feb2604c1edea1d15d0d325604f902552c8c52b0 Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Tue, 17 Jun 2025 19:07:45 +0100
Subject: [PATCH 09/17] llvm init before cpu check

---
 tinygrad/runtime/ops_llvm.py             | 3 +--
 tinygrad/runtime/support/compiler_amd.py | 2 ++
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tinygrad/runtime/ops_llvm.py b/tinygrad/runtime/ops_llvm.py
index c628ad138b419..3de54fec6faff 100644
--- a/tinygrad/runtime/ops_llvm.py
+++ b/tinygrad/runtime/ops_llvm.py
@@ -15,8 +15,6 @@ class LLVMCompiler(Compiler):
   jit = True
   target_arch = {'arm64': 'AArch64', 'aarch64': 'AArch64', 'x86_64': 'X86', 'AMD64': 'X86'}[platform.machine()]
   def __init__(self, processor:str, feats:str):
-    for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{self.target_arch}{component}')()
-
     triple = {'AArch64': b'aarch64-none-unknown-elf', 'X86': b'x86_64-none-unknown-elf', 'AMDGPU': b'amdgcn-amd-amdhsa'}[self.target_arch]
     target = expect(llvm.LLVMGetTargetFromTriple(triple, ctypes.pointer(tgt:=llvm.LLVMTargetRef()), err:=cerr()), err, tgt)
     if DEBUG >= 3: print(f"LLVM init for {processor!r} with {feats!r}")
@@ -65,6 +63,7 @@ def disassemble(self, lib:bytes): capstone_flatdump(lib)
 
 class HostLLVMCompiler(LLVMCompiler):
   def __init__(self):
+    for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{self.target_arch}{component}')()
     # +reserve-x18 here does the same thing as -ffixed-x18 in ops_cpu.py, see comments there for why it's needed on arm osx
     cpu, feats = ctypes.string_at(llvm.LLVMGetHostCPUName()), (b'+reserve-x18,' if OSX else b'') + ctypes.string_at(llvm.LLVMGetHostCPUFeatures())
     super().__init__(cpu.decode(), feats.decode())
diff --git a/tinygrad/runtime/support/compiler_amd.py b/tinygrad/runtime/support/compiler_amd.py
index fd00ea03b90fa..63975514d2243 100644
--- a/tinygrad/runtime/support/compiler_amd.py
+++ b/tinygrad/runtime/support/compiler_amd.py
@@ -10,6 +10,7 @@
 except AttributeError: pass  # ignore if ROCm isn't installed
 from tinygrad.device import Compiler, CompileError
 from tinygrad.runtime.ops_llvm import LLVMCompiler
+import tinygrad.runtime.autogen.llvm as llvm
 from tinygrad.helpers import OSX, to_char_p_p
 
 def amdgpu_disassemble(lib:bytes):
@@ -89,6 +90,7 @@ class AMDLLVMCompiler(LLVMCompiler):
   jit = False
   target_arch = "AMDGPU"
   def __init__(self, arch: str):
+    for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{self.target_arch}{component}')()
     self.arch = arch
     super().__init__(self.arch, "+cumode")
   def __reduce__(self): return (AMDLLVMCompiler, (self.arch,))

From b494c0229308168422715cce6e690a136a6438d2 Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Tue, 17 Jun 2025 20:16:03 +0100
Subject: [PATCH 10/17] try unversioned llvm-config in autogen_stubs.sh

---
 autogen_stubs.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/autogen_stubs.sh b/autogen_stubs.sh
index 5c4e684e3d52e..484ead1ad2147 100755
--- a/autogen_stubs.sh
+++ b/autogen_stubs.sh
@@ -233,14 +233,14 @@ generate_libc() {
 }
 
 generate_llvm() {
-  INC="$(llvm-config-14 --includedir)"
+  INC="$(llvm-config --includedir)"
   clang2py -k cdefstum \
     $(find "$INC/llvm-c/" -type f -name '*.h' | sort) \
     "$INC/llvm/Config/Targets.def" \
     "$INC/llvm/Config/AsmPrinters.def" \
     "$INC/llvm/Config/AsmParsers.def" \
     "$INC/llvm/Config/Disassemblers.def" \
-    --clang-args="$(llvm-config-14 --cflags)" \
+    --clang-args="$(llvm-config --cflags)" \
     -o "$BASE/llvm.py"
 
   sed -i "s\import ctypes\import ctypes, tinygrad.runtime.support.llvm as llvm_support\g" "$BASE/llvm.py"

From 536e14d0d693e411e9642155447b912afcecb2ab Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Tue, 17 Jun 2025 20:25:02 +0100
Subject: [PATCH 11/17] bump autogen llvm version

---
 .github/workflows/test.yml | 2 +-
 autogen_stubs.sh           | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 95cd005a215a9..596c8fc6e3a8d 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -97,7 +97,7 @@ jobs:
         webgpu: 'true'
         llvm: 'true'
     - name: Install autogen support packages
-      run: sudo apt-get install -y --no-install-recommends llvm-14-dev libclang-14-dev
+      run: sudo apt-get install -y --no-install-recommends llvm-18-dev libclang-14-dev
     - name: Verify OpenCL autogen
       run: |
         cp tinygrad/runtime/autogen/opencl.py /tmp/opencl.py.bak
diff --git a/autogen_stubs.sh b/autogen_stubs.sh
index 484ead1ad2147..0b3918a1d57ca 100755
--- a/autogen_stubs.sh
+++ b/autogen_stubs.sh
@@ -233,14 +233,14 @@ generate_libc() {
 }
 
 generate_llvm() {
-  INC="$(llvm-config --includedir)"
+  INC="$(llvm-config-18 --includedir)"
   clang2py -k cdefstum \
     $(find "$INC/llvm-c/" -type f -name '*.h' | sort) \
     "$INC/llvm/Config/Targets.def" \
     "$INC/llvm/Config/AsmPrinters.def" \
     "$INC/llvm/Config/AsmParsers.def" \
     "$INC/llvm/Config/Disassemblers.def" \
-    --clang-args="$(llvm-config --cflags)" \
+    --clang-args="$(llvm-config-18 --cflags)" \
     -o "$BASE/llvm.py"
 
   sed -i "s\import ctypes\import ctypes, tinygrad.runtime.support.llvm as llvm_support\g" "$BASE/llvm.py"

From 386dd0c9516d6da1e7b15ca85cf434ae8da0d9c6 Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Tue, 17 Jun 2025 20:39:35 +0100
Subject: [PATCH 12/17] also update libclang?

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 596c8fc6e3a8d..36ac11fc2e7e1 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -97,7 +97,7 @@ jobs:
         webgpu: 'true'
         llvm: 'true'
     - name: Install autogen support packages
-      run: sudo apt-get install -y --no-install-recommends llvm-18-dev libclang-14-dev
+      run: sudo apt-get install -y --no-install-recommends llvm-18-dev libclang-18-dev
     - name: Verify OpenCL autogen
       run: |
         cp tinygrad/runtime/autogen/opencl.py /tmp/opencl.py.bak

From 9f8eebd0122538e3d8ca702b4ceebc23794b2706 Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Thu, 19 Jun 2025 18:41:00 +0100
Subject: [PATCH 13/17] revert llvm init move, autogen llvm bump and ci debug step

---
 .github/workflows/test.yml               | 5 +----
 autogen_stubs.sh                         | 4 ++--
 tinygrad/runtime/ops_llvm.py             | 3 ++-
 tinygrad/runtime/support/compiler_amd.py | 1 -
 4 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 36ac11fc2e7e1..de561824875fe 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -97,7 +97,7 @@ jobs:
         webgpu: 'true'
         llvm: 'true'
     - name: Install autogen support packages
-      run: sudo apt-get install -y --no-install-recommends llvm-18-dev libclang-18-dev
+      run: sudo apt-get install -y --no-install-recommends llvm-14-dev libclang-14-dev
     - name: Verify OpenCL autogen
       run: |
         cp tinygrad/runtime/autogen/opencl.py /tmp/opencl.py.bak
@@ -924,9 +924,6 @@ jobs:
         run: |
           python -c "from tinygrad import Device; assert Device.DEFAULT == '${{ matrix.backend }}'.upper(), Device.DEFAULT"
           DEBUG=4 python3 test/test_tiny.py TestTiny.test_plus
-      - name: print llvm
-        if: matrix.backend == 'llvm'
-        run: DEBUG=7 python3 test/test_disassembly.py
       - name: Run pytest (${{ matrix.backend }})
         run: python3 -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --durations=20
       - name: Run process replay tests
diff --git a/autogen_stubs.sh b/autogen_stubs.sh
index 0b3918a1d57ca..5c4e684e3d52e 100755
--- a/autogen_stubs.sh
+++ b/autogen_stubs.sh
@@ -233,14 +233,14 @@ generate_libc() {
 }
 
 generate_llvm() {
-  INC="$(llvm-config-18 --includedir)"
+  INC="$(llvm-config-14 --includedir)"
   clang2py -k cdefstum \
     $(find "$INC/llvm-c/" -type f -name '*.h' | sort) \
     "$INC/llvm/Config/Targets.def" \
     "$INC/llvm/Config/AsmPrinters.def" \
     "$INC/llvm/Config/AsmParsers.def" \
     "$INC/llvm/Config/Disassemblers.def" \
-    --clang-args="$(llvm-config-18 --cflags)" \
+    --clang-args="$(llvm-config-14 --cflags)" \
     -o "$BASE/llvm.py"
 
   sed -i "s\import ctypes\import ctypes, tinygrad.runtime.support.llvm as llvm_support\g" "$BASE/llvm.py"
diff --git a/tinygrad/runtime/ops_llvm.py b/tinygrad/runtime/ops_llvm.py
index 3de54fec6faff..c628ad138b419 100644
--- a/tinygrad/runtime/ops_llvm.py
+++ b/tinygrad/runtime/ops_llvm.py
@@ -15,6 +15,8 @@ class LLVMCompiler(Compiler):
   jit = True
   target_arch = {'arm64': 'AArch64', 'aarch64': 'AArch64', 'x86_64': 'X86', 'AMD64': 'X86'}[platform.machine()]
   def __init__(self, processor:str, feats:str):
+    for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{self.target_arch}{component}')()
+
     triple = {'AArch64': b'aarch64-none-unknown-elf', 'X86': b'x86_64-none-unknown-elf', 'AMDGPU': b'amdgcn-amd-amdhsa'}[self.target_arch]
     target = expect(llvm.LLVMGetTargetFromTriple(triple, ctypes.pointer(tgt:=llvm.LLVMTargetRef()), err:=cerr()), err, tgt)
     if DEBUG >= 3: print(f"LLVM init for {processor!r} with {feats!r}")
@@ -63,7 +65,6 @@ def disassemble(self, lib:bytes): capstone_flatdump(lib)
 
 class HostLLVMCompiler(LLVMCompiler):
   def __init__(self):
-    for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{self.target_arch}{component}')()
     # +reserve-x18 here does the same thing as -ffixed-x18 in ops_cpu.py, see comments there for why it's needed on arm osx
     cpu, feats = ctypes.string_at(llvm.LLVMGetHostCPUName()), (b'+reserve-x18,' if OSX else b'') + ctypes.string_at(llvm.LLVMGetHostCPUFeatures())
     super().__init__(cpu.decode(), feats.decode())
diff --git a/tinygrad/runtime/support/compiler_amd.py b/tinygrad/runtime/support/compiler_amd.py
index 63975514d2243..71c5312e6ed98 100644
--- a/tinygrad/runtime/support/compiler_amd.py
+++ b/tinygrad/runtime/support/compiler_amd.py
@@ -90,7 +90,6 @@ class AMDLLVMCompiler(LLVMCompiler):
   jit = False
   target_arch = "AMDGPU"
   def __init__(self, arch: str):
-    for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{self.target_arch}{component}')()
     self.arch = arch
     super().__init__(self.arch, "+cumode")
   def __reduce__(self): return (AMDLLVMCompiler, (self.arch,))

From b872e8146efedd15f5feb55bc0f56d908c199703 Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Thu, 19 Jun 2025 18:47:52 +0100
Subject: [PATCH 14/17] add comment

---
 tinygrad/runtime/ops_cpu.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py
index 161a3d1e4b106..e172ba3520339 100644
--- a/tinygrad/runtime/ops_cpu.py
+++ b/tinygrad/runtime/ops_cpu.py
@@ -11,6 +11,7 @@ def compile(self, src:str) -> bytes:
     # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
     # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it
     target = 'x86_64' if sys.platform == 'win32' else platform.machine()
+    # on arm, -march means "may run on this arch or a superset" rather than "optimize for this cpu"; x86 -march is equivalent to arm -mcpu
     arch = '-march=native' if platform.machine() in ('x86_64', 'AMD64') else '-mcpu=native'
     args = [arch, f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
     arch_args = ['-ffixed-x18'] if target == 'arm64' else []

From 3d2ba9c8d4ba602d79319632c204234b7d487d4c Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Thu, 19 Jun 2025 18:52:11 +0100
Subject: [PATCH 15/17] skip llvm test and add comment

---
 test/test_disassembly.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_disassembly.py b/test/test_disassembly.py
index b93269ee4613d..fd78b6e29ec16 100644
--- a/test/test_disassembly.py
+++ b/test/test_disassembly.py
@@ -6,7 +6,8 @@
 from tinygrad.engine.realize import get_program
 
 class TestDisassembly(unittest.TestCase):
-  @unittest.skipUnless(Device.DEFAULT in ("CPU", "LLVM") and OSX, "m series cpus support fp16 arithmetic")
+  # TODO: fails on llvm. llvm.LLVMGetHostCPUName() returns "generic"
+  @unittest.skipUnless(Device.DEFAULT in ("CPU",) and OSX, "m series cpus support fp16 arithmetic")
   def test_float16_alu(self):
     c = Tensor([1], dtype=dtypes.float16) + Tensor([1], dtype=dtypes.float16)
     s = c.schedule()[-1]

From 85282bbaaab3780e5d48588f52d58995fa5f3b48 Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Thu, 19 Jun 2025 18:54:30 +0100
Subject: [PATCH 16/17] linter: remove unused llvm import from compiler_amd.py

---
 tinygrad/runtime/support/compiler_amd.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tinygrad/runtime/support/compiler_amd.py b/tinygrad/runtime/support/compiler_amd.py
index 71c5312e6ed98..fd00ea03b90fa 100644
--- a/tinygrad/runtime/support/compiler_amd.py
+++ b/tinygrad/runtime/support/compiler_amd.py
@@ -10,7 +10,6 @@
 except AttributeError: pass  # ignore if ROCm isn't installed
 from tinygrad.device import Compiler, CompileError
 from tinygrad.runtime.ops_llvm import LLVMCompiler
-import tinygrad.runtime.autogen.llvm as llvm
 from tinygrad.helpers import OSX, to_char_p_p
 
 def amdgpu_disassemble(lib:bytes):

From b72b70abb75fb485a4d657522d3a7a562ad8a20e Mon Sep 17 00:00:00 2001
From: ttomsa <tomasvsilva8@gmail.com>
Date: Tue, 24 Jun 2025 23:46:01 +0100
Subject: [PATCH 17/17] update test

---
 test/test_disassembly.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_disassembly.py b/test/test_disassembly.py
index fd78b6e29ec16..e908b83710ad1 100644
--- a/test/test_disassembly.py
+++ b/test/test_disassembly.py
@@ -11,7 +11,7 @@ class TestDisassembly(unittest.TestCase):
   def test_float16_alu(self):
     c = Tensor([1], dtype=dtypes.float16) + Tensor([1], dtype=dtypes.float16)
     s = c.schedule()[-1]
-    p = get_program(Device[Device.DEFAULT].renderer, s.ast)
+    p = get_program(s.ast, Device[Device.DEFAULT].renderer)
     lib = Device[Device.DEFAULT].compiler.compile(p.src)
     out = io.StringIO()
     with redirect_stdout(out): Device[Device.DEFAULT].compiler.disassemble(lib)