From af5a21d691f231b2b2f98935dbb46b326e6c868c Mon Sep 17 00:00:00 2001 From: ttomsa Date: Tue, 17 Jun 2025 00:52:48 +0100 Subject: [PATCH 01/17] change clang -march flag to -mcpu with fp16 disassembly test --- test/test_disassembly.py | 20 ++++++++++++++++++++ tinygrad/runtime/ops_cpu.py | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 test/test_disassembly.py diff --git a/test/test_disassembly.py b/test/test_disassembly.py new file mode 100644 index 0000000000000..2a0b99f2721c6 --- /dev/null +++ b/test/test_disassembly.py @@ -0,0 +1,20 @@ +import unittest, io, re +from tinygrad import Tensor, dtypes +from contextlib import redirect_stdout +from tinygrad.device import Device +from tinygrad.helpers import OSX +from tinygrad.engine.realize import get_program + +class TestDisassembly(unittest.TestCase): + @unittest.skipUnless(Device.DEFAULT in ("CPU", "LLVM") and OSX, "m series cpus support fp16 arithmetic") + def test_float16_alu(self): + c = Tensor([1], dtype=dtypes.float16) + Tensor([1], dtype=dtypes.float16) + s = c.schedule()[-1] + p = get_program(Device[Device.DEFAULT].renderer, s.ast) + lib = Device[Device.DEFAULT].compiler.compile(p.src) + out = io.StringIO() + with redirect_stdout(out): Device[Device.DEFAULT].compiler.disassemble(lib) + assert re.search(r'\bfadd\s+h\d', out.getvalue()) + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py index c5a15afb52b75..517d620c3d508 100644 --- a/tinygrad/runtime/ops_cpu.py +++ b/tinygrad/runtime/ops_cpu.py @@ -11,7 +11,7 @@ def compile(self, src:str) -> bytes: # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it target = 'x86_64' if sys.platform == 'win32' else platform.machine() - args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident'] + args = ['-mcpu=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident'] arch_args = ['-ffixed-x18'] if target == 'arm64' else [] obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8')) return jit_loader(obj) From 641baa55d0dc63f61dc8c5c59c191bb097eb5df2 Mon Sep 17 00:00:00 2001 From: ttomsa Date: Tue, 17 Jun 2025 01:09:44 +0100 Subject: [PATCH 02/17] fix --- tinygrad/runtime/ops_cpu.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py index 517d620c3d508..745bf028747ea 100644 --- a/tinygrad/runtime/ops_cpu.py +++ b/tinygrad/runtime/ops_cpu.py @@ -5,13 +5,14 @@ from tinygrad.renderer.cstyle import ClangRenderer class ClangJITCompiler(Compiler): - def __init__(self, cachekey="compile_clang_jit"): super().__init__(cachekey) + def __init__(self, cachekey="compile_clang_jit"): super().__init__(None) def compile(self, src:str) -> bytes: # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it target = 'x86_64' if sys.platform == 'win32' else platform.machine() - args = ['-mcpu=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident'] + arch = '-march=native' if platform.machine() in ('x86_64', 'AMD64') else '-mcpu=native' + args = [arch, f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident'] arch_args = ['-ffixed-x18'] if target == 'arm64' else [] obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8')) return jit_loader(obj) From 38bbfde91f05becec695dfe60b6392ad247408e4 Mon Sep 17 00:00:00 2001 From: ttomsa Date: Tue, 17 Jun 2025 01:25:36 +0100 Subject: [PATCH 03/17] add capstone to macos dependencies --- .github/workflows/test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 20c2a13161095..de561824875fe 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -916,6 +916,7 @@ jobs: with: key: macos-${{ matrix.backend }}-minimal deps: testing_minimal + pydeps: "capstone" llvm: ${{ matrix.backend == 'llvm' && 'true' }} - name: Set env run: printf "${{ matrix.backend == 'llvm' && 'LLVM=1' || matrix.backend == 'cpu' && 'CPU=1' || matrix.backend == 'metal' && 'METAL=1'}}" >> $GITHUB_ENV From d18eb63a6a070a4107ed1b24719b57fabba670ef Mon Sep 17 00:00:00 2001 From: ttomsa Date: Tue, 17 Jun 2025 02:29:15 +0100 Subject: [PATCH 04/17] just check no cast in test --- test/test_disassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_disassembly.py b/test/test_disassembly.py index 2a0b99f2721c6..17aad04f22f23 100644 --- a/test/test_disassembly.py +++ b/test/test_disassembly.py @@ -14,7 +14,7 @@ def test_float16_alu(self): lib = Device[Device.DEFAULT].compiler.compile(p.src) out = io.StringIO() with redirect_stdout(out): Device[Device.DEFAULT].compiler.disassemble(lib) - assert re.search(r'\bfadd\s+h\d', out.getvalue()) + assert "fcvt" not in out.getvalue() if __name__ == "__main__": unittest.main() \ No newline at end of file From eccfbf895e2691b9a0e436c504c7699a5e7737a5 Mon Sep 17 00:00:00 2001 From: ttomsa Date: Tue, 17 Jun 2025 02:31:28 +0100 Subject: [PATCH 05/17] rm import --- test/test_disassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_disassembly.py b/test/test_disassembly.py index 17aad04f22f23..b93269ee4613d 100644 --- a/test/test_disassembly.py +++ b/test/test_disassembly.py @@ -1,4 +1,4 @@ -import unittest, io, re +import unittest, io from tinygrad import Tensor, dtypes from contextlib import redirect_stdout from tinygrad.device import Device From 7681c05ef17511701f383cb20f76b1a547f053b3 Mon Sep 17 00:00:00 2001 From: ttomsa Date: Tue, 17 Jun 2025 02:52:02 +0100 Subject: [PATCH 06/17] woops --- tinygrad/runtime/ops_cpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py index 745bf028747ea..161a3d1e4b106 100644 --- a/tinygrad/runtime/ops_cpu.py +++ b/tinygrad/runtime/ops_cpu.py @@ -5,7 +5,7 @@ from tinygrad.renderer.cstyle import ClangRenderer class ClangJITCompiler(Compiler): - def __init__(self, cachekey="compile_clang_jit"): super().__init__(None) + def __init__(self, cachekey="compile_clang_jit"): super().__init__(cachekey) def compile(self, src:str) -> bytes: # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call From bbc36f0dc16f0e269761a3c289e8cab1bc699b83 Mon Sep 17 00:00:00 2001 From: ttomsa Date: Tue, 17 Jun 2025 18:21:11 +0100 Subject: [PATCH 07/17] lets check --- .github/workflows/test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index de561824875fe..35c34ac4f5063 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -931,6 +931,9 @@ jobs: - name: Run macOS-specific unit test if: matrix.backend == 'cpu' run: python3 -m pytest test/unit/test_disk_tensor.py::TestDiskTensor::test_copy_to_cpu_not_truncated + - name: print llvm + if: matrix.backend == 'llvm' + run: DEBUG=7 python3 test/test_disassembly.py # ****** Windows Tests ****** From 04e487969d93c905b9754ea53dd4bef1b9adbd1d Mon Sep 17 00:00:00 2001 From: ttomsa Date: Tue, 17 Jun 2025 18:26:52 +0100 Subject: [PATCH 08/17] move check --- .github/workflows/test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 35c34ac4f5063..95cd005a215a9 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -924,6 +924,9 @@ jobs: run: | python -c "from tinygrad import Device; assert Device.DEFAULT == '${{ matrix.backend }}'.upper(), Device.DEFAULT" DEBUG=4 python3 test/test_tiny.py TestTiny.test_plus + - name: print llvm + if: matrix.backend == 'llvm' + run: DEBUG=7 python3 test/test_disassembly.py - name: Run pytest (${{ matrix.backend }}) run: python3 -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --durations=20 - name: Run process replay tests @@ -931,9 +934,6 @@ jobs: - name: Run macOS-specific unit test if: matrix.backend == 'cpu' run: python3 -m pytest test/unit/test_disk_tensor.py::TestDiskTensor::test_copy_to_cpu_not_truncated - - name: print llvm - if: matrix.backend == 'llvm' - run: DEBUG=7 python3 test/test_disassembly.py # ****** Windows Tests ****** From feb2604c1edea1d15d0d325604f902552c8c52b0 Mon Sep 17 00:00:00 2001 From: ttomsa Date: Tue, 17 Jun 2025 19:07:45 +0100 Subject: [PATCH 09/17] llvm init before cpu chcek --- tinygrad/runtime/ops_llvm.py | 3 +-- tinygrad/runtime/support/compiler_amd.py | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tinygrad/runtime/ops_llvm.py b/tinygrad/runtime/ops_llvm.py index c628ad138b419..3de54fec6faff 100644 --- a/tinygrad/runtime/ops_llvm.py +++ b/tinygrad/runtime/ops_llvm.py @@ -15,8 +15,6 @@ class LLVMCompiler(Compiler): jit = True target_arch = {'arm64': 'AArch64', 'aarch64': 'AArch64', 'x86_64': 'X86', 'AMD64': 'X86'}[platform.machine()] def __init__(self, processor:str, feats:str): - for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{self.target_arch}{component}')() - triple = {'AArch64': b'aarch64-none-unknown-elf', 'X86': b'x86_64-none-unknown-elf', 'AMDGPU': b'amdgcn-amd-amdhsa'}[self.target_arch] target = expect(llvm.LLVMGetTargetFromTriple(triple, ctypes.pointer(tgt:=llvm.LLVMTargetRef()), err:=cerr()), err, tgt) if DEBUG >= 3: print(f"LLVM init for {processor!r} with {feats!r}") @@ -65,6 +63,7 @@ def disassemble(self, lib:bytes): capstone_flatdump(lib) class HostLLVMCompiler(LLVMCompiler): def __init__(self): + for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{self.target_arch}{component}')() # +reserve-x18 here does the same thing as -ffixed-x18 in ops_cpu.py, see comments there for why it's needed on arm osx cpu, feats = ctypes.string_at(llvm.LLVMGetHostCPUName()), (b'+reserve-x18,' if OSX else b'') + ctypes.string_at(llvm.LLVMGetHostCPUFeatures()) super().__init__(cpu.decode(), feats.decode()) diff --git a/tinygrad/runtime/support/compiler_amd.py b/tinygrad/runtime/support/compiler_amd.py index fd00ea03b90fa..63975514d2243 100644 --- a/tinygrad/runtime/support/compiler_amd.py +++ b/tinygrad/runtime/support/compiler_amd.py @@ -10,6 +10,7 @@ except AttributeError: pass # ignore if ROCm isn't installed from tinygrad.device import Compiler, CompileError from tinygrad.runtime.ops_llvm import LLVMCompiler +import tinygrad.runtime.autogen.llvm as llvm from tinygrad.helpers import OSX, to_char_p_p def amdgpu_disassemble(lib:bytes): @@ -89,6 +90,7 @@ class AMDLLVMCompiler(LLVMCompiler): jit = False target_arch = "AMDGPU" def __init__(self, arch: str): + for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{self.target_arch}{component}')() self.arch = arch super().__init__(self.arch, "+cumode") def __reduce__(self): return (AMDLLVMCompiler, (self.arch,)) From b494c0229308168422715cce6e690a136a6438d2 Mon Sep 17 00:00:00 2001 From: ttomsa Date: Tue, 17 Jun 2025 20:16:03 +0100 Subject: [PATCH 10/17] try this --- autogen_stubs.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autogen_stubs.sh b/autogen_stubs.sh index 5c4e684e3d52e..484ead1ad2147 100755 --- a/autogen_stubs.sh +++ b/autogen_stubs.sh @@ -233,14 +233,14 @@ generate_libc() { } generate_llvm() { - INC="$(llvm-config-14 --includedir)" + INC="$(llvm-config --includedir)" clang2py -k cdefstum \ $(find "$INC/llvm-c/" -type f -name '*.h' | sort) \ "$INC/llvm/Config/Targets.def" \ "$INC/llvm/Config/AsmPrinters.def" \ "$INC/llvm/Config/AsmParsers.def" \ "$INC/llvm/Config/Disassemblers.def" \ - --clang-args="$(llvm-config-14 --cflags)" \ + --clang-args="$(llvm-config --cflags)" \ -o "$BASE/llvm.py" sed -i "s\import ctypes\import ctypes, tinygrad.runtime.support.llvm as llvm_support\g" "$BASE/llvm.py" From 536e14d0d693e411e9642155447b912afcecb2ab Mon Sep 17 00:00:00 2001 From: ttomsa Date: Tue, 17 Jun 2025 20:25:02 +0100 Subject: [PATCH 11/17] bump autogen llvm version --- .github/workflows/test.yml | 2 +- autogen_stubs.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 95cd005a215a9..596c8fc6e3a8d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -97,7 +97,7 @@ jobs: webgpu: 'true' llvm: 'true' - name: Install autogen support packages - run: sudo apt-get install -y --no-install-recommends llvm-14-dev libclang-14-dev + run: sudo apt-get install -y --no-install-recommends llvm-18-dev libclang-14-dev - name: Verify OpenCL autogen run: | cp tinygrad/runtime/autogen/opencl.py /tmp/opencl.py.bak diff --git a/autogen_stubs.sh b/autogen_stubs.sh index 484ead1ad2147..0b3918a1d57ca 100755 --- a/autogen_stubs.sh +++ b/autogen_stubs.sh @@ -233,14 +233,14 @@ generate_libc() { } generate_llvm() { - INC="$(llvm-config --includedir)" + INC="$(llvm-config-18 --includedir)" clang2py -k cdefstum \ $(find "$INC/llvm-c/" -type f -name '*.h' | sort) \ "$INC/llvm/Config/Targets.def" \ "$INC/llvm/Config/AsmPrinters.def" \ "$INC/llvm/Config/AsmParsers.def" \ "$INC/llvm/Config/Disassemblers.def" \ - --clang-args="$(llvm-config --cflags)" \ + --clang-args="$(llvm-config-18 --cflags)" \ -o "$BASE/llvm.py" sed -i "s\import ctypes\import ctypes, tinygrad.runtime.support.llvm as llvm_support\g" "$BASE/llvm.py" From 386dd0c9516d6da1e7b15ca85cf434ae8da0d9c6 Mon Sep 17 00:00:00 2001 From: ttomsa Date: Tue, 17 Jun 2025 20:39:35 +0100 Subject: [PATCH 12/17] also update libclang? --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 596c8fc6e3a8d..36ac11fc2e7e1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -97,7 +97,7 @@ jobs: webgpu: 'true' llvm: 'true' - name: Install autogen support packages - run: sudo apt-get install -y --no-install-recommends llvm-18-dev libclang-14-dev + run: sudo apt-get install -y --no-install-recommends llvm-18-dev libclang-18-dev - name: Verify OpenCL autogen run: | cp tinygrad/runtime/autogen/opencl.py /tmp/opencl.py.bak From 9f8eebd0122538e3d8ca702b4ceebc23794b2706 Mon Sep 17 00:00:00 2001 From: ttomsa Date: Thu, 19 Jun 2025 18:41:00 +0100 Subject: [PATCH 13/17] revert --- .github/workflows/test.yml | 5 +---- autogen_stubs.sh | 4 ++-- tinygrad/runtime/ops_llvm.py | 3 ++- tinygrad/runtime/support/compiler_amd.py | 1 - 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 36ac11fc2e7e1..de561824875fe 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -97,7 +97,7 @@ jobs: webgpu: 'true' llvm: 'true' - name: Install autogen support packages - run: sudo apt-get install -y --no-install-recommends llvm-18-dev libclang-18-dev + run: sudo apt-get install -y --no-install-recommends llvm-14-dev libclang-14-dev - name: Verify OpenCL autogen run: | cp tinygrad/runtime/autogen/opencl.py /tmp/opencl.py.bak @@ -924,9 +924,6 @@ jobs: run: | python -c "from tinygrad import Device; assert Device.DEFAULT == '${{ matrix.backend }}'.upper(), Device.DEFAULT" DEBUG=4 python3 test/test_tiny.py TestTiny.test_plus - - name: print llvm - if: matrix.backend == 'llvm' - run: DEBUG=7 python3 test/test_disassembly.py - name: Run pytest (${{ matrix.backend }}) run: python3 -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --durations=20 - name: Run process replay tests diff --git a/autogen_stubs.sh b/autogen_stubs.sh index 0b3918a1d57ca..5c4e684e3d52e 100755 --- a/autogen_stubs.sh +++ b/autogen_stubs.sh @@ -233,14 +233,14 @@ generate_libc() { } generate_llvm() { - INC="$(llvm-config-18 --includedir)" + INC="$(llvm-config-14 --includedir)" clang2py -k cdefstum \ $(find "$INC/llvm-c/" -type f -name '*.h' | sort) \ "$INC/llvm/Config/Targets.def" \ "$INC/llvm/Config/AsmPrinters.def" \ "$INC/llvm/Config/AsmParsers.def" \ "$INC/llvm/Config/Disassemblers.def" \ - --clang-args="$(llvm-config-18 --cflags)" \ + --clang-args="$(llvm-config-14 --cflags)" \ -o "$BASE/llvm.py" sed -i "s\import ctypes\import ctypes, tinygrad.runtime.support.llvm as llvm_support\g" "$BASE/llvm.py" diff --git a/tinygrad/runtime/ops_llvm.py b/tinygrad/runtime/ops_llvm.py index 3de54fec6faff..c628ad138b419 100644 --- a/tinygrad/runtime/ops_llvm.py +++ b/tinygrad/runtime/ops_llvm.py @@ -15,6 +15,8 @@ class LLVMCompiler(Compiler): jit = True target_arch = {'arm64': 'AArch64', 'aarch64': 'AArch64', 'x86_64': 'X86', 'AMD64': 'X86'}[platform.machine()] def __init__(self, processor:str, feats:str): + for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{self.target_arch}{component}')() + triple = {'AArch64': b'aarch64-none-unknown-elf', 'X86': b'x86_64-none-unknown-elf', 'AMDGPU': b'amdgcn-amd-amdhsa'}[self.target_arch] target = expect(llvm.LLVMGetTargetFromTriple(triple, ctypes.pointer(tgt:=llvm.LLVMTargetRef()), err:=cerr()), err, tgt) if DEBUG >= 3: print(f"LLVM init for {processor!r} with {feats!r}") @@ -63,7 +65,6 @@ def disassemble(self, lib:bytes): capstone_flatdump(lib) class HostLLVMCompiler(LLVMCompiler): def __init__(self): - for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{self.target_arch}{component}')() # +reserve-x18 here does the same thing as -ffixed-x18 in ops_cpu.py, see comments there for why it's needed on arm osx cpu, feats = ctypes.string_at(llvm.LLVMGetHostCPUName()), (b'+reserve-x18,' if OSX else b'') + ctypes.string_at(llvm.LLVMGetHostCPUFeatures()) super().__init__(cpu.decode(), feats.decode()) diff --git a/tinygrad/runtime/support/compiler_amd.py b/tinygrad/runtime/support/compiler_amd.py index 63975514d2243..71c5312e6ed98 100644 --- a/tinygrad/runtime/support/compiler_amd.py +++ b/tinygrad/runtime/support/compiler_amd.py @@ -90,7 +90,6 @@ class AMDLLVMCompiler(LLVMCompiler): jit = False target_arch = "AMDGPU" def __init__(self, arch: str): - for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{self.target_arch}{component}')() self.arch = arch super().__init__(self.arch, "+cumode") def __reduce__(self): return (AMDLLVMCompiler, (self.arch,)) From b872e8146efedd15f5feb55bc0f56d908c199703 Mon Sep 17 00:00:00 2001 From: ttomsa Date: Thu, 19 Jun 2025 18:47:52 +0100 Subject: [PATCH 14/17] add comment --- tinygrad/runtime/ops_cpu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tinygrad/runtime/ops_cpu.py b/tinygrad/runtime/ops_cpu.py index 161a3d1e4b106..e172ba3520339 100644 --- a/tinygrad/runtime/ops_cpu.py +++ b/tinygrad/runtime/ops_cpu.py @@ -11,6 +11,7 @@ def compile(self, src:str) -> bytes: # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it target = 'x86_64' if sys.platform == 'win32' else platform.machine() + # on arm march means "runs on this arch and superset" instead of "optimize for this arch". x86 march == arm mcpu arch = '-march=native' if platform.machine() in ('x86_64', 'AMD64') else '-mcpu=native' args = [arch, f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident'] arch_args = ['-ffixed-x18'] if target == 'arm64' else [] From 3d2ba9c8d4ba602d79319632c204234b7d487d4c Mon Sep 17 00:00:00 2001 From: ttomsa Date: Thu, 19 Jun 2025 18:52:11 +0100 Subject: [PATCH 15/17] skip llvm test and add comment --- test/test_disassembly.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_disassembly.py b/test/test_disassembly.py index b93269ee4613d..fd78b6e29ec16 100644 --- a/test/test_disassembly.py +++ b/test/test_disassembly.py @@ -6,7 +6,8 @@ from tinygrad.engine.realize import get_program class TestDisassembly(unittest.TestCase): - @unittest.skipUnless(Device.DEFAULT in ("CPU", "LLVM") and OSX, "m series cpus support fp16 arithmetic") + # TODO: fails on llvm. llvm.LLVMGetHostCPUName() returns "generic" + @unittest.skipUnless(Device.DEFAULT in ("CPU",) and OSX, "m series cpus support fp16 arithmetic") def test_float16_alu(self): c = Tensor([1], dtype=dtypes.float16) + Tensor([1], dtype=dtypes.float16) s = c.schedule()[-1] From 85282bbaaab3780e5d48588f52d58995fa5f3b48 Mon Sep 17 00:00:00 2001 From: ttomsa Date: Thu, 19 Jun 2025 18:54:30 +0100 Subject: [PATCH 16/17] linter --- tinygrad/runtime/support/compiler_amd.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tinygrad/runtime/support/compiler_amd.py b/tinygrad/runtime/support/compiler_amd.py index 71c5312e6ed98..fd00ea03b90fa 100644 --- a/tinygrad/runtime/support/compiler_amd.py +++ b/tinygrad/runtime/support/compiler_amd.py @@ -10,7 +10,6 @@ except AttributeError: pass # ignore if ROCm isn't installed from tinygrad.device import Compiler, CompileError from tinygrad.runtime.ops_llvm import LLVMCompiler -import tinygrad.runtime.autogen.llvm as llvm from tinygrad.helpers import OSX, to_char_p_p def amdgpu_disassemble(lib:bytes): From b72b70abb75fb485a4d657522d3a7a562ad8a20e Mon Sep 17 00:00:00 2001 From: ttomsa Date: Tue, 24 Jun 2025 23:46:01 +0100 Subject: [PATCH 17/17] update test --- test/test_disassembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_disassembly.py b/test/test_disassembly.py index fd78b6e29ec16..e908b83710ad1 100644 --- a/test/test_disassembly.py +++ b/test/test_disassembly.py @@ -11,7 +11,7 @@ class TestDisassembly(unittest.TestCase): def test_float16_alu(self): c = Tensor([1], dtype=dtypes.float16) + Tensor([1], dtype=dtypes.float16) s = c.schedule()[-1] - p = get_program(Device[Device.DEFAULT].renderer, s.ast) + p = get_program(s.ast, Device[Device.DEFAULT].renderer) lib = Device[Device.DEFAULT].compiler.compile(p.src) out = io.StringIO() with redirect_stdout(out): Device[Device.DEFAULT].compiler.disassemble(lib)