From 0204ca339510fc53ae41f2ca439c39dd48cafa45 Mon Sep 17 00:00:00 2001
From: cparish312 <54302513+cparish312@users.noreply.github.com>
Date: Mon, 4 Nov 2024 16:19:23 -0500
Subject: [PATCH 1/3] change packaging to paths to com-connor to work
---
README-sycl.md | 573 -
build.zig | 172 -
codecov.yml | 14 -
convert-hf-to-gguf.py | 2908 -
convert-llama-ggml-to-gguf.py | 441 -
convert-lora-to-ggml.py | 148 -
convert-persimmon-to-gguf.py | 138 -
convert.py | 1555 -
ggml-alloc.c | 985 -
ggml-alloc.h | 76 -
ggml-backend-impl.h | 141 -
ggml-backend.c | 2101 -
ggml-backend.h | 233 -
ggml-common.h | 1853 -
ggml-cuda.cu | 2756 -
ggml-cuda.h | 43 -
ggml-cuda/acc.cu | 47 -
ggml-cuda/acc.cuh | 5 -
ggml-cuda/alibi.cu | 63 -
ggml-cuda/alibi.cuh | 5 -
ggml-cuda/arange.cu | 34 -
ggml-cuda/arange.cuh | 5 -
ggml-cuda/argsort.cu | 103 -
ggml-cuda/argsort.cuh | 3 -
ggml-cuda/binbcast.cu | 280 -
ggml-cuda/binbcast.cuh | 6 -
ggml-cuda/clamp.cu | 35 -
ggml-cuda/clamp.cuh | 5 -
ggml-cuda/common.cuh | 551 -
ggml-cuda/concat.cu | 49 -
ggml-cuda/concat.cuh | 5 -
ggml-cuda/convert.cu | 826 -
ggml-cuda/convert.cuh | 13 -
ggml-cuda/cpy.cu | 461 -
ggml-cuda/cpy.cuh | 7 -
ggml-cuda/dequantize.cuh | 103 -
ggml-cuda/diagmask.cu | 40 -
ggml-cuda/diagmask.cuh | 5 -
ggml-cuda/dmmv.cu | 813 -
ggml-cuda/dmmv.cuh | 18 -
ggml-cuda/getrows.cu | 178 -
ggml-cuda/getrows.cuh | 5 -
ggml-cuda/im2col.cu | 104 -
ggml-cuda/im2col.cuh | 5 -
ggml-cuda/mmq.cu | 2265 -
ggml-cuda/mmq.cuh | 9 -
ggml-cuda/mmvq.cu | 406 -
ggml-cuda/mmvq.cuh | 7 -
ggml-cuda/norm.cu | 215 -
ggml-cuda/norm.cuh | 7 -
ggml-cuda/pad.cu | 49 -
ggml-cuda/pad.cuh | 5 -
ggml-cuda/pool2d.cu | 94 -
ggml-cuda/pool2d.cuh | 5 -
ggml-cuda/quantize.cu | 45 -
ggml-cuda/quantize.cuh | 5 -
ggml-cuda/rope.cu | 308 -
ggml-cuda/rope.cuh | 5 -
ggml-cuda/scale.cu | 32 -
ggml-cuda/scale.cuh | 5 -
ggml-cuda/softmax.cu | 201 -
ggml-cuda/softmax.cuh | 5 -
ggml-cuda/sumrows.cu | 40 -
ggml-cuda/sumrows.cuh | 3 -
ggml-cuda/tsembd.cu | 47 -
ggml-cuda/tsembd.cuh | 5 -
ggml-cuda/unary.cu | 240 -
ggml-cuda/unary.cuh | 27 -
ggml-cuda/upscale.cu | 48 -
ggml-cuda/upscale.cuh | 5 -
ggml-cuda/vecdotq.cuh | 1280 -
ggml-impl.h | 523 -
ggml-kompute.cpp | 2006 -
ggml-kompute.h | 46 -
ggml-metal.h | 66 -
ggml-metal.m | 2999 -
ggml-metal.metal | 6246 --
ggml-mpi.c | 216 -
ggml-mpi.h | 39 -
ggml-opencl.cpp | 2301 -
ggml-opencl.h | 36 -
ggml-quants.c | 12669 ----
ggml-quants.h | 133 -
ggml-sycl.cpp | 17888 ------
ggml-sycl.h | 49 -
ggml-vulkan-shaders.hpp | 69849 ----------------------
ggml-vulkan.cpp | 6442 --
ggml-vulkan.h | 29 -
ggml.c | 21830 -------
ggml.h | 2407 -
ggml_vk_generate_shaders.py | 2777 -
kompute | 1 -
kompute-shaders/common.comp | 102 -
kompute-shaders/op_add.comp | 58 -
kompute-shaders/op_addrow.comp | 25 -
kompute-shaders/op_cpy_f16_f16.comp | 52 -
kompute-shaders/op_cpy_f16_f32.comp | 52 -
kompute-shaders/op_cpy_f32_f16.comp | 52 -
kompute-shaders/op_cpy_f32_f32.comp | 52 -
kompute-shaders/op_diagmask.comp | 30 -
kompute-shaders/op_gelu.comp | 22 -
kompute-shaders/op_getrows.comp | 17 -
kompute-shaders/op_getrows_f16.comp | 31 -
kompute-shaders/op_getrows_q4_0.comp | 38 -
kompute-shaders/op_getrows_q4_1.comp | 39 -
kompute-shaders/op_getrows_q6_k.comp | 44 -
kompute-shaders/op_mul.comp | 52 -
kompute-shaders/op_mul_mat_f16.comp | 67 -
kompute-shaders/op_mul_mat_mat_f32.comp | 51 -
kompute-shaders/op_mul_mat_q4_0.comp | 33 -
kompute-shaders/op_mul_mat_q4_1.comp | 35 -
kompute-shaders/op_mul_mat_q6_k.comp | 94 -
kompute-shaders/op_mul_mat_q8_0.comp | 73 -
kompute-shaders/op_mul_mv_q_n.comp | 48 -
kompute-shaders/op_mul_mv_q_n_pre.comp | 22 -
kompute-shaders/op_norm.comp | 84 -
kompute-shaders/op_relu.comp | 21 -
kompute-shaders/op_rmsnorm.comp | 53 -
kompute-shaders/op_rope_f16.comp | 73 -
kompute-shaders/op_rope_f32.comp | 73 -
kompute-shaders/op_scale.comp | 19 -
kompute-shaders/op_scale_8.comp | 23 -
kompute-shaders/op_silu.comp | 22 -
kompute-shaders/op_softmax.comp | 56 -
kompute-shaders/rope_common.comp | 67 -
llama.cpp | 17816 ------
llama.h | 1130 -
sgemm.cpp | 995 -
sgemm.h | 14 -
unicode-data.cpp | 1651 -
unicode-data.h | 16 -
unicode.cpp | 277 -
unicode.h | 28 -
133 files changed, 195333 deletions(-)
delete mode 100644 README-sycl.md
delete mode 100644 build.zig
delete mode 100644 codecov.yml
delete mode 100755 convert-hf-to-gguf.py
delete mode 100755 convert-llama-ggml-to-gguf.py
delete mode 100755 convert-lora-to-ggml.py
delete mode 100755 convert-persimmon-to-gguf.py
delete mode 100755 convert.py
delete mode 100644 ggml-alloc.c
delete mode 100644 ggml-alloc.h
delete mode 100644 ggml-backend-impl.h
delete mode 100644 ggml-backend.c
delete mode 100644 ggml-backend.h
delete mode 100644 ggml-common.h
delete mode 100644 ggml-cuda.cu
delete mode 100644 ggml-cuda.h
delete mode 100644 ggml-cuda/acc.cu
delete mode 100644 ggml-cuda/acc.cuh
delete mode 100644 ggml-cuda/alibi.cu
delete mode 100644 ggml-cuda/alibi.cuh
delete mode 100644 ggml-cuda/arange.cu
delete mode 100644 ggml-cuda/arange.cuh
delete mode 100644 ggml-cuda/argsort.cu
delete mode 100644 ggml-cuda/argsort.cuh
delete mode 100644 ggml-cuda/binbcast.cu
delete mode 100644 ggml-cuda/binbcast.cuh
delete mode 100644 ggml-cuda/clamp.cu
delete mode 100644 ggml-cuda/clamp.cuh
delete mode 100644 ggml-cuda/common.cuh
delete mode 100644 ggml-cuda/concat.cu
delete mode 100644 ggml-cuda/concat.cuh
delete mode 100644 ggml-cuda/convert.cu
delete mode 100644 ggml-cuda/convert.cuh
delete mode 100644 ggml-cuda/cpy.cu
delete mode 100644 ggml-cuda/cpy.cuh
delete mode 100644 ggml-cuda/dequantize.cuh
delete mode 100644 ggml-cuda/diagmask.cu
delete mode 100644 ggml-cuda/diagmask.cuh
delete mode 100644 ggml-cuda/dmmv.cu
delete mode 100644 ggml-cuda/dmmv.cuh
delete mode 100644 ggml-cuda/getrows.cu
delete mode 100644 ggml-cuda/getrows.cuh
delete mode 100644 ggml-cuda/im2col.cu
delete mode 100644 ggml-cuda/im2col.cuh
delete mode 100644 ggml-cuda/mmq.cu
delete mode 100644 ggml-cuda/mmq.cuh
delete mode 100644 ggml-cuda/mmvq.cu
delete mode 100644 ggml-cuda/mmvq.cuh
delete mode 100644 ggml-cuda/norm.cu
delete mode 100644 ggml-cuda/norm.cuh
delete mode 100644 ggml-cuda/pad.cu
delete mode 100644 ggml-cuda/pad.cuh
delete mode 100644 ggml-cuda/pool2d.cu
delete mode 100644 ggml-cuda/pool2d.cuh
delete mode 100644 ggml-cuda/quantize.cu
delete mode 100644 ggml-cuda/quantize.cuh
delete mode 100644 ggml-cuda/rope.cu
delete mode 100644 ggml-cuda/rope.cuh
delete mode 100644 ggml-cuda/scale.cu
delete mode 100644 ggml-cuda/scale.cuh
delete mode 100644 ggml-cuda/softmax.cu
delete mode 100644 ggml-cuda/softmax.cuh
delete mode 100644 ggml-cuda/sumrows.cu
delete mode 100644 ggml-cuda/sumrows.cuh
delete mode 100644 ggml-cuda/tsembd.cu
delete mode 100644 ggml-cuda/tsembd.cuh
delete mode 100644 ggml-cuda/unary.cu
delete mode 100644 ggml-cuda/unary.cuh
delete mode 100644 ggml-cuda/upscale.cu
delete mode 100644 ggml-cuda/upscale.cuh
delete mode 100644 ggml-cuda/vecdotq.cuh
delete mode 100644 ggml-impl.h
delete mode 100644 ggml-kompute.cpp
delete mode 100644 ggml-kompute.h
delete mode 100644 ggml-metal.h
delete mode 100644 ggml-metal.m
delete mode 100644 ggml-metal.metal
delete mode 100644 ggml-mpi.c
delete mode 100644 ggml-mpi.h
delete mode 100644 ggml-opencl.cpp
delete mode 100644 ggml-opencl.h
delete mode 100644 ggml-quants.c
delete mode 100644 ggml-quants.h
delete mode 100644 ggml-sycl.cpp
delete mode 100644 ggml-sycl.h
delete mode 100644 ggml-vulkan-shaders.hpp
delete mode 100644 ggml-vulkan.cpp
delete mode 100644 ggml-vulkan.h
delete mode 100644 ggml.c
delete mode 100644 ggml.h
delete mode 100644 ggml_vk_generate_shaders.py
delete mode 160000 kompute
delete mode 100644 kompute-shaders/common.comp
delete mode 100644 kompute-shaders/op_add.comp
delete mode 100644 kompute-shaders/op_addrow.comp
delete mode 100644 kompute-shaders/op_cpy_f16_f16.comp
delete mode 100644 kompute-shaders/op_cpy_f16_f32.comp
delete mode 100644 kompute-shaders/op_cpy_f32_f16.comp
delete mode 100644 kompute-shaders/op_cpy_f32_f32.comp
delete mode 100644 kompute-shaders/op_diagmask.comp
delete mode 100644 kompute-shaders/op_gelu.comp
delete mode 100644 kompute-shaders/op_getrows.comp
delete mode 100644 kompute-shaders/op_getrows_f16.comp
delete mode 100644 kompute-shaders/op_getrows_q4_0.comp
delete mode 100644 kompute-shaders/op_getrows_q4_1.comp
delete mode 100644 kompute-shaders/op_getrows_q6_k.comp
delete mode 100644 kompute-shaders/op_mul.comp
delete mode 100644 kompute-shaders/op_mul_mat_f16.comp
delete mode 100644 kompute-shaders/op_mul_mat_mat_f32.comp
delete mode 100644 kompute-shaders/op_mul_mat_q4_0.comp
delete mode 100644 kompute-shaders/op_mul_mat_q4_1.comp
delete mode 100644 kompute-shaders/op_mul_mat_q6_k.comp
delete mode 100644 kompute-shaders/op_mul_mat_q8_0.comp
delete mode 100644 kompute-shaders/op_mul_mv_q_n.comp
delete mode 100644 kompute-shaders/op_mul_mv_q_n_pre.comp
delete mode 100644 kompute-shaders/op_norm.comp
delete mode 100644 kompute-shaders/op_relu.comp
delete mode 100644 kompute-shaders/op_rmsnorm.comp
delete mode 100644 kompute-shaders/op_rope_f16.comp
delete mode 100644 kompute-shaders/op_rope_f32.comp
delete mode 100644 kompute-shaders/op_scale.comp
delete mode 100644 kompute-shaders/op_scale_8.comp
delete mode 100644 kompute-shaders/op_silu.comp
delete mode 100644 kompute-shaders/op_softmax.comp
delete mode 100644 kompute-shaders/rope_common.comp
delete mode 100644 llama.cpp
delete mode 100644 llama.h
delete mode 100644 sgemm.cpp
delete mode 100644 sgemm.h
delete mode 100644 unicode-data.cpp
delete mode 100644 unicode-data.h
delete mode 100644 unicode.cpp
delete mode 100644 unicode.h
diff --git a/README-sycl.md b/README-sycl.md
deleted file mode 100644
index dc98c7b3ebd..00000000000
--- a/README-sycl.md
+++ /dev/null
@@ -1,573 +0,0 @@
-# llama.cpp for SYCL
-
-- [Background](#background)
-- [News](#news)
-- [OS](#os)
-- [Hardware](#hardware)
-- [Docker](#docker)
-- [Linux](#linux)
-- [Windows](#windows)
-- [Environment Variable](#environment-variable)
-- [Known Issue](#known-issues)
-- [Q&A](#qa)
-- [TODO](#todo)
-
-## Background
-
-**SYCL** is a high-level parallel programming model designed to improve developers productivity writing code across various hardware accelerators such as CPUs, GPUs, and FPGAs. It is a single-source language designed for heterogeneous computing and based on standard C++17.
-
-**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
-
-- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
-- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*.
-- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
-- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
-
-### Llama.cpp + SYCL
-
-The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*).
-
-When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
-
-It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
-
-## News
-
-- 2024.4
- - Support data types: GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M.
-
-- 2024.3
- - Release binary files of Windows.
- - A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
- - New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
- - Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing.
- - Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE.
- - Support detecting all GPUs with level-zero and same top **Max compute units**.
- - Support OPs
- - hardsigmoid
- - hardswish
- - pool2d
-
-- 2024.1
- - Create SYCL backend for Intel GPU.
- - Support Windows build
-
-## OS
-
-| OS | Status | Verified |
-|---------|---------|------------------------------------|
-| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39 |
-| Windows | Support | Windows 11 |
-
-
-## Hardware
-
-### Intel GPU
-
-**Verified devices**
-
-| Intel GPU | Status | Verified Model |
-|-------------------------------|---------|---------------------------------------|
-| Intel Data Center Max Series | Support | Max 1550, 1100 |
-| Intel Data Center Flex Series | Support | Flex 170 |
-| Intel Arc Series | Support | Arc 770, 730M |
-| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
-| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |
-
-*Notes:*
-
-- **Memory**
- - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`.
-
- - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU.
-
-- **Execution Unit (EU)**
- - If the iGPU has less than 80 EUs, the inference speed will likely be too slow for practical use.
-
-### Other Vendor GPU
-
-**Verified devices**
-
-| Nvidia GPU | Status | Verified Model |
-|--------------------------|---------|----------------|
-| Ampere Series | Support | A100, A4000 |
-| Ampere Series *(Mobile)* | Support | RTX 40 Series |
-
-## Docker
-The docker build option is currently limited to *intel GPU* targets.
-
-### Build image
-```sh
-# Using FP16
-docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
-```
-
-*Notes*:
-
-To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
-
-You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative.
-
-### Run container
-
-```sh
-# First, find all the DRI cards
-ls -la /dev/dri
-# Then, pick the card that you want to use (here for e.g. /dev/dri/card1).
-docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
-```
-
-*Notes:*
-- Docker has been tested successfully on native Linux. WSL support has not been verified yet.
-- You may need to install Intel GPU driver on the **host** machine *(Please refer to the [Linux configuration](#linux) for details)*.
-
-## Linux
-
-### I. Setup Environment
-
-1. **Install GPU drivers**
-
- - **Intel GPU**
-
-Intel data center GPUs drivers installation guide and download page can be found here: [Get intel dGPU Drivers](https://dgpu-docs.intel.com/driver/installation.html#ubuntu-install-steps).
-
-*Note*: for client GPUs *(iGPU & Arc A-Series)*, please refer to the [client iGPU driver installation](https://dgpu-docs.intel.com/driver/client/overview.html).
-
-Once installed, add the user(s) to the `video` and `render` groups.
-
-```sh
-sudo usermod -aG render $USER
-sudo usermod -aG video $USER
-```
-
-*Note*: logout/re-login for the changes to take effect.
-
-Verify installation through `clinfo`:
-
-```sh
-sudo apt install clinfo
-sudo clinfo -l
-```
-
-Sample output:
-
-```sh
-Platform #0: Intel(R) OpenCL Graphics
- `-- Device #0: Intel(R) Arc(TM) A770 Graphics
-
-Platform #0: Intel(R) OpenCL HD Graphics
- `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
-```
-
-- **Nvidia GPU**
-
-In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.
-
-2. **Install Intel® oneAPI Base toolkit**
-
-- **For Intel GPU**
-
-The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
-
-Please follow the instructions for downloading and installing the Toolkit for Linux, and preferably keep the default installation values unchanged, notably the installation path *(`/opt/intel/oneapi` by default)*.
-
-Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
-
-Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs.
-
-- **Adding support to Nvidia GPUs**
-
-**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
-
-
-**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
-
-```sh
-git clone https://github.com/oneapi-src/oneMKL
-cd oneMKL
-mkdir -p buildWithCublas && cd buildWithCublas
-cmake ../ -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
-make
-```
-
-
-3. **Verify installation and environment**
-
-In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
-```sh
-source /opt/intel/oneapi/setvars.sh
-sycl-ls
-```
-
-- **Intel GPU**
-
-When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`ext_oneapi_level_zero:gpu:0`] in the sample output below:
-
-```
-[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
-[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
-[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
-[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
-```
-
-- **Nvidia GPU**
-
-Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
-```
-[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix]
-[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
-[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.2]
-```
-
-### II. Build llama.cpp
-
-#### Intel GPU
-```sh
-# Export relevant ENV variables
-source /opt/intel/oneapi/setvars.sh
-
-# Build LLAMA with MKL BLAS acceleration for intel GPU
-mkdir -p build && cd build
-
-# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-# Option 2: Use FP16
-cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-
-#build all binary
-cmake --build . --config Release -j -v
-```
-
-#### Nvidia GPU
-```sh
-# Export relevant ENV variables
-export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
-export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
-export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
-export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
-
-# Build LLAMA with Nvidia BLAS acceleration through SYCL
-mkdir -p build && cd build
-
-# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-# Option 2: Use FP16
-cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-
-#build all binary
-cmake --build . --config Release -j -v
-
-```
-
-### III. Run the inference
-
-1. Retrieve and prepare model
-
-You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
-
-2. Enable oneAPI running environment
-
-```sh
-source /opt/intel/oneapi/setvars.sh
-```
-
-3. List devices information
-
-Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
-
-```sh
-./build/bin/ls-sycl-device
-```
-A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
-```
-found 6 SYCL devices:
-| | | |Compute |Max compute|Max work|Max sub| |
-|ID| Device Type| Name|capability|units |group |group |Global mem size|
-|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
-| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
-| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
-| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
-| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
-| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
-| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
-```
-
-| Attribute | Note |
-|------------------------|-------------------------------------------------------------|
-| compute capability 1.3 | Level-zero driver/runtime, recommended |
-| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |
-
-4. Launch inference
-
-There are two device selection modes:
-
-- Single device: Use one device target specified by the user.
-- Multiple devices: Automatically select the devices with the same largest Max compute-units.
-
-| Device selection | Parameter |
-|------------------|----------------------------------------|
-| Single device | --split-mode none --main-gpu DEVICE_ID |
-| Multiple devices | --split-mode layer (default) |
-
-Examples:
-
-- Use device 0:
-
-```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0
-```
-or run by script:
-
-```sh
-./examples/sycl/run_llama2.sh 0
-```
-
-- Use multiple devices:
-
-```sh
-ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer
-```
-
-Otherwise, you can run the script:
-
-```sh
-./examples/sycl/run_llama2.sh
-```
-
-*Notes:*
-
-- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
-
-```sh
-detect 1 SYCL GPUs: [0] with top Max compute units:512
-```
-Or
-```sh
-use 1 SYCL GPUs: [0] with Max compute units:512
-```
-
-## Windows
-
-### I. Setup Environment
-
-1. Install GPU driver
-
-Intel GPU drivers instructions guide and download page can be found here: [Get intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).
-
-2. Install Visual Studio
-
-If you already have a recent version of Microsoft Visual Studio, you can skip this step. Otherwise, please refer to the official download page for [Microsoft Visual Studio](https://visualstudio.microsoft.com/).
-
-3. Install Intel® oneAPI Base toolkit
-
-The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page.
-
-Please follow the instructions for downloading and installing the Toolkit for Windows, and preferably keep the default installation values unchanged, notably the installation path *(`C:\Program Files (x86)\Intel\oneAPI` by default)*.
-
-Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable.
-
-b. Enable oneAPI running environment:
-
-- Type "oneAPI" in the search bar, then open the `Intel oneAPI command prompt for Intel 64 for Visual Studio 2022` App.
-
-- On the command prompt, enable the runtime environment with the following:
-```
-"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
-```
-
-c. Verify installation
-
-In the oneAPI command line, run the following to print the available SYCL devices:
-
-```
-sycl-ls
-```
-
-There should be one or more *level-zero* GPU devices displayed as **[ext_oneapi_level_zero:gpu]**. Below is example of such output detecting an *intel Iris Xe* GPU as a Level-zero SYCL device:
-
-Output (example):
-```
-[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
-[opencl:cpu:1] Intel(R) OpenCL, 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
-[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Iris(R) Xe Graphics OpenCL 3.0 NEO [31.0.101.5186]
-[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
-```
-
-4. Install build tools
-
-a. Download & install cmake for Windows: https://cmake.org/download/
-
-b. Download & install mingw-w64 make for Windows provided by w64devkit
-
-- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).
-
-- Extract `w64devkit` on your pc.
-
-- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).
-
-### II. Build llama.cpp
-
-On the oneAPI command line window, step into the llama.cpp main directory and run the following:
-
-```
-mkdir -p build
-cd build
-@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
-
-# Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
-
-# Option 2: Or FP16
-cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
-
-make -j
-```
-
-Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions:
-```sh
-.\examples\sycl\win-build-sycl.bat
-```
-
-*Notes:*
-
-- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make main`.
-
-### III. Run the inference
-
-1. Retrieve and prepare model
-
-You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example.
-
-2. Enable oneAPI running environment
-
-On the oneAPI command line window, run the following and step into the llama.cpp directory:
-```
-"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
-```
-
-3. List devices information
-
-Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
-
-```
-build\bin\ls-sycl-device.exe
-```
-
-The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following:
-```
-found 6 SYCL devices:
-| | | |Compute |Max compute|Max work|Max sub| |
-|ID| Device Type| Name|capability|units |group |group |Global mem size|
-|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
-| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136|
-| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216|
-| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136|
-| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216|
-| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616|
-| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616|
-
-```
-
-| Attribute | Note |
-|------------------------|-----------------------------------------------------------|
-| compute capability 1.3 | Level-zero running time, recommended |
-| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |
-
-
-4. Launch inference
-
-There are two device selection modes:
-
-- Single device: Use one device assigned by user.
-- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
-
-| Device selection | Parameter |
-|------------------|----------------------------------------|
-| Single device | --split-mode none --main-gpu DEVICE_ID |
-| Multiple devices | --split-mode layer (default) |
-
-Examples:
-
-- Use device 0:
-
-```
-build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0
-```
-
-- Use multiple devices:
-
-```
-build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer
-```
-Otherwise, run the following wrapper script:
-
-```
-.\examples\sycl\win-run-llama2.bat
-```
-
-Note:
-
-- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow:
-
-```sh
-detect 1 SYCL GPUs: [0] with top Max compute units:512
-```
-Or
-```sh
-use 1 SYCL GPUs: [0] with Max compute units:512
-```
-
-## Environment Variable
-
-#### Build
-
-| Name | Value | Function |
-|--------------------|-----------------------------------|---------------------------------------------|
-| LLAMA_SYCL | ON (mandatory) | Enable build with SYCL code path. |
-| LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. |
-| LLAMA_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
-| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. |
-| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
-
-#### Runtime
-
-| Name | Value | Function |
-|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
-| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
-| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer |
-
-## Known Issues
-
-- `Split-mode:[row]` is not supported.
-
-## Q&A
-
-- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
-
- - Potential cause: Unavailable oneAPI installation or not set ENV variables.
- - Solution: Install *oneAPI base toolkit* and enable its ENV through: `source /opt/intel/oneapi/setvars.sh`.
-
-- General compiler error:
-
- - Remove **build** folder or try a clean-build.
-
-- I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux.
-
- Please double-check with `sudo sycl-ls`.
-
- If it's present in the list, please add video/render group to your user then **logout/login** or restart your system:
-
- ```
- sudo usermod -aG render $USER
- sudo usermod -aG video $USER
- ```
- Otherwise, please double-check the GPU driver installation steps.
-
-### **GitHub contribution**:
-Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay.
-
-## TODO
-
-- Support row layer split for multiple card runs.
diff --git a/build.zig b/build.zig
deleted file mode 100644
index 96783574fe7..00000000000
--- a/build.zig
+++ /dev/null
@@ -1,172 +0,0 @@
-// Compatible with Zig Version 0.11.0
-const std = @import("std");
-const ArrayList = std.ArrayList;
-const Compile = std.Build.Step.Compile;
-const ConfigHeader = std.Build.Step.ConfigHeader;
-const Mode = std.builtin.Mode;
-const CrossTarget = std.zig.CrossTarget;
-
-const Maker = struct {
- builder: *std.build.Builder,
- target: CrossTarget,
- optimize: Mode,
- enable_lto: bool,
-
- include_dirs: ArrayList([]const u8),
- cflags: ArrayList([]const u8),
- cxxflags: ArrayList([]const u8),
- objs: ArrayList(*Compile),
-
- fn addInclude(m: *Maker, dir: []const u8) !void {
- try m.include_dirs.append(dir);
- }
- fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
- try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
- }
- fn addCFlag(m: *Maker, flag: []const u8) !void {
- try m.cflags.append(flag);
- }
- fn addCxxFlag(m: *Maker, flag: []const u8) !void {
- try m.cxxflags.append(flag);
- }
- fn addFlag(m: *Maker, flag: []const u8) !void {
- try m.addCFlag(flag);
- try m.addCxxFlag(flag);
- }
-
- fn init(builder: *std.build.Builder) !Maker {
- const target = builder.standardTargetOptions(.{});
- const zig_version = @import("builtin").zig_version_string;
- const commit_hash = try std.ChildProcess.exec(
- .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
- );
- try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
- \\int LLAMA_BUILD_NUMBER = {};
- \\char const *LLAMA_COMMIT = "{s}";
- \\char const *LLAMA_COMPILER = "Zig {s}";
- \\char const *LLAMA_BUILD_TARGET = "{s}";
- \\
- , .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
- var m = Maker{
- .builder = builder,
- .target = target,
- .optimize = builder.standardOptimizeOption(.{}),
- .enable_lto = false,
- .include_dirs = ArrayList([]const u8).init(builder.allocator),
- .cflags = ArrayList([]const u8).init(builder.allocator),
- .cxxflags = ArrayList([]const u8).init(builder.allocator),
- .objs = ArrayList(*Compile).init(builder.allocator),
- };
-
- try m.addCFlag("-std=c11");
- try m.addCxxFlag("-std=c++11");
- try m.addProjectInclude(&.{});
- try m.addProjectInclude(&.{"common"});
- return m;
- }
-
- fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
- const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
- if (o.target.getAbi() != .msvc)
- o.defineCMacro("_GNU_SOURCE", null);
-
- if (std.mem.endsWith(u8, src, ".c")) {
- o.addCSourceFiles(&.{src}, m.cflags.items);
- o.linkLibC();
- } else {
- o.addCSourceFiles(&.{src}, m.cxxflags.items);
- if (o.target.getAbi() == .msvc) {
- o.linkLibC(); // need winsdk + crt
- } else {
- // linkLibCpp already add (libc++ + libunwind + libc)
- o.linkLibCpp();
- }
- }
- for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
- o.want_lto = m.enable_lto;
- return o;
- }
-
- fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
- const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
- e.addCSourceFiles(&.{src}, m.cxxflags.items);
- for (deps) |d| e.addObject(d);
- for (m.objs.items) |o| e.addObject(o);
- for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
-
- // https://github.com/ziglang/zig/issues/15448
- if (e.target.getAbi() == .msvc) {
- e.linkLibC(); // need winsdk + crt
- } else {
- // linkLibCpp already add (libc++ + libunwind + libc)
- e.linkLibCpp();
- }
- m.builder.installArtifact(e);
- e.want_lto = m.enable_lto;
- return e;
- }
-};
-
-pub fn build(b: *std.build.Builder) !void {
- var make = try Maker.init(b);
- make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
-
- const ggml = make.obj("ggml", "ggml.c");
- const sgemm = make.obj("sgemm", "sgemm.cpp");
- const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
- const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
- const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
- const unicode = make.obj("unicode", "unicode.cpp");
- const unicode_data = make.obj("unicode-data", "unicode-data.cpp");
- const llama = make.obj("llama", "llama.cpp");
- const buildinfo = make.obj("common", "common/build-info.cpp");
- const common = make.obj("common", "common/common.cpp");
- const console = make.obj("console", "common/console.cpp");
- const sampling = make.obj("sampling", "common/sampling.cpp");
- const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
- const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp");
- const train = make.obj("train", "common/train.cpp");
- const clip = make.obj("clip", "examples/llava/clip.cpp");
- const llava = make.obj("llava", "examples/llava/llava.cpp");
-
- _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
- _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
- _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
- _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
- _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
- _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
-
- const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
- if (server.target.isWindows()) {
- server.linkSystemLibrary("ws2_32");
- }
-
- const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
- for (server_assets) |asset| {
- const input_path = b.fmt("examples/server/public/{s}", .{asset});
- const output_path = b.fmt("examples/server/{s}.hpp", .{asset});
-
- // Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`:
-
- const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
- defer b.allocator.free(input);
-
- var buf = std.ArrayList(u8).init(b.allocator);
- defer buf.deinit();
-
- for (input) |byte| {
- try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
- }
-
- var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_");
- defer b.allocator.free(name);
- std.mem.replaceScalar(u8, name, '.', '_');
-
- try std.fs.cwd().writeFile(output_path, b.fmt(
- "unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
- .{ name, buf.items, name, input.len },
- ));
-
- std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
- }
-}
diff --git a/codecov.yml b/codecov.yml
deleted file mode 100644
index a301c5b2c76..00000000000
--- a/codecov.yml
+++ /dev/null
@@ -1,14 +0,0 @@
-comment: off
-
-coverage:
- status:
- project:
- default:
- target: auto
- threshold: 0
- base: auto
- patch:
- default:
- target: auto
- threshold: 0
- base: auto
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
deleted file mode 100755
index 5763b6664e8..00000000000
--- a/convert-hf-to-gguf.py
+++ /dev/null
@@ -1,2908 +0,0 @@
-#!/usr/bin/env python3
-
-from __future__ import annotations
-
-import argparse
-import contextlib
-import json
-import os
-import re
-import sys
-from abc import ABC, abstractmethod
-from enum import IntEnum
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast
-
-import numpy as np
-import torch
-
-if TYPE_CHECKING:
- from torch import Tensor
-
-if 'NO_LOCAL_GGUF' not in os.environ:
- sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
-import gguf
-
-from convert import LlamaHfVocab, permute
-
-
-###### MODEL DEFINITIONS ######
-
-class SentencePieceTokenTypes(IntEnum):
- NORMAL = 1
- UNKNOWN = 2
- CONTROL = 3
- USER_DEFINED = 4
- UNUSED = 5
- BYTE = 6
-
-
-AnyModel = TypeVar("AnyModel", bound="type[Model]")
-
-
-class Model(ABC):
- _model_classes: dict[str, type[Model]] = {}
-
- def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool):
- self.dir_model = dir_model
- self.ftype = ftype
- self.fname_out = fname_out
- self.is_big_endian = is_big_endian
- self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
- self.use_temp_file = use_temp_file
- self.is_safetensors = self._is_model_safetensors()
- self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
- self.part_names = self._get_part_names()
- self.hparams = Model.load_hparams(self.dir_model)
- self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
- self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
-
- @property
- @abstractmethod
- def model_arch(self) -> gguf.MODEL_ARCH:
- pass
-
- def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any:
- key = next((k for k in keys if k in self.hparams), None)
- if key is not None:
- return self.hparams[key]
- if optional:
- return None
- raise KeyError(f"could not find any of: {keys}")
-
- def set_vocab(self):
- self._set_vocab_gpt2()
-
- def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
- for part_name in self.part_names:
- print(f"gguf: loading model part '{part_name}'")
- ctx: ContextManager[Any]
- if self.is_safetensors:
- from safetensors import safe_open
- ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
- else:
- ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
-
- with ctx as model_part:
- for name in model_part.keys():
- data = model_part.get_tensor(name) if self.is_safetensors else model_part[name]
- yield name, data
-
- def set_gguf_parameters(self):
- self.gguf_writer.add_name(self.dir_model.name)
- self.gguf_writer.add_block_count(self.block_count)
-
- if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
- self.gguf_writer.add_context_length(n_ctx)
- print(f"gguf: context length = {n_ctx}")
-
- n_embd = self.find_hparam(["hidden_size", "n_embd"])
- self.gguf_writer.add_embedding_length(n_embd)
- print(f"gguf: embedding length = {n_embd}")
-
- if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
- self.gguf_writer.add_feed_forward_length(n_ff)
- print(f"gguf: feed forward length = {n_ff}")
-
- n_head = self.find_hparam(["num_attention_heads", "n_head"])
- self.gguf_writer.add_head_count(n_head)
- print(f"gguf: head count = {n_head}")
-
- if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
- self.gguf_writer.add_head_count_kv(n_head_kv)
- print(f"gguf: key-value head count = {n_head_kv}")
-
- if (rope_theta := self.hparams.get("rope_theta")) is not None:
- self.gguf_writer.add_rope_freq_base(rope_theta)
- print(f"gguf: rope theta = {rope_theta}")
- if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
- self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
- print(f"gguf: rms norm epsilon = {f_rms_eps}")
- if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
- self.gguf_writer.add_layer_norm_eps(f_norm_eps)
- print(f"gguf: layer norm epsilon = {f_norm_eps}")
- if (n_experts := self.hparams.get("num_local_experts")) is not None:
- self.gguf_writer.add_expert_count(n_experts)
- print(f"gguf: expert count = {n_experts}")
- if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
- self.gguf_writer.add_expert_used_count(n_experts_used)
- print(f"gguf: experts used count = {n_experts_used}")
-
- self.gguf_writer.add_file_type(self.ftype)
- print(f"gguf: file type = {self.ftype}")
-
- def write_tensors(self):
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
- for name, data_torch in self.get_tensors():
- # we don't need these
- if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
- continue
-
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.squeeze().numpy()
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
-
- def write(self):
- self.write_tensors()
- self.gguf_writer.write_header_to_file()
- self.gguf_writer.write_kv_data_to_file()
- self.gguf_writer.write_tensors_to_file()
- self.gguf_writer.close()
-
- def write_vocab(self):
- self.gguf_writer.write_header_to_file()
- self.gguf_writer.write_kv_data_to_file()
- self.gguf_writer.close()
-
- @staticmethod
- def count_model_parts(dir_model: Path, prefix: str) -> int:
- num_parts = 0
- for filename in os.listdir(dir_model):
- if filename.endswith(prefix):
- num_parts += 1
-
- return num_parts
-
- @staticmethod
- def load_hparams(dir_model):
- with open(dir_model / "config.json", "r", encoding="utf-8") as f:
- return json.load(f)
-
- @classmethod
- def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
- assert names
-
- def func(modelcls: type[Model]):
- for name in names:
- cls._model_classes[name] = modelcls
- return modelcls
- return func
-
- @classmethod
- def from_model_architecture(cls, arch):
- try:
- return cls._model_classes[arch]
- except KeyError:
- raise NotImplementedError(f'Architecture {arch!r} not supported!') from None
-
- def _is_model_safetensors(self) -> bool:
- return Model.count_model_parts(self.dir_model, ".safetensors") > 0
-
- def _get_part_names(self):
- if self.is_safetensors:
- if self.num_parts == 1: # there's only one .safetensors file
- return ("model.safetensors",)
- return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1))
-
- if self.num_parts == 1: # there's only one .bin file
- return ("pytorch_model.bin",)
- return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
-
- # used for GPT-2 BPE and WordPiece vocabs
- def get_basic_vocab(self) -> tuple[list[str], list[int]]:
- tokens: list[str] = []
- toktypes: list[int] = []
-
- from transformers import AutoTokenizer
- tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
- vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
- assert max(tokenizer.vocab.values()) < vocab_size
-
- reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
- added_vocab = tokenizer.get_added_vocab()
-
- for i in range(vocab_size):
- if i not in reverse_vocab:
- tokens.append(f"[PAD{i}]")
- toktypes.append(gguf.TokenType.USER_DEFINED)
- elif reverse_vocab[i] in added_vocab:
- tokens.append(reverse_vocab[i])
- if tokenizer.added_tokens_decoder[i].special:
- toktypes.append(gguf.TokenType.CONTROL)
- else:
- toktypes.append(gguf.TokenType.USER_DEFINED)
- else:
- tokens.append(reverse_vocab[i])
- toktypes.append(gguf.TokenType.NORMAL)
-
- return tokens, toktypes
-
- def _set_vocab_gpt2(self) -> None:
- tokens, toktypes = self.get_basic_vocab()
- self.gguf_writer.add_tokenizer_model("gpt2")
- self.gguf_writer.add_token_list(tokens)
- self.gguf_writer.add_token_types(toktypes)
-
- special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
- special_vocab.add_to_gguf(self.gguf_writer)
-
- def _set_vocab_qwen(self):
- dir_model = self.dir_model
- hparams = self.hparams
- tokens: list[str] = []
- toktypes: list[int] = []
-
- from transformers import AutoTokenizer
- tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
- vocab_size = hparams["vocab_size"]
- assert max(tokenizer.get_vocab().values()) < vocab_size
-
- merges = []
- vocab = {}
- mergeable_ranks = tokenizer.mergeable_ranks
- for token, rank in mergeable_ranks.items():
- vocab[QwenModel.token_bytes_to_string(token)] = rank
- if len(token) == 1:
- continue
- merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
- assert len(merged) == 2
- merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
-
- # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
- added_vocab = tokenizer.special_tokens
- reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
-
- for i in range(vocab_size):
- if i not in reverse_vocab:
- tokens.append(f"[PAD{i}]")
- toktypes.append(gguf.TokenType.USER_DEFINED)
- elif reverse_vocab[i] in added_vocab:
- tokens.append(reverse_vocab[i])
- toktypes.append(gguf.TokenType.CONTROL)
- else:
- tokens.append(reverse_vocab[i])
- toktypes.append(gguf.TokenType.NORMAL)
-
- self.gguf_writer.add_tokenizer_model("gpt2")
- self.gguf_writer.add_token_list(tokens)
- self.gguf_writer.add_token_types(toktypes)
-
- special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
- special_vocab.merges = merges
- # only add special tokens when they were not already loaded from config.json
- if len(special_vocab.special_token_ids) == 0:
- special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
- special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
- # this one is usually not in config.json anyway
- special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
- special_vocab.add_to_gguf(self.gguf_writer)
-
- def _set_vocab_sentencepiece(self):
- from sentencepiece import SentencePieceProcessor
-
- tokenizer_path = self.dir_model / 'tokenizer.model'
-
- tokens: list[bytes] = []
- scores: list[float] = []
- toktypes: list[int] = []
-
- if not tokenizer_path.is_file():
- raise FileNotFoundError(f"File not found: {tokenizer_path}")
-
- tokenizer = SentencePieceProcessor(str(tokenizer_path))
- vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
-
- for token_id in range(tokenizer.vocab_size()):
- piece = tokenizer.id_to_piece(token_id)
- text = piece.encode("utf-8")
- score = tokenizer.get_score(token_id)
-
- toktype = SentencePieceTokenTypes.NORMAL
- if tokenizer.is_unknown(token_id):
- toktype = SentencePieceTokenTypes.UNKNOWN
- elif tokenizer.is_control(token_id):
- toktype = SentencePieceTokenTypes.CONTROL
- elif tokenizer.is_unused(token_id):
- toktype = SentencePieceTokenTypes.UNUSED
- elif tokenizer.is_byte(token_id):
- toktype = SentencePieceTokenTypes.BYTE
-
- tokens.append(text)
- scores.append(score)
- toktypes.append(toktype)
-
- added_tokens_file = self.dir_model / 'added_tokens.json'
- if added_tokens_file.is_file():
- with open(added_tokens_file, "r", encoding="utf-8") as f:
- added_tokens_json = json.load(f)
-
- for key in added_tokens_json:
- key = key.encode("utf-8")
- if key not in tokens:
- tokens.append(key)
- scores.append(-1000.0)
- toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
-
- if vocab_size > len(tokens):
- pad_count = vocab_size - len(tokens)
- print(
- f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]"
- )
- for i in range(1, pad_count + 1):
- tokens.append(f"[PAD{i}]")
- scores.append(-1000.0)
- toktypes.append(SentencePieceTokenTypes.UNUSED)
-
- assert len(tokens) == vocab_size
-
- self.gguf_writer.add_tokenizer_model("llama")
- self.gguf_writer.add_token_list(tokens)
- self.gguf_writer.add_token_scores(scores)
- self.gguf_writer.add_token_types(toktypes)
-
- special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
- special_vocab.add_to_gguf(self.gguf_writer)
-
- def _set_vocab_llama_hf(self):
- vocab = LlamaHfVocab(self.dir_model)
- tokens = []
- scores = []
- toktypes = []
-
- for text, score, toktype in vocab.all_tokens():
- tokens.append(text)
- scores.append(score)
- toktypes.append(toktype)
-
- assert len(tokens) == vocab.vocab_size
-
- self.gguf_writer.add_tokenizer_model("llama")
- self.gguf_writer.add_token_list(tokens)
- self.gguf_writer.add_token_scores(scores)
- self.gguf_writer.add_token_types(toktypes)
-
- special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
- special_vocab.add_to_gguf(self.gguf_writer)
-
-
-@Model.register("GPTNeoXForCausalLM")
-class GPTNeoXModel(Model):
- model_arch = gguf.MODEL_ARCH.GPTNEOX
-
- def set_gguf_parameters(self):
- block_count = self.hparams["num_hidden_layers"]
-
- self.gguf_writer.add_name(self.dir_model.name)
- self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
- self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
- self.gguf_writer.add_rope_dimension_count(
- int(self.hparams["rotary_pct"] * (self.hparams["hidden_size"] // self.hparams["num_attention_heads"])),
- )
- self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
- self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
- self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
-
-
-@Model.register("BloomForCausalLM")
-class BloomModel(Model):
- model_arch = gguf.MODEL_ARCH.BLOOM
-
- def set_gguf_parameters(self):
- self.gguf_writer.add_name("Bloom")
- n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
- n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
- self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
- self.gguf_writer.add_embedding_length(n_embed)
- self.gguf_writer.add_feed_forward_length(4 * n_embed)
- self.gguf_writer.add_block_count(self.hparams["n_layer"])
- self.gguf_writer.add_head_count(n_head)
- self.gguf_writer.add_head_count_kv(n_head)
- self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
- self.gguf_writer.add_file_type(self.ftype)
-
- def write_tensors(self):
- block_count = self.hparams["n_layer"]
- tensors = dict(self.get_tensors())
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
- has_lm_head = True
- n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
- n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
-
- for name, data_torch in tensors.items():
- if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys():
- has_lm_head = False
-
- name = re.sub(r'transformer\.', '', name)
-
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.squeeze().numpy()
-
- if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
- # Map bloom-style qkv_linear to gpt-style qkv_linear
- # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
- # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
- qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
- data = np.concatenate(
- (
- qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
- qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
- qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
- ),
- axis=0,
- )
- print("re-format attention.linear_qkv.weight")
- elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
- qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
- data = np.concatenate(
- (
- qkv_bias[:, 0, :].reshape((n_embed,)),
- qkv_bias[:, 1, :].reshape((n_embed,)),
- qkv_bias[:, 2, :].reshape((n_embed,)),
- ),
- axis=0,
- )
- print("re-format attention.linear_qkv.bias")
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
-
- if not has_lm_head and name == "word_embeddings.weight":
- self.gguf_writer.add_tensor("output.weight", data)
- print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
-
-
-@Model.register("MPTForCausalLM")
-class MPTModel(Model):
- model_arch = gguf.MODEL_ARCH.MPT
-
- def set_vocab(self):
- try:
- self._set_vocab_gpt2()
- except Exception:
- # Fallback for SEA-LION model
- self._set_vocab_sentencepiece()
- self.gguf_writer.add_add_bos_token(False)
- self.gguf_writer.add_pad_token_id(3)
- self.gguf_writer.add_eos_token_id(1)
- self.gguf_writer.add_unk_token_id(0)
-
- def set_gguf_parameters(self):
- block_count = self.hparams["n_layers"]
- self.gguf_writer.add_name(self.dir_model.name)
- self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
- self.gguf_writer.add_embedding_length(self.hparams["d_model"])
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_feed_forward_length(4 * self.hparams["d_model"])
- self.gguf_writer.add_head_count(self.hparams["n_heads"])
- if kv_n_heads := self.hparams["attn_config"].get("kv_n_heads"):
- self.gguf_writer.add_head_count_kv(kv_n_heads)
- self.gguf_writer.add_layer_norm_eps(1e-5)
- if self.hparams["attn_config"]["clip_qkv"] is not None:
- self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"])
- if self.hparams["attn_config"]["alibi"]:
- self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"])
- else:
- self.gguf_writer.add_max_alibi_bias(0.0)
-
- def write_tensors(self):
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers"))
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
- for name, data_torch in self.get_tensors():
- # we don't need these
- if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
- continue
-
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.squeeze().numpy()
-
- # map tensor names
- if "scales" in name:
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
- if new_name is not None:
- new_name = new_name.replace("scales", "act.scales")
- else:
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
-
-
-@Model.register("OrionForCausalLM")
-class OrionModel(Model):
- model_arch = gguf.MODEL_ARCH.ORION
-
- def set_vocab(self):
- self._set_vocab_sentencepiece()
-
- def set_gguf_parameters(self):
- block_count = self.hparams["num_hidden_layers"]
- head_count = self.hparams["num_attention_heads"]
- head_count_kv = self.hparams.get("num_key_value_heads", head_count)
- hf_repo = self.hparams.get("_name_or_path", "")
-
- ctx_length = 0
- if "max_sequence_length" in self.hparams:
- ctx_length = self.hparams["max_sequence_length"]
- elif "max_position_embeddings" in self.hparams:
- ctx_length = self.hparams["max_position_embeddings"]
- elif "model_max_length" in self.hparams:
- ctx_length = self.hparams["model_max_length"]
- else:
- print("gguf: can not find ctx length parameter.")
- sys.exit()
-
- self.gguf_writer.add_file_type(self.ftype)
- self.gguf_writer.add_name(self.dir_model.name)
- self.gguf_writer.add_source_hf_repo(hf_repo)
- self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
- self.gguf_writer.add_context_length(ctx_length)
- self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
- self.gguf_writer.add_head_count(head_count)
- self.gguf_writer.add_head_count_kv(head_count_kv)
- # note: config provides rms norm but it is actually layer norm
- # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571
- self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])
-
- def write_tensors(self):
- # Collect tensors from generator object
- model_kv = dict(self.get_tensors())
- block_count = self.hparams["num_hidden_layers"]
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
- for name, data_torch in model_kv.items():
- # we don't need these
- if name.endswith(".rotary_emb.inv_freq"):
- continue
-
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.squeeze().numpy()
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
- self.gguf_writer.add_tensor(new_name, data)
-
-
-@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM")
-class BaichuanModel(Model):
- model_arch = gguf.MODEL_ARCH.BAICHUAN
-
- def set_vocab(self):
- self._set_vocab_sentencepiece()
-
- def set_gguf_parameters(self):
- block_count = self.hparams["num_hidden_layers"]
- head_count = self.hparams["num_attention_heads"]
- head_count_kv = self.hparams.get("num_key_value_heads", head_count)
- hf_repo = self.hparams.get("_name_or_path", "")
-
- ctx_length = 0
- if "max_sequence_length" in self.hparams:
- ctx_length = self.hparams["max_sequence_length"]
- elif "max_position_embeddings" in self.hparams:
- ctx_length = self.hparams["max_position_embeddings"]
- elif "model_max_length" in self.hparams:
- ctx_length = self.hparams["model_max_length"]
- else:
- print("gguf: can not find ctx length parameter.")
- sys.exit()
-
- self.gguf_writer.add_name(self.dir_model.name)
- self.gguf_writer.add_source_hf_repo(hf_repo)
- self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
- self.gguf_writer.add_context_length(ctx_length)
- self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
- self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
- self.gguf_writer.add_head_count(head_count)
- self.gguf_writer.add_head_count_kv(head_count_kv)
- self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-
- if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
- if self.hparams["rope_scaling"].get("type") == "linear":
- self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
- self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
-
- def write_tensors(self):
- # Collect tensors from generator object
- model_kv = dict(self.get_tensors())
- block_count = self.hparams["num_hidden_layers"]
- head_count = self.hparams["num_attention_heads"]
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
- head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-
- for i in range(block_count):
- if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None:
- print(f"Unpacking and permuting layer {i}")
- model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \
- self._reverse_hf_permute_part(w, 0, head_count, head_count)
- model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \
- self._reverse_hf_permute_part(w, 1, head_count, head_count_kv)
- model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \
- self._reverse_hf_part(w, 2)
- del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"]
-
- for name, data_torch in model_kv.items():
- # we don't need these
- if name.endswith(".rotary_emb.inv_freq"):
- continue
-
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.squeeze().numpy()
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
- self.gguf_writer.add_tensor(new_name, data)
-
- def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
- if n_kv_head is not None and n_head != n_kv_head:
- n_head //= n_kv_head
-
- return (
- weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
- .swapaxes(1, 2)
- .reshape(weights.shape)
- )
-
- def _reverse_hf_permute_part(
- self, weights: Tensor, n_part: int, n_head: int, n_head_kv: int | None = None,
- ) -> Tensor:
- r = weights.shape[0] // 3
- return self._reverse_hf_permute(weights[r * n_part:r * n_part + r, ...], n_head, n_head_kv)
-
- def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor:
- r = weights.shape[0] // 3
- return weights[r * n_part:r * n_part + r, ...]
-
-
-@Model.register("XverseForCausalLM")
-class XverseModel(Model):
- model_arch = gguf.MODEL_ARCH.XVERSE
-
- def set_vocab(self):
- assert (self.dir_model / "tokenizer.json").is_file()
- dir_model = self.dir_model
- hparams = self.hparams
-
- tokens: list[bytearray] = []
- toktypes: list[int] = []
-
- from transformers import AutoTokenizer
- tokenizer = AutoTokenizer.from_pretrained(dir_model)
- vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
- assert max(tokenizer.vocab.values()) < vocab_size
-
- reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
- added_vocab = tokenizer.get_added_vocab()
-
- for token_id in range(vocab_size):
- token_text = reverse_vocab[token_id].encode('utf-8')
- # replace "\x00" to string with length > 0
- if token_text == b"\x00":
- toktype = gguf.TokenType.BYTE # special
- token_text = f"<{token_text}>".encode('utf-8')
- elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
- toktype = gguf.TokenType.BYTE # special
- elif reverse_vocab[token_id] in added_vocab:
- if tokenizer.added_tokens_decoder[token_id].special:
- toktype = gguf.TokenType.CONTROL
- else:
- toktype = gguf.TokenType.USER_DEFINED
- else:
- toktype = gguf.TokenType.NORMAL
-
- tokens.append(token_text)
- toktypes.append(toktype)
-
- self.gguf_writer.add_tokenizer_model("llama")
- self.gguf_writer.add_token_list(tokens)
- self.gguf_writer.add_token_types(toktypes)
-
- special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens))
- special_vocab.add_to_gguf(self.gguf_writer)
-
- def set_gguf_parameters(self):
- block_count = self.hparams["num_hidden_layers"]
- head_count = self.hparams["num_attention_heads"]
- head_count_kv = self.hparams.get("num_key_value_heads", head_count)
- hf_repo = self.hparams.get("_name_or_path", "")
-
- ctx_length = 0
- if "max_sequence_length" in self.hparams:
- ctx_length = self.hparams["max_sequence_length"]
- elif "max_position_embeddings" in self.hparams:
- ctx_length = self.hparams["max_position_embeddings"]
- elif "model_max_length" in self.hparams:
- ctx_length = self.hparams["model_max_length"]
- else:
- print("gguf: can not find ctx length parameter.")
- sys.exit()
-
- self.gguf_writer.add_name(self.dir_model.name)
- self.gguf_writer.add_source_hf_repo(hf_repo)
- self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
- self.gguf_writer.add_context_length(ctx_length)
- self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
- self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
- self.gguf_writer.add_head_count(head_count)
- self.gguf_writer.add_head_count_kv(head_count_kv)
- self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
-
- if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
- if self.hparams["rope_scaling"].get("type") == "linear":
- self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
- self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
-
- def write_tensors(self):
- # Collect tensors from generator object
- model_kv = dict(self.get_tensors())
- block_count = self.hparams["num_hidden_layers"]
- head_count = self.hparams["num_attention_heads"]
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
- head_count_kv = self.hparams.get("num_key_value_heads", head_count)
-
- for name, data_torch in model_kv.items():
- # we don't need these
- if name.endswith(".rotary_emb.inv_freq"):
- continue
-
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- # HF models permute some of the tensors, so we need to undo that
- if name.endswith(("q_proj.weight")):
- data_torch = self._reverse_hf_permute(data_torch, head_count, head_count)
- if name.endswith(("k_proj.weight")):
- data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv)
-
- data = data_torch.squeeze().numpy()
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
- self.gguf_writer.add_tensor(new_name, data)
-
- def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
- if n_kv_head is not None and n_head != n_kv_head:
- n_head //= n_kv_head
-
- return (
- weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
- .swapaxes(1, 2)
- .reshape(weights.shape)
- )
-
-
-@Model.register("FalconForCausalLM", "RWForCausalLM")
-class FalconModel(Model):
- model_arch = gguf.MODEL_ARCH.FALCON
-
- def set_gguf_parameters(self):
- block_count = self.hparams.get("num_hidden_layers")
- if block_count is None:
- block_count = self.hparams["n_layer"] # old name
-
- n_head = self.hparams.get("num_attention_heads")
- if n_head is None:
- n_head = self.hparams["n_head"] # old name
-
- n_head_kv = self.hparams.get("num_kv_heads")
- if n_head_kv is None:
- n_head_kv = self.hparams.get("n_head_kv", 1) # old name
-
- self.gguf_writer.add_name("Falcon")
- self.gguf_writer.add_context_length(2048) # not in config.json
- self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
- self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
- self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_head_count(n_head)
- self.gguf_writer.add_head_count_kv(n_head_kv)
- self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
- self.gguf_writer.add_file_type(self.ftype)
-
- def write_tensors(self):
- block_count = self.hparams.get("num_hidden_layers")
- if block_count is None:
- block_count = self.hparams["n_layer"] # old name
-
- n_head = self.hparams.get("num_attention_heads")
- if n_head is None:
- n_head = self.hparams["n_head"] # old name
-
- n_head_kv = self.hparams.get("num_kv_heads")
- if n_head_kv is None:
- n_head_kv = self.hparams.get("n_head_kv", 1) # old name
-
- head_dim = self.hparams["hidden_size"] // n_head
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
- for name, data_torch in self.get_tensors():
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- # QKV tensor transform
- # The original query_key_value tensor contains n_head_kv "kv groups",
- # each consisting of n_head/n_head_kv query weights followed by one key
- # and one value weight (shared by all query heads in the kv group).
- # This layout makes it a big pain to work with in GGML.
- # So we rearrange them here,, so that we have n_head query weights
- # followed by n_head_kv key weights followed by n_head_kv value weights,
- # in contiguous fashion.
- # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
-
- if "query_key_value" in name:
- qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
- q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
- k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
- v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
- data_torch = torch.cat((q, k, v)).reshape_as(data_torch)
-
- data = data_torch.squeeze().numpy()
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
-
-
-@Model.register("GPTBigCodeForCausalLM")
-class StarCoderModel(Model):
- model_arch = gguf.MODEL_ARCH.STARCODER
-
- def set_gguf_parameters(self):
- block_count = self.hparams["n_layer"]
-
- self.gguf_writer.add_name("StarCoder")
- self.gguf_writer.add_context_length(self.hparams["n_positions"])
- self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
- self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_head_count(self.hparams["n_head"])
- self.gguf_writer.add_head_count_kv(1)
- self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
- self.gguf_writer.add_file_type(self.ftype)
-
-
-@Model.register("GPTRefactForCausalLM")
-class RefactModel(Model):
- model_arch = gguf.MODEL_ARCH.REFACT
-
- def set_gguf_parameters(self):
- hidden_dim = self.hparams["n_embd"]
- inner_dim = 4 * hidden_dim
- hidden_dim = int(2 * inner_dim / 3)
- multiple_of = 256
- ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
-
- block_count = self.hparams["n_layer"]
-
- self.gguf_writer.add_name("Refact")
- # refact uses Alibi. So this is from config.json which might be used by training.
- self.gguf_writer.add_context_length(self.hparams["n_positions"])
- self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
-
- self.gguf_writer.add_feed_forward_length(ff_dim)
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_head_count(self.hparams["n_head"])
- self.gguf_writer.add_head_count_kv(1)
- self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
- self.gguf_writer.add_file_type(self.ftype)
-
- def write_tensors(self):
- hidden_dim = self.hparams["n_embd"]
- inner_dim = 4 * hidden_dim
- hidden_dim = int(2 * inner_dim / 3)
- multiple_of = 256
- ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
- n_head = self.hparams["n_head"]
- n_head_kv = 1
- head_dim = self.hparams["n_embd"] // n_head
- block_count = self.hparams["n_layer"]
-
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
- tensors = dict(self.get_tensors())
- for i in range(block_count):
- if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None:
- tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim]
- tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:]
- del tensors[f"transformer.h.{i}.attn.kv.weight"]
- if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None:
- tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w
- del tensors[f"transformer.h.{i}.attn.q.weight"]
- if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None:
- tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim]
- tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:]
- del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
-
- for name, data_torch in tensors.items():
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.squeeze().numpy()
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
-
-
-@Model.register("PersimmonForCausalLM")
-class PersimmonModel(Model):
- model_arch = gguf.MODEL_ARCH.PERSIMMON
-
- def set_gguf_parameters(self):
- block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
- head_count = self.hparams["num_attention_heads"]
- head_count_kv = head_count
- hidden_size = self.hparams["hidden_size"]
-
- self.gguf_writer.add_name('persimmon-8b-chat')
- self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
- self.gguf_writer.add_embedding_length(hidden_size)
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-
- # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
- # than the head size?
- # ref: https://github.com/ggerganov/llama.cpp/pull/4889
- # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
- self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
-
- self.gguf_writer.add_head_count(head_count)
- self.gguf_writer.add_head_count_kv(head_count_kv)
- self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
- self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
-
- def set_vocab(self):
- self._set_vocab_sentencepiece()
- # self.gguf_writer.add_bos_token_id(71013)
- # self.gguf_writer.add_eos_token_id(71013)
-
- def write_tensors(self):
- block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
- for name, data_torch in self.get_tensors():
- if name.endswith(".self_attention.rotary_emb.inv_freq"):
- continue
- old_dtype = data_torch.dtype
- # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
- data = data_torch.to(torch.float32).squeeze().numpy()
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
- n_dims = len(data.shape)
- print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
- self.gguf_writer.add_tensor(new_name, data)
-
-
-@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
-class StableLMModel(Model):
- model_arch = gguf.MODEL_ARCH.STABLELM
-
- def set_vocab(self):
- if (self.dir_model / "tokenizer.json").is_file():
- self._set_vocab_gpt2()
- else:
- # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
- self._set_vocab_qwen()
-
- def set_gguf_parameters(self):
- hparams = self.hparams
- block_count = hparams["num_hidden_layers"]
-
- self.gguf_writer.add_name(self.dir_model.name)
- self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
- self.gguf_writer.add_embedding_length(hparams["hidden_size"])
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
- rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"])
- self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"])))
- self.gguf_writer.add_head_count(hparams["num_attention_heads"])
- self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"])
- self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
- self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"]))
-
- def write_tensors(self):
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
- n_head = self.hparams.get("num_attention_heads")
- n_kv_head = self.hparams.get("num_key_value_heads")
- q_norms = dict()
- k_norms = dict()
- for name, data_torch in self.get_tensors():
- # we don't need these
- if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
- continue
-
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.squeeze().numpy()
- n_dims = len(data.shape)
- if name.find("q_layernorm.norms") != -1:
- q_norms[name] = data
- if len(q_norms) >= (block_count * n_head):
- self._stack_qk_norm(block_count, name, tensor_map, n_head, q_norms, n_dims, layer_name="q_layernorm")
- continue
- if name.find("k_layernorm.norms") != -1:
- k_norms[name] = data
- if len(k_norms) >= (block_count * n_kv_head):
- self._stack_qk_norm(block_count, name, tensor_map, n_kv_head, k_norms, n_dims, layer_name="k_layernorm")
- continue
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
-
- def _stack_qk_norm(self, block_count, name, tensor_map, n_head, norms, n_dims, layer_name="q_layernorm"):
- for bid in range(block_count):
- datas = []
- for xid in range(n_head):
- ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
- datas.append(norms[ename])
- del norms[ename]
- data = np.stack(datas, axis=0)
- data_dtype = data.dtype
- merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
- new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
- if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and not new_name.endswith("_norm.weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
-
-
-@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
-class LlamaModel(Model):
- model_arch = gguf.MODEL_ARCH.LLAMA
-
- def set_vocab(self):
- try:
- self. _set_vocab_sentencepiece()
- except FileNotFoundError:
- try:
- self._set_vocab_llama_hf()
- except (FileNotFoundError, TypeError):
- # Llama 3
- self._set_vocab_gpt2()
-
- # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
- if self.hparams.get("vocab_size", 32000) == 32016:
- special_vocab = gguf.SpecialVocab(
- self.dir_model, load_merges=False,
- special_token_types = ['prefix', 'suffix', 'middle', 'eot']
- )
- special_vocab._set_special_token("prefix", 32007)
- special_vocab._set_special_token("suffix", 32008)
- special_vocab._set_special_token("middle", 32009)
- special_vocab._set_special_token("eot", 32010)
- special_vocab.add_to_gguf(self.gguf_writer)
-
- def set_gguf_parameters(self):
- super().set_gguf_parameters()
- hparams = self.hparams
- self.gguf_writer.add_vocab_size(hparams["vocab_size"])
- self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
-
- # Same as super class, but permuting q_proj, k_proj
- def write_tensors(self):
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
- n_head = self.hparams.get("num_attention_heads")
- n_kv_head = self.hparams.get("num_key_value_heads")
- n_experts = self.hparams.get("num_local_experts")
- experts = dict()
- for name, data_torch in self.get_tensors():
- # we don't need these
- if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
- continue
-
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.numpy()
-
- if name.endswith("q_proj.weight"):
- data = permute(data, n_head, n_head)
- if name.endswith("k_proj.weight"):
- data = permute(data, n_head, n_kv_head)
-
- data = data.squeeze()
-
- # process the experts separately
- if name.find("block_sparse_moe.experts") != -1:
- experts[name] = data
- if len(experts) >= n_experts:
- # merge the experts into a single 3d tensor
- for bid in range(block_count):
- for wid in range(1, 4):
- full = True
- for xid in range(n_experts):
- ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
- if ename not in experts:
- full = False
- break
- if not full:
- continue
-
- datas = []
- for xid in range(n_experts):
- ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
- datas.append(experts[ename])
- del experts[ename]
-
- data = np.stack(datas, axis=0)
- data_dtype = data.dtype
-
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- if self.ftype == 1 and data_dtype == np.float32:
- data = data.astype(np.float16)
-
- merged_name = f"layers.{bid}.feed_forward.experts.w{wid}.weight"
-
- new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
- continue
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # 1d tensors need to be converted to float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
-
- if len(experts) > 0:
- raise ValueError(f"Unprocessed experts: {experts.keys()}")
-
-
-@Model.register("GrokForCausalLM")
-class GrokModel(Model):
- model_arch = gguf.MODEL_ARCH.GROK
-
- def set_vocab(self):
- self._set_vocab_sentencepiece()
-
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
-
- def set_gguf_parameters(self):
- super().set_gguf_parameters()
- self.gguf_writer.add_name("Grok")
-
- def write_tensors(self):
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
- n_experts = self.hparams.get("num_local_experts")
- experts = dict()
- for name, data_torch in self.get_tensors():
- # we don't need these
- if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
- continue
-
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.squeeze().numpy()
-
- # process the experts separately
- if name.find(".moe.") != -1:
- experts[name] = data
- if len(experts) >= n_experts:
- # merge the experts into a single 3d tensor
- for bid in range(block_count):
- for wid in ["linear", "linear_1", "linear_v"]:
- full = True
- for xid in range(n_experts):
- ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
- if ename not in experts:
- full = False
- break
- if not full:
- continue
-
- datas = []
- for xid in range(n_experts):
- ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
- datas.append(experts[ename])
- del experts[ename]
-
- data = np.stack(datas, axis=0)
- data_dtype = data.dtype
-
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- if self.ftype == 1 and data_dtype == np.float32:
- data = data.astype(np.float16)
-
- merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"
-
- new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
- continue
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
-
-
-@Model.register("DbrxForCausalLM")
-class DbrxModel(Model):
- model_arch = gguf.MODEL_ARCH.DBRX
-
- def set_gguf_parameters(self):
- ffn_config = self.hparams["ffn_config"]
- attn_config = self.hparams["attn_config"]
- self.gguf_writer.add_name(self.hparams["model_type"])
- self.gguf_writer.add_block_count(self.hparams["n_layers"])
-
- self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
- self.gguf_writer.add_embedding_length(self.hparams["d_model"])
- self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"])
-
- self.gguf_writer.add_head_count(self.hparams["n_heads"])
- self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
-
- self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
-
- self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"])
- self.gguf_writer.add_file_type(self.ftype)
-
- self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
- self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
-
- self.gguf_writer.add_layer_norm_eps(1e-5)
-
- self.gguf_writer.add_file_type(self.ftype)
- print(f"gguf: file type = {self.ftype}")
-
- def write_tensors(self):
- block_count = self.hparams.get("n_layers")
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
- for name, data_torch in self.get_tensors():
- n_expert = self.hparams["ffn_config"]["moe_num_experts"]
- n_ff = self.hparams["ffn_config"]["ffn_hidden_size"]
- n_embd = self.hparams["d_model"]
-
- # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose
- # original implementation expects (n_expert, n_ff, n_embd) for all experts weights
- # But llama.cpp moe graph works differently
- # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
- # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
- exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
- "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert}
- "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
- experts = False
- for exp_tensor_name in exp_tensor_names.keys():
- if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
- experts = True
- data_torch = data_torch.view(n_expert, n_ff, n_embd)
- if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None:
- data_torch = data_torch.permute(*permute_tensor)
- break
-
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.squeeze().numpy()
-
- # map tensor names
- # In MoE models the ffn tensors are typically most of the model weights,
- # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight.
- # Every other model has the weight names ending in .weight,
- # let's assume that is the convention which is not the case for dbrx:
- # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15
- new_name = tensor_map.get_name(name if not experts else name + ".weight", try_suffixes=(".weight",))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # Most of the codebase that takes in 1D tensors only handles F32 tensors
- # and most of the outputs tensors are F32.
- if data_dtype != np.float32 and n_dims == 1:
- print(f"Can not map tensor {name!r}: all 1D tensors must be F32")
- sys.exit()
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and n_dims > 1:
- data = data.astype(np.float16)
-
- print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
-
-
-@Model.register("MiniCPMForCausalLM")
-class MiniCPMModel(Model):
- model_arch = gguf.MODEL_ARCH.MINICPM
-
- def set_gguf_parameters(self):
- block_count = self.hparams["num_hidden_layers"]
- self.gguf_writer.add_name("MiniCPM")
- self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
- self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
- self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
- self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
- self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
- self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
- self.gguf_writer.add_file_type(self.ftype)
-
- def set_vocab(self):
- self._set_vocab_llama_hf()
-
- def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
- if n_kv_head is not None and n_head != n_kv_head:
- n_head //= n_kv_head
-
- return (
- weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
- .swapaxes(1, 2)
- .reshape(weights.shape)
- )
-
- def write_tensors(self):
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
- n_head = self.hparams.get("num_attention_heads")
- n_kv_head = self.hparams.get("num_key_value_heads")
- for name, data_torch in self.get_tensors():
- # we don't need these
- if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
- continue
-
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- # HF models permute some of the tensors, so we need to undo that
- if name.endswith(("q_proj.weight")):
- data_torch = self._reverse_hf_permute(data_torch, n_head, n_head)
- if name.endswith(("k_proj.weight")):
- data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head)
-
- data = data_torch.squeeze().numpy()
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
-
-
-@Model.register("QWenLMHeadModel")
-class QwenModel(Model):
- model_arch = gguf.MODEL_ARCH.QWEN
-
- @staticmethod
- def token_bytes_to_string(b):
- from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
- byte_encoder = bytes_to_unicode()
- return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
-
- @staticmethod
- def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
- parts = [bytes([b]) for b in token]
- while True:
- min_idx = None
- min_rank = None
- for i, pair in enumerate(zip(parts[:-1], parts[1:])):
- rank = mergeable_ranks.get(pair[0] + pair[1])
- if rank is not None and (min_rank is None or rank < min_rank):
- min_idx = i
- min_rank = rank
- if min_rank is None or (max_rank is not None and min_rank >= max_rank):
- break
- assert min_idx is not None
- parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
- return parts
-
- def set_vocab(self):
- self._set_vocab_qwen()
-
- def set_gguf_parameters(self):
- self.gguf_writer.add_name("Qwen")
- self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
- self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
- self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
- self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
- self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
- self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
- self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
- self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
-
- def write_tensors(self):
- block_count = self.hparams["num_hidden_layers"]
- model_kv = dict(self.get_tensors())
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
- for name, data_torch in model_kv.items():
- # we don't need these
- if name.endswith(".rotary_emb.inv_freq"):
- continue
-
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.squeeze().numpy()
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
- self.gguf_writer.add_tensor(new_name, data)
-
-
-@Model.register("Qwen2ForCausalLM")
-class Qwen2Model(Model):
- model_arch = gguf.MODEL_ARCH.QWEN2
-
- def set_vocab(self):
- try:
- self._set_vocab_sentencepiece()
- except FileNotFoundError:
- self._set_vocab_gpt2()
-
-
-@Model.register("Qwen2MoeForCausalLM")
-class Qwen2MoeModel(Model):
- model_arch = gguf.MODEL_ARCH.QWEN2MOE
-
- def set_gguf_parameters(self):
- super().set_gguf_parameters()
- if (n_experts := self.hparams.get("num_experts")) is not None:
- self.gguf_writer.add_expert_count(n_experts)
-
- def write_tensors(self):
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
- n_experts = self.hparams.get("num_experts")
- experts = dict()
- for name, data_torch in self.get_tensors():
- # we don't need these
- if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
- continue
-
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.squeeze().numpy()
-
- # process the experts separately
- if name.find("experts") != -1:
- experts[name] = data
- if len(experts) >= n_experts * 3:
- # merge the experts into a single 3d tensor
- for bid in range(block_count):
- for w_name in ["down_proj", "gate_proj", "up_proj"]:
- full = True
- for xid in range(n_experts):
- ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
- if ename not in experts:
- full = False
- break
- if not full:
- continue
-
- datas = []
- for xid in range(n_experts):
- ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
- datas.append(experts[ename])
- del experts[ename]
-
- data = np.stack(datas, axis=0)
- data_dtype = data.dtype
-
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- if self.ftype == 1 and data_dtype == np.float32:
- data = data.astype(np.float16)
-
- merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
-
- new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
- continue
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and (n_dims == 1 or new_name.endswith("_norm.weight")):
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{new_name}, n_dims = {n_dims}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
-
- if len(experts) > 0:
- raise ValueError(f"Unprocessed experts: {experts.keys()}")
-
-
-@Model.register("GPT2LMHeadModel")
-class GPT2Model(Model):
- model_arch = gguf.MODEL_ARCH.GPT2
-
- def set_gguf_parameters(self):
- self.gguf_writer.add_name(self.dir_model.name)
- self.gguf_writer.add_block_count(self.hparams["n_layer"])
- self.gguf_writer.add_context_length(self.hparams["n_ctx"])
- self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
- self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
- self.gguf_writer.add_head_count(self.hparams["n_head"])
- self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
- self.gguf_writer.add_file_type(self.ftype)
-
- def write_tensors(self):
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
- for name, data_torch in self.get_tensors():
- # we don't need these
- if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")):
- continue
-
- if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")):
- data_torch = data_torch.transpose(1, 0)
-
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.squeeze().numpy()
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
-
- # note: GPT2 output is tied to (same as) wte in original model
- if new_name == "token_embd.weight":
- print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
- self.gguf_writer.add_tensor("output.weight", data)
-
-
-@Model.register("PhiForCausalLM")
-class Phi2Model(Model):
- model_arch = gguf.MODEL_ARCH.PHI2
-
- def set_gguf_parameters(self):
- block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
-
- rot_pct = self.find_hparam(["partial_rotary_factor"])
- n_embd = self.find_hparam(["hidden_size", "n_embd"])
- n_head = self.find_hparam(["num_attention_heads", "n_head"])
-
- self.gguf_writer.add_name("Phi2")
- self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
-
- self.gguf_writer.add_embedding_length(n_embd)
- self.gguf_writer.add_feed_forward_length(4 * n_embd)
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_head_count(n_head)
- self.gguf_writer.add_head_count_kv(n_head)
- self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_epsilon", "layer_norm_eps"]))
- self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
- self.gguf_writer.add_file_type(self.ftype)
- self.gguf_writer.add_add_bos_token(False)
-
-
-@Model.register("Phi3ForCausalLM")
-class Phi3MiniModel(Model):
- model_arch = gguf.MODEL_ARCH.PHI3
-
- def set_vocab(self):
- from sentencepiece import SentencePieceProcessor
-
- tokenizer_path = self.dir_model / 'tokenizer.model'
-
- if not tokenizer_path.is_file():
- print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
- sys.exit(1)
-
- tokenizer = SentencePieceProcessor(str(tokenizer_path))
-
- vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
-
- tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
- scores: list[float] = [-10000.0] * vocab_size
- toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
-
- for token_id in range(tokenizer.vocab_size()):
-
- piece = tokenizer.id_to_piece(token_id)
- text = piece.encode("utf-8")
- score = tokenizer.get_score(token_id)
-
- toktype = SentencePieceTokenTypes.NORMAL
- if tokenizer.is_unknown(token_id):
- toktype = SentencePieceTokenTypes.UNKNOWN
- elif tokenizer.is_control(token_id):
- toktype = SentencePieceTokenTypes.CONTROL
- elif tokenizer.is_unused(token_id):
- toktype = SentencePieceTokenTypes.UNUSED
- elif tokenizer.is_byte(token_id):
- toktype = SentencePieceTokenTypes.BYTE
-
- tokens[token_id] = text
- scores[token_id] = score
- toktypes[token_id] = toktype
-
- added_tokens_file = self.dir_model / 'added_tokens.json'
- if added_tokens_file.is_file():
- with open(added_tokens_file, "r", encoding="utf-8") as f:
- added_tokens_json = json.load(f)
-
- for key in added_tokens_json:
- token_id = added_tokens_json[key]
- if (token_id >= vocab_size):
- print(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
- continue
-
- tokens[token_id] = key.encode("utf-8")
- scores[token_id] = -1000.0
- toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
-
- self.gguf_writer.add_tokenizer_model("llama")
- self.gguf_writer.add_token_list(tokens)
- self.gguf_writer.add_token_scores(scores)
- self.gguf_writer.add_token_types(toktypes)
-
- special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
- special_vocab.add_to_gguf(self.gguf_writer)
-
- def set_gguf_parameters(self):
- block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
-
- rot_pct = 1.0
- n_embd = self.find_hparam(["hidden_size", "n_embd"])
- n_head = self.find_hparam(["num_attention_heads", "n_head"])
- rms_eps = self.find_hparam(["rms_norm_eps"])
-
- self.gguf_writer.add_name("Phi3")
- self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
-
- self.gguf_writer.add_embedding_length(n_embd)
- self.gguf_writer.add_feed_forward_length(8192)
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_head_count(n_head)
- self.gguf_writer.add_head_count_kv(n_head)
- self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
- self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
- self.gguf_writer.add_file_type(self.ftype)
-
-
-@Model.register("PlamoForCausalLM")
-class PlamoModel(Model):
- model_arch = gguf.MODEL_ARCH.PLAMO
-
- def set_vocab(self):
- self._set_vocab_sentencepiece()
-
- def set_gguf_parameters(self):
- hparams = self.hparams
- block_count = hparams["num_hidden_layers"]
-
- self.gguf_writer.add_name("PLaMo")
- self.gguf_writer.add_context_length(4096) # not in config.json
- self.gguf_writer.add_embedding_length(hparams["hidden_size"])
- self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_head_count(hparams["num_attention_heads"])
- self.gguf_writer.add_head_count_kv(5) # hparams["num_key_value_heads"]) is wrong
- self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
-
- def shuffle_attn_q_weight(self, data_torch):
- assert data_torch.size() == (5120, 5120)
- data_torch = data_torch.reshape(8, 5, 128, 5120)
- data_torch = torch.permute(data_torch, (1, 0, 2, 3))
- data_torch = torch.reshape(data_torch, (5120, 5120))
- return data_torch
-
- def shuffle_attn_output_weight(self, data_torch):
- assert data_torch.size() == (5120, 5120)
- data_torch = data_torch.reshape(5120, 8, 5, 128)
- data_torch = torch.permute(data_torch, (0, 2, 1, 3))
- data_torch = torch.reshape(data_torch, (5120, 5120))
- return data_torch
-
- def write_tensors(self):
- block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
- for name, data_torch in self.get_tensors():
- if "self_attn.rotary_emb.inv_freq" in name:
- continue
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- # shuffle for broadcasting of gqa in ggml_mul_mat
- if new_name.endswith("attn_q.weight"):
- data_torch = self.shuffle_attn_q_weight(data_torch)
- elif new_name.endswith("attn_output.weight"):
- data_torch = self.shuffle_attn_output_weight(data_torch)
-
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.squeeze().numpy()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
-
-
-@Model.register("CodeShellForCausalLM")
-class CodeShellModel(Model):
- model_arch = gguf.MODEL_ARCH.CODESHELL
-
- def set_gguf_parameters(self):
- block_count = self.hparams["n_layer"]
-
- self.gguf_writer.add_name("CodeShell")
- self.gguf_writer.add_context_length(self.hparams["n_positions"])
- self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
- self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_head_count(self.hparams["n_head"])
- self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
- self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
- self.gguf_writer.add_file_type(self.ftype)
- self.gguf_writer.add_rope_freq_base(10000.0)
- self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
- self.gguf_writer.add_rope_scaling_factor(1.0)
-
- def write_tensors(self):
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
- tensors = dict(self.get_tensors())
- has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys()
- for name, data_torch in tensors.items():
- # we don't need these
- if name.endswith((".attn.rotary_emb.inv_freq")):
- continue
-
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.squeeze().numpy()
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
-
- if not has_lm_head and name == "transformer.wte.weight":
- self.gguf_writer.add_tensor("output.weight", data)
- print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
-
-
-@Model.register("InternLM2ForCausalLM")
-class InternLM2Model(Model):
- model_arch = gguf.MODEL_ARCH.INTERNLM2
-
- def set_vocab(self):
- # (TODO): Is there a better way?
- # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character
- # \x00 specially and convert it into an emoji character to prevent it from being mistakenly
- # recognized as an empty string in C++.
- from sentencepiece import SentencePieceProcessor
- from sentencepiece import sentencepiece_model_pb2 as model
-
- tokenizer_path = self.dir_model / 'tokenizer.model'
-
- tokens: list[bytes] = []
- scores: list[float] = []
- toktypes: list[int] = []
-
- if not tokenizer_path.is_file():
- print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
- sys.exit(1)
-
- sentencepiece_model = model.ModelProto()
- sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
- add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
-
- tokenizer = SentencePieceProcessor(str(tokenizer_path))
- vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
-
- for token_id in range(vocab_size):
- piece = tokenizer.id_to_piece(token_id)
- text = piece.encode("utf-8")
- score = tokenizer.get_score(token_id)
- if text == b"\x00":
- # (TODO): fixme
- # Hack here and replace the \x00 characters.
- print(f"InternLM2 convert token '{text}' to '🐉'!")
- text = "🐉"
-
- toktype = SentencePieceTokenTypes.NORMAL
- if tokenizer.is_unknown(token_id):
- toktype = SentencePieceTokenTypes.UNKNOWN
- elif tokenizer.is_control(token_id):
- toktype = SentencePieceTokenTypes.CONTROL
- elif tokenizer.is_unused(token_id):
- toktype = SentencePieceTokenTypes.UNUSED
- elif tokenizer.is_byte(token_id):
- toktype = SentencePieceTokenTypes.BYTE
-
- tokens.append(text)
- scores.append(score)
- toktypes.append(toktype)
-
- added_tokens_file = self.dir_model / 'added_tokens.json'
- if added_tokens_file.is_file():
- with open(added_tokens_file, "r", encoding="utf-8") as f:
- added_tokens_json = json.load(f)
-
- for key in added_tokens_json:
- tokens.append(key.encode("utf-8"))
- scores.append(-1000.0)
- toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
-
- self.gguf_writer.add_tokenizer_model("llama")
- self.gguf_writer.add_token_list(tokens)
- self.gguf_writer.add_token_scores(scores)
- self.gguf_writer.add_token_types(toktypes)
- self.gguf_writer.add_add_space_prefix(add_prefix)
-
- special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
- old_eos = special_vocab.special_token_ids["eos"]
- if "chat" in os.path.basename(self.dir_model.absolute()):
- # For the chat model, we replace the eos with '<|im_end|>'.
- # TODO: this is a hack, should be fixed
- # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
- special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
- print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
-in chat mode so that the conversation can end normally.")
-
- special_vocab.add_to_gguf(self.gguf_writer)
-
- def _try_get_sft_eos(self, tokenizer):
- unused_145_list = tokenizer.encode('[UNUSED_TOKEN_145]')
- im_end_list = tokenizer.encode('<|im_end|>')
- assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1)
- if len(unused_145_list) == 1:
- eos_token = unused_145_list[0]
- if len(im_end_list) == 1:
- eos_token = im_end_list[0]
- return eos_token
-
- def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int):
- if n_head_kv is not None and n_head != n_head_kv:
- n_head = n_head_kv
- return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
- .swapaxes(1, 2)
- .reshape(weights.shape))
-
- def set_gguf_parameters(self):
- self.gguf_writer.add_name("InternLM2")
- self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
- self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
- self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
- self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
- self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
- self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
- self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
- self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
-
- def post_write_tensors(self, tensor_map, name, data_torch):
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.squeeze().numpy()
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
- self.gguf_writer.add_tensor(new_name, data)
-
- def write_tensors(self):
- from einops import rearrange
-
- num_heads = self.hparams.get("num_attention_heads")
- num_kv_heads = self.hparams.get("num_key_value_heads")
- hidden_size = self.hparams.get("hidden_size")
- q_per_kv = num_heads // num_kv_heads
- head_dim = hidden_size // num_heads
- num_groups = num_heads // q_per_kv
-
- block_count = self.hparams["num_hidden_layers"]
- model_kv = dict(self.get_tensors())
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
- qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv"
- for name, data_torch in model_kv.items():
- # we don't need these
- if name.endswith(".rotary_emb.inv_freq"):
- continue
-
- if re.match(qkv_pattern, name):
- bid = re.findall(qkv_pattern, name)[0]
- qkv = data_torch
- qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim)
- q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :]
- # The model weights of q and k equire additional reshape.
- q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads)
- k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads)
- v = rearrange(v, " o g n i -> o (g n i)").T
- self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q)
- self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k)
- self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wv.weight", v)
- else:
- self.post_write_tensors(tensor_map, name, data_torch)
-
-
-@Model.register("BertModel", "CamembertModel")
-class BertModel(Model):
- model_arch = gguf.MODEL_ARCH.BERT
-
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
- self.vocab_size = None
-
- def set_gguf_parameters(self):
- super().set_gguf_parameters()
- self.gguf_writer.add_causal_attention(False)
-
- # get pooling path
- pooling_path = None
- module_path = self.dir_model / "modules.json"
- if module_path.is_file():
- with open(module_path, encoding="utf-8") as f:
- modules = json.load(f)
- for mod in modules:
- if mod["type"] == "sentence_transformers.models.Pooling":
- pooling_path = mod["path"]
- break
-
- # get pooling type
- if pooling_path is not None:
- with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
- pooling = json.load(f)
- if pooling["pooling_mode_mean_tokens"]:
- pooling_type = gguf.PoolingType.MEAN
- elif pooling["pooling_mode_cls_token"]:
- pooling_type = gguf.PoolingType.CLS
- else:
- raise NotImplementedError("Only MEAN and CLS pooling types supported")
- self.gguf_writer.add_pooling_type(pooling_type)
-
- def set_vocab(self):
- tokens, toktypes = self.get_basic_vocab()
- self.vocab_size = len(tokens)
-
- # we need this to validate the size of the token_type embeddings
- # though currently we are passing all zeros to the token_type embeddings
- self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B"
-
- # convert to phantom space vocab
- def phantom(tok):
- if tok.startswith("[") and tok.endswith("]"):
- return tok
- if tok.startswith("##"):
- return tok[2:]
- return "\u2581" + tok
- tokens = list(map(phantom, tokens))
-
- # add vocab to gguf
- self.gguf_writer.add_tokenizer_model("bert")
- self.gguf_writer.add_token_list(tokens)
- self.gguf_writer.add_token_types(toktypes)
-
- # handle special tokens
- special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
- special_vocab.add_to_gguf(self.gguf_writer)
-
- def write_tensors(self):
- tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
- tensors = dict(self.get_tensors())
- for name, data_torch in tensors.items():
- # we are only using BERT for embeddings so we don't need the pooling layer
- if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"):
- continue # we don't need these
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- data = data_torch.squeeze().numpy()
- n_dims = len(data.shape)
- new_dtype: type[np.floating[Any]]
-
- if (
- self.ftype == 1 and name.endswith(".weight") and n_dims == 2
- and name != "embeddings.token_type_embeddings.weight" # not used with get_rows, must be F32
- ):
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- new_dtype = np.float16
- else:
- # if f32 desired, convert any float16 to float32
- new_dtype = np.float32
-
- print(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}")
-
- if data.dtype != new_dtype:
- data = data.astype(new_dtype)
-
- self.gguf_writer.add_tensor(new_name, data)
-
-
-@Model.register("NomicBertModel")
-class NomicBertModel(BertModel):
- model_arch = gguf.MODEL_ARCH.NOMIC_BERT
-
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
-
- # the HF config claims n_ctx=8192, but it uses RoPE scaling
- self.hparams["n_ctx"] = 2048
-
- # SwigLU activation
- assert self.hparams["activation_function"] == "swiglu"
- # this doesn't do anything in the HF version
- assert self.hparams["causal"] is False
- # no bias tensors
- assert self.hparams["qkv_proj_bias"] is False
- assert self.hparams["mlp_fc1_bias"] is False
- assert self.hparams["mlp_fc2_bias"] is False
- # norm at end of layer
- assert self.hparams["prenorm"] is False
- # standard RoPE
- assert self.hparams["rotary_emb_fraction"] == 1.0
- assert self.hparams["rotary_emb_interleaved"] is False
- assert self.hparams["rotary_emb_scale_base"] is None
-
- def set_gguf_parameters(self):
- super().set_gguf_parameters()
- self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
-
-
-@Model.register("GemmaForCausalLM")
-class GemmaModel(Model):
- model_arch = gguf.MODEL_ARCH.GEMMA
-
- def set_vocab(self):
- self._set_vocab_sentencepiece()
-
- # TODO: these special tokens should be exported only for the CodeGemma family
- special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
- special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
- special_vocab._set_special_token("prefix", 67)
- special_vocab._set_special_token("suffix", 69)
- special_vocab._set_special_token("middle", 68)
- special_vocab._set_special_token("fsep", 70)
- special_vocab._set_special_token("eot", 107)
- special_vocab.add_to_gguf(self.gguf_writer)
-
- def set_gguf_parameters(self):
- hparams = self.hparams
- block_count = hparams["num_hidden_layers"]
-
- self.gguf_writer.add_name(self.dir_model.name)
- self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
- self.gguf_writer.add_embedding_length(hparams["hidden_size"])
- self.gguf_writer.add_block_count(block_count)
- self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
- self.gguf_writer.add_head_count(hparams["num_attention_heads"])
- self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in hparams else hparams["num_attention_heads"])
- self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
- self.gguf_writer.add_key_length(hparams["head_dim"])
- self.gguf_writer.add_value_length(hparams["head_dim"])
- self.gguf_writer.add_file_type(self.ftype)
-
- def write_tensors(self):
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
- for name, data_torch in self.get_tensors():
- # lm_head is not used in llama.cpp, while autoawq will include this tensor in model
- # To prevent errors, skip loading lm_head.weight.
- if name == "lm_head.weight":
- print(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
- continue
-
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89
- if name.endswith("norm.weight"):
- data_torch = data_torch + 1
- data = data_torch.squeeze().numpy()
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
-
-
-@Model.register("Starcoder2ForCausalLM")
-class StarCoder2Model(Model):
- model_arch = gguf.MODEL_ARCH.STARCODER2
-
-
-@Model.register("MambaForCausalLM", "MambaLMHeadModel")
-class MambaModel(Model):
- model_arch = gguf.MODEL_ARCH.MAMBA
-
- def set_vocab(self):
- vocab_size = self.hparams["vocab_size"]
- # Round vocab size to next multiple of 8
- pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8)
- # pad using ceiling division
- # ref: https://stackoverflow.com/a/17511341/22827863
- vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
- self.hparams["vocab_size"] = vocab_size
-
- if (self.dir_model / "tokenizer.json").is_file():
- self._set_vocab_gpt2()
- else:
- # Use the GPT-NeoX tokenizer when no tokenizer files are present
- tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf"
- print(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'")
- neox_reader = gguf.GGUFReader(tokenizer_path, "r")
-
- field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
- self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]))
-
- field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
- self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
-
- field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
- self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
-
- field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
- self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
-
- field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
- self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
-
- field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
- self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
-
- field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
- self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
-
- def set_gguf_parameters(self):
- d_model = self.find_hparam(["hidden_size", "d_model"])
- d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
- d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
- d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16
- # ceiling division
- # ref: https://stackoverflow.com/a/17511341/22827863
- # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58
- dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16)
- rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5
-
- # Fail early for models which don't have a block expansion factor of 2
- assert d_inner == 2 * d_model
-
- self.gguf_writer.add_name(self.dir_model.name)
- self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
- self.gguf_writer.add_embedding_length(d_model)
- self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
- self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
- self.gguf_writer.add_block_count(self.hparams["n_layer"])
- self.gguf_writer.add_ssm_conv_kernel(d_conv)
- self.gguf_writer.add_ssm_inner_size(d_inner)
- self.gguf_writer.add_ssm_state_size(d_state)
- self.gguf_writer.add_ssm_time_step_rank(dt_rank)
- self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
- self.gguf_writer.add_file_type(self.ftype)
-
- def write_tensors(self):
- block_count = self.hparams["n_layer"]
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
- tok_embd = None
- tok_embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TOKEN_EMBD] + ".weight"
- output_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight"
-
- for name, data_torch in self.get_tensors():
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- if name.endswith(".A_log"):
- print("A_log --> A ==> " + new_name)
- data_torch = -torch.exp(data_torch)
-
- # assuming token_embd.weight is seen before output.weight
- if tok_embd is not None and new_name == output_name:
- if torch.equal(tok_embd, data_torch):
- print(f"{output_name} is equivalent to {tok_embd_name}, omitting")
- continue
- if new_name == tok_embd_name:
- tok_embd = data_torch
-
- data = data_torch.squeeze().numpy()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert big float32 2-dim weight tensors to float16
- new_weight_name = new_name[:-len(".weight")] if new_name.endswith(".weight") else ""
- if self.ftype == 1 and data_dtype == np.float32 and new_weight_name.endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
-
-
-@Model.register("CohereForCausalLM")
-class CommandR2Model(Model):
- model_arch = gguf.MODEL_ARCH.COMMAND_R
-
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
-
- # max_position_embeddings = 8192 in config.json but model was actually
- # trained on 128k context length
- self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
-
- def set_gguf_parameters(self):
- super().set_gguf_parameters()
- self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
- self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
-
-
-@Model.register("OlmoForCausalLM")
-@Model.register("OLMoForCausalLM")
-class OlmoModel(Model):
- model_arch = gguf.MODEL_ARCH.OLMO
-
- def set_gguf_parameters(self):
- super().set_gguf_parameters()
- self.gguf_writer.add_layer_norm_eps(1e-5)
- if "clip_qkv" in self.hparams is not None:
- self.gguf_writer.add_clamp_kqv(self.hparams["clip_qkv"])
-
- # Same as super class, but permuting q_proj, k_proj
- # Copied from: LlamaModel
- def write_tensors(self):
- block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
- tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
- n_head = self.hparams.get("num_attention_heads")
- n_kv_head = self.hparams.get("num_key_value_heads")
- for name, data_torch in self.get_tensors():
- old_dtype = data_torch.dtype
-
- # convert any unsupported data types to float32
- if data_torch.dtype not in (torch.float16, torch.float32):
- data_torch = data_torch.to(torch.float32)
-
- data = data_torch.numpy()
-
- if name.endswith("q_proj.weight"):
- data = permute(data, n_head, n_head)
- if name.endswith("k_proj.weight"):
- data = permute(data, n_head, n_kv_head)
-
- data = data.squeeze()
-
- # map tensor names
- new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
- if new_name is None:
- print(f"Can not map tensor {name!r}")
- sys.exit()
-
- n_dims = len(data.shape)
- data_dtype = data.dtype
-
- # if f32 desired, convert any float16 to float32
- if self.ftype == 0 and data_dtype == np.float16:
- data = data.astype(np.float32)
-
- # 1d tensors need to be converted to float32
- if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
- data = data.astype(np.float32)
-
- # if f16 desired, convert any float32 2-dim weight tensors to float16
- if self.ftype == 1 and data_dtype == np.float32 and n_dims == 2:
- data = data.astype(np.float16)
-
- print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
- self.gguf_writer.add_tensor(new_name, data)
-
-
-###### CONVERSION LOGIC ######
-
-
-def parse_args() -> argparse.Namespace:
- parser = argparse.ArgumentParser(
- description="Convert a huggingface model to a GGML compatible file")
- parser.add_argument(
- "--vocab-only", action="store_true",
- help="extract only the vocab",
- )
- parser.add_argument(
- "--awq-path", type=Path, default=None,
- help="Path to scale awq cache file")
- parser.add_argument(
- "--outfile", type=Path,
- help="path to write to; default: based on input",
- )
- parser.add_argument(
- "--outtype", type=str, choices=["f32", "f16"], default="f16",
- help="output format - use f32 for float32, f16 for float16",
- )
- parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
- parser.add_argument(
- "model", type=Path,
- help="directory containing model file",
- )
- parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)")
-
- return parser.parse_args()
-
-
-def main() -> None:
- args = parse_args()
-
- dir_model = args.model
-
- if args.awq_path:
- sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
- from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
- tmp_model_path = args.model / "weighted_model"
- dir_model = tmp_model_path
- if tmp_model_path.is_dir():
- print(f"{tmp_model_path} exists as a weighted model.")
- else:
- tmp_model_path.mkdir(parents=True, exist_ok=True)
- print("Saving new weighted model ...")
- add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
- print(f"Saved weighted model at {tmp_model_path}.")
-
- if not dir_model.is_dir():
- print(f'Error: {args.model} is not a directory', file=sys.stderr)
- sys.exit(1)
-
- ftype_map = {
- "f32": gguf.GGMLQuantizationType.F32,
- "f16": gguf.GGMLQuantizationType.F16,
- }
-
- if args.outfile is not None:
- fname_out = args.outfile
- else:
- # output in the same directory as the model by default
- fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'
-
- print(f"Loading model: {dir_model.name}")
-
- hparams = Model.load_hparams(dir_model)
-
- with torch.inference_mode():
- model_class = Model.from_model_architecture(hparams["architectures"][0])
- model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file)
-
- print("Set model parameters")
- model_instance.set_gguf_parameters()
-
- print("Set model tokenizer")
- model_instance.set_vocab()
-
- if args.vocab_only:
- print(f"Exporting model vocab to '{fname_out}'")
- model_instance.write_vocab()
- else:
- print(f"Exporting model to '{fname_out}'")
- model_instance.write()
-
- print(f"Model successfully exported to '{fname_out}'")
-
-
-if __name__ == '__main__':
- main()
diff --git a/convert-llama-ggml-to-gguf.py b/convert-llama-ggml-to-gguf.py
deleted file mode 100755
index cd9644fcb52..00000000000
--- a/convert-llama-ggml-to-gguf.py
+++ /dev/null
@@ -1,441 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import argparse
-import os
-import struct
-import sys
-from enum import IntEnum
-from pathlib import Path
-
-import numpy as np
-
-if 'NO_LOCAL_GGUF' not in os.environ:
- sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
-import gguf
-
-
-class GGMLFormat(IntEnum):
- GGML = 0
- GGMF = 1
- GGJT = 2
-
-
-class GGMLFType(IntEnum):
- ALL_F32 = 0
- MOSTLY_F16 = 1
- MOSTLY_Q4_0 = 2
- MOSTLY_Q4_1 = 3
- MOSTLY_Q4_1_SOME_F16 = 4
- MOSTLY_Q8_0 = 7
- MOSTLY_Q5_0 = 8
- MOSTLY_Q5_1 = 9
- MOSTLY_Q2_K = 10
- MOSTLY_Q3_K_S = 11
- MOSTLY_Q3_K_M = 12
- MOSTLY_Q3_K_L = 13
- MOSTLY_Q4_K_S = 14
- MOSTLY_Q4_K_M = 15
- MOSTLY_Q5_K_S = 16
- MOSTLY_Q5_K_M = 17
- MOSTLY_Q6_K = 18
-
-
-class Hyperparameters:
- def __init__(self):
- self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
- self.n_layer = self.n_rot = self.n_ff = 0
- self.ftype = GGMLFType.ALL_F32
-
- def set_n_ff(self, model):
- ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
- assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
- ff_tensor = model.tensors[ff_tensor_idx]
- self.n_ff = ff_tensor.dims[1]
-
- def load(self, data, offset):
- (
- self.n_vocab,
- self.n_embd,
- self.n_mult,
- self.n_head,
- self.n_layer,
- self.n_rot,
- ftype,
- ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
- try:
- self.ftype = GGMLFType(ftype)
- except ValueError:
- raise ValueError(f'Invalid ftype {ftype}')
- return 4 * 7
-
- def __str__(self):
- return f''
-
-
-class Vocab:
- def __init__(self, load_scores = True):
- self.items = []
- self.load_scores = load_scores
-
- def load(self, data, offset, n_vocab):
- orig_offset = offset
- for _ in range(n_vocab):
- itemlen = struct.unpack('= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
- assert name_len < 4096, 'Absurd tensor name length'
- quant = gguf.GGML_QUANT_SIZES.get(dtype)
- assert quant is not None, 'Unknown tensor type'
- (blksize, tysize) = quant
- offset += 12
- self.dtype= dtype
- self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
- offset += 4 * n_dims
- self.name = bytes(data[offset:offset + name_len])
- offset += name_len
- pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
- offset += pad
- n_elems = np.prod(self.dims)
- n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
- self.start_offset = offset
- self.len_bytes = n_bytes
- offset += n_bytes
- # print(n_dims, name_len, dtype, self.dims, self.name, pad)
- return offset - orig_offset
-
-
-class GGMLModel:
- def __init__(self):
- self.hyperparameters = None
- self.vocab = None
- self.tensor_map = {}
- self.tensors = []
-
- def validate_header(self, data, offset):
- magic = bytes(data[offset:offset + 4])
- if magic == b'GGUF':
- raise ValueError('File is already in GGUF format.')
- if magic == b'lmgg':
- self.file_format = GGMLFormat.GGML
- self.format_version = 1
- return 4
- version = struct.unpack(' 3:
- raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
- self.file_format = GGMLFormat.GGJT
- self.format_version = version
- return 8
- raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")
-
- def validate_conversion(self, ftype):
- err = ''
- if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
- if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
- err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
- elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
- if ftype in (GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
- GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
- err = 'Q4 and Q8 quantizations changed in GGJTv3.'
- if len(err) > 0:
- raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
-
- def load(self, data, offset):
- offset += self.validate_header(data, offset)
- hp = Hyperparameters()
- offset += hp.load(data, offset)
- print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
- self.validate_conversion(hp.ftype)
- vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
- offset += vocab.load(data, offset, hp.n_vocab)
- tensors: list[Tensor] = []
- tensor_map = {}
- while offset < len(data):
- tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
- offset += tensor.load(data, offset)
- tensor_map[tensor.name] = len(tensors)
- tensors.append(tensor)
- self.hyperparameters = hp
- self.vocab = vocab
- self.tensors = tensors
- self.tensor_map = tensor_map
- hp.set_n_ff(self)
- return offset
-
-
-class GGMLToGGUF:
- def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
- hp = ggml_model.hyperparameters
- self.model = ggml_model
- self.data = data
- self.cfg = cfg
- self.params_override = params_override
- self.vocab_override = vocab_override
- self.special_vocab = special_vocab
- if params_override is not None:
- n_kv_head = params_override.n_head_kv
- else:
- if cfg.gqa == 1:
- n_kv_head = hp.n_head
- else:
- gqa = float(cfg.gqa)
- n_kv_head = None
- for x in range(1, 256):
- if float(hp.n_head) / float(x) == gqa:
- n_kv_head = x
- assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
- print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
- self.n_kv_head = n_kv_head
- self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
-
- def save(self):
- print('* Preparing to save GGUF file')
- gguf_writer = gguf.GGUFWriter(
- self.cfg.output,
- gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
- use_temp_file = False)
- self.add_params(gguf_writer)
- self.add_vocab(gguf_writer)
- if self.special_vocab is not None:
- self.special_vocab.add_to_gguf(gguf_writer)
- self.add_tensors(gguf_writer)
- print(" gguf: write header")
- gguf_writer.write_header_to_file()
- print(" gguf: write metadata")
- gguf_writer.write_kv_data_to_file()
- print(" gguf: write tensors")
- gguf_writer.write_tensors_to_file()
- gguf_writer.close()
-
- def add_params(self, gguf_writer):
- hp = self.model.hyperparameters
- cfg = self.cfg
- if cfg.desc is not None:
- desc = cfg.desc
- else:
- desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format'
- try:
- # Filenames aren't necessarily valid UTF8.
- name = cfg.name if cfg.name is not None else cfg.input.name
- except UnicodeDecodeError:
- name = None
- print('* Adding model parameters and KV items')
- if name is not None:
- gguf_writer.add_name(name)
- gguf_writer.add_description(desc)
- gguf_writer.add_file_type(int(hp.ftype))
- if self.params_override is not None:
- po = self.params_override
- assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
- assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
- assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
- gguf_writer.add_context_length (po.n_ctx)
- gguf_writer.add_embedding_length (po.n_embd)
- gguf_writer.add_block_count (po.n_layer)
- gguf_writer.add_feed_forward_length (po.n_ff)
- gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
- gguf_writer.add_head_count (po.n_head)
- gguf_writer.add_head_count_kv (po.n_head_kv)
- gguf_writer.add_layer_norm_rms_eps (po.f_norm_eps)
- return
- gguf_writer.add_context_length(cfg.context_length)
- gguf_writer.add_embedding_length(hp.n_embd)
- gguf_writer.add_block_count(hp.n_layer)
- gguf_writer.add_feed_forward_length(hp.n_ff)
- gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
- gguf_writer.add_head_count(hp.n_head)
- gguf_writer.add_head_count_kv(self.n_kv_head)
- gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
-
- def add_vocab(self, gguf_writer):
- hp = self.model.hyperparameters
- gguf_writer.add_tokenizer_model('llama')
- tokens = []
- scores = []
- toktypes = []
- if self.vocab_override is not None:
- vo = self.vocab_override
- print('* Adding vocab item(s)')
- for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
- tokens.append(vbytes)
- scores.append(score)
- toktypes.append(ttype)
- assert len(tokens) == hp.n_vocab, \
- f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
- gguf_writer.add_token_list(tokens)
- gguf_writer.add_token_scores(scores)
- if len(toktypes) > 0:
- gguf_writer.add_token_types(toktypes)
- return
- print(f'* Adding {hp.n_vocab} vocab item(s)')
- assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
- for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
- tt = 1 # Normal
- # Special handling for UNK, BOS, EOS tokens.
- if tokid <= 2:
- if tokid == 0:
- vbytes = b''
- tt = 2
- elif tokid == 1:
- vbytes = b''
- tt = 3
- else:
- vbytes = b''
- tt = 3
- elif len(vbytes) == 0:
- tt = 3 # Control
- elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
- vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
- tt = 6 # Byte
- else:
- vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
- toktypes.append(tt)
- tokens.append(vbytes)
- scores.append(vscore)
- gguf_writer.add_token_list(tokens)
- gguf_writer.add_token_scores(scores)
- gguf_writer.add_token_types(toktypes)
- gguf_writer.add_unk_token_id(0)
- gguf_writer.add_bos_token_id(1)
- gguf_writer.add_eos_token_id(2)
-
- def add_tensors(self, gguf_writer):
- tensor_map = self.name_map
- data = self.data
- print(f'* Adding {len(self.model.tensors)} tensor(s)')
- for tensor in self.model.tensors:
- name = str(tensor.name, 'UTF-8')
- mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
- assert mapped_name is not None, f'Bad name {name}'
- tempdims = list(tensor.dims[:])
- if len(tempdims) > 1:
- temp = tempdims[1]
- tempdims[1] = tempdims[0]
- tempdims[0] = temp
- # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
- gguf_writer.add_tensor(
- mapped_name,
- data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
- raw_shape = tempdims,
- raw_dtype = tensor.dtype)
-
-
-def handle_metadata(cfg, hp):
- import convert
- assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
- hf_config_path = cfg.model_metadata_dir / "config.json"
- orig_config_path = cfg.model_metadata_dir / "params.json"
- # We pass a fake model here. "original" mode will check the shapes of some
- # tensors if information is missing in the .json file: other than that, the
- # model data isn't used so this should be safe (at least for now).
- fakemodel = {
- 'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
- 'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
- }
- fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
- fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
- if hf_config_path.exists():
- params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
- elif orig_config_path.exists():
- params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
- else:
- raise ValueError('Unable to load metadata')
- vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
- vocab_factory = convert.VocabFactory(vocab_path)
- vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype.split(","), cfg.model_metadata_dir)
- convert.check_vocab_size(params, vocab)
- return params, vocab, special_vocab
-
-
-def handle_args():
- parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
- parser.add_argument('--input', '-i', type = Path, required = True,
- help = 'Input GGMLv3 filename')
- parser.add_argument('--output', '-o', type = Path, required = True,
- help ='Output GGUF filename')
- parser.add_argument('--name',
- help = 'Set model name')
- parser.add_argument('--desc',
- help = 'Set model description')
- parser.add_argument('--gqa', type = int, default = 1,
- help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
- parser.add_argument('--eps', default = '5.0e-06',
- help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
- parser.add_argument('--context-length', '-c', type=int, default = 2048,
- help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
- parser.add_argument('--model-metadata-dir', '-m', type = Path,
- help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
- parser.add_argument("--vocab-dir", type=Path,
- help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
- parser.add_argument("--vocabtype", default="spm,hfft",
- help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
- return parser.parse_args()
-
-
-def main():
- cfg = handle_args()
- print(f'* Using config: {cfg}')
- print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
- if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
- print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
- data = np.memmap(cfg.input, mode = 'r')
- model = GGMLModel()
- print('* Scanning GGML input file')
- offset = model.load(data, 0) # noqa
- print(f'* GGML model hyperparameters: {model.hyperparameters}')
- vocab_override = None
- params_override = None
- special_vocab = None
- if cfg.model_metadata_dir is not None:
- (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
- print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
- print(f'* Overriding params: {params_override}')
- print(f'* Overriding vocab: {vocab_override}')
- print(f'* Special vocab: {special_vocab}')
- else:
- print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
- if model.file_format == GGMLFormat.GGML:
- print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
- converter = GGMLToGGUF(
- model, data, cfg,
- params_override = params_override,
- vocab_override = vocab_override,
- special_vocab = special_vocab
- )
- converter.save()
- print(f'* Successful completion. Output saved to: {cfg.output}')
-
-
-if __name__ == '__main__':
- main()
diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py
deleted file mode 100755
index 9a9936dec8b..00000000000
--- a/convert-lora-to-ggml.py
+++ /dev/null
@@ -1,148 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import json
-import os
-import struct
-import sys
-from pathlib import Path
-from typing import Any, BinaryIO, Sequence
-
-import numpy as np
-import torch
-
-if 'NO_LOCAL_GGUF' not in os.environ:
- sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
-import gguf
-
-NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
-
-
-def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
- fout.write(b"ggla"[::-1]) # magic (ggml lora)
- fout.write(struct.pack("i", 1)) # file version
- fout.write(struct.pack("i", params["r"]))
- # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
- # but some models ship a float value instead
- # let's convert to int, but fail if lossless conversion is not possible
- assert (
- int(params["lora_alpha"]) == params["lora_alpha"]
- ), "cannot convert float to int losslessly"
- fout.write(struct.pack("i", int(params["lora_alpha"])))
-
-
-def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
- sname = name.encode("utf-8")
- fout.write(
- struct.pack(
- "iii",
- len(shape),
- len(sname),
- NUMPY_TYPE_TO_FTYPE[data_type.name],
- )
- )
- fout.write(struct.pack("i" * len(shape), *shape[::-1]))
- fout.write(sname)
- fout.seek((fout.tell() + 31) & -32)
-
-
-if __name__ == '__main__':
- if len(sys.argv) < 2:
- print(f"Usage: python {sys.argv[0]} [arch]")
- print(
- "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
- )
- print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
- sys.exit(1)
-
- input_json = os.path.join(sys.argv[1], "adapter_config.json")
- input_model = os.path.join(sys.argv[1], "adapter_model.bin")
- output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
-
- if os.path.exists(input_model):
- model = torch.load(input_model, map_location="cpu")
- else:
- input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
- # lazy import load_file only if lora is in safetensors format.
- from safetensors.torch import load_file
- model = load_file(input_model, device="cpu")
-
- arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
-
- if arch_name not in gguf.MODEL_ARCH_NAMES.values():
- print(f"Error: unsupported architecture {arch_name}")
- sys.exit(1)
-
- arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
- name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
-
- with open(input_json, "r") as f:
- params = json.load(f)
-
- if params["peft_type"] != "LORA":
- print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
- sys.exit(1)
-
- if params["fan_in_fan_out"] is True:
- print("Error: param fan_in_fan_out is not supported")
- sys.exit(1)
-
- if params["bias"] is not None and params["bias"] != "none":
- print("Error: param bias is not supported")
- sys.exit(1)
-
- # TODO: these seem to be layers that have been trained but without lora.
- # doesn't seem widely used but eventually should be supported
- if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
- print("Error: param modules_to_save is not supported")
- sys.exit(1)
-
- with open(output_path, "wb") as fout:
- fout.truncate()
-
- write_file_header(fout, params)
- for k, v in model.items():
- orig_k = k
- if k.endswith(".default.weight"):
- k = k.replace(".default.weight", ".weight")
- if k in ["llama_proj.weight", "llama_proj.bias"]:
- continue
- if k.endswith("lora_A.weight"):
- if v.dtype != torch.float16 and v.dtype != torch.float32:
- v = v.float()
- v = v.T
- else:
- v = v.float()
-
- t = v.detach().numpy()
-
- prefix = "base_model.model."
- if k.startswith(prefix):
- k = k[len(prefix) :]
-
- lora_suffixes = (".lora_A.weight", ".lora_B.weight")
- if k.endswith(lora_suffixes):
- suffix = k[-len(lora_suffixes[0]):]
- k = k[: -len(lora_suffixes[0])]
- else:
- print(f"Error: unrecognized tensor name {orig_k}")
- sys.exit(1)
-
- tname = name_map.get_name(k)
- if tname is None:
- print(f"Error: could not map tensor name {orig_k}")
- print(" Note: the arch parameter must be specified if the model is not llama")
- sys.exit(1)
-
- if suffix == ".lora_A.weight":
- tname += ".weight.loraA"
- elif suffix == ".lora_B.weight":
- tname += ".weight.loraB"
- else:
- assert False
-
- print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
- write_tensor_header(fout, tname, t.shape, t.dtype)
- t.tofile(fout)
-
- print(f"Converted {input_json} and {input_model} to {output_path}")
diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py
deleted file mode 100755
index 69be17f94ef..00000000000
--- a/convert-persimmon-to-gguf.py
+++ /dev/null
@@ -1,138 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import argparse
-import os
-import sys
-from pathlib import Path
-from pprint import pprint
-
-import torch
-from sentencepiece import SentencePieceProcessor
-
-if 'NO_LOCAL_GGUF' not in os.environ:
- sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
-import gguf
-
-
-def _flatten_dict(dct, tensors, prefix=None):
- assert isinstance(dct, dict)
- for key in dct.keys():
- new_prefix = prefix + '.' + key if prefix is not None else key
- if isinstance(dct[key], torch.Tensor):
- tensors[new_prefix] = dct[key]
- elif isinstance(dct[key], dict):
- _flatten_dict(dct[key], tensors, new_prefix)
- else:
- raise ValueError(type(dct[key]))
- return None
-
-
-def _get_sentencepiece_tokenizer_info(dir_model: Path):
- tokenizer_path = dir_model / 'adept_vocab.model'
- print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
- tokenizer = SentencePieceProcessor(str(tokenizer_path))
- print('gguf: adding tokens')
- tokens: list[bytes] = []
- scores: list[float] = []
- toktypes: list[int] = []
-
- for i in range(tokenizer.vocab_size()):
- text: bytes
- score: float
-
- piece = tokenizer.id_to_piece(i)
- text = piece.encode("utf-8")
- score = tokenizer.get_score(i)
-
- toktype = 1
- if tokenizer.is_unknown(i):
- toktype = 2
- if tokenizer.is_control(i):
- toktype = 3
- if tokenizer.is_unused(i):
- toktype = 5
- if tokenizer.is_byte(i):
- toktype = 6
-
- tokens.append(text)
- scores.append(score)
- toktypes.append(toktype)
- pass
- return tokens, scores, toktypes
-
-
-def main():
- parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
- parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
- parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")
- parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")
- parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
- args = parser.parse_args()
- sys.path.append(str(args.adept_inference_dir))
- persimmon_model = torch.load(args.ckpt_path)
- hparams = persimmon_model['args']
- pprint(hparams)
- tensors: dict[str, torch.Tensor] = {}
- _flatten_dict(persimmon_model['model'], tensors, None)
-
- arch = gguf.MODEL_ARCH.PERSIMMON
- gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])
-
- block_count = hparams.num_layers
- head_count = hparams.num_attention_heads
- head_count_kv = head_count
- ctx_length = hparams.seq_length
- hidden_size = hparams.hidden_size
-
- gguf_writer.add_name('persimmon-8b-chat')
- gguf_writer.add_context_length(ctx_length)
- gguf_writer.add_embedding_length(hidden_size)
- gguf_writer.add_block_count(block_count)
- gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
- # ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
- gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
- gguf_writer.add_head_count(head_count)
- gguf_writer.add_head_count_kv(head_count_kv)
- gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
- gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)
-
- tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
- gguf_writer.add_tokenizer_model('llama')
- gguf_writer.add_token_list(tokens)
- gguf_writer.add_token_scores(scores)
- gguf_writer.add_token_types(toktypes)
- gguf_writer.add_bos_token_id(71013)
- gguf_writer.add_eos_token_id(71013)
-
- tensor_map = gguf.get_tensor_name_map(arch, block_count)
- print(tensor_map)
- for name in tensors.keys():
- data_torch = tensors[name]
- if name.endswith(".self_attention.rotary_emb.inv_freq"):
- continue
- old_dtype = data_torch.dtype
- # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
- data = data_torch.to(torch.float32).squeeze().numpy()
- new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
- if new_name is None:
- print("Can not map tensor '" + name + "'")
- sys.exit()
- n_dims = len(data.shape)
- print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
- gguf_writer.add_tensor(new_name, data)
- print("gguf: write header")
- gguf_writer.write_header_to_file()
- print("gguf: write metadata")
- gguf_writer.write_kv_data_to_file()
- print("gguf: write tensors")
- gguf_writer.write_tensors_to_file()
-
- gguf_writer.close()
-
- print(f"gguf: model successfully exported to '{args.outfile}'")
- print("")
-
-
-if __name__ == '__main__':
- main()
diff --git a/convert.py b/convert.py
deleted file mode 100755
index 1c700cf6a3d..00000000000
--- a/convert.py
+++ /dev/null
@@ -1,1555 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import argparse
-import concurrent.futures
-import enum
-import faulthandler
-import functools
-import itertools
-import json
-import math
-import mmap
-import os
-import pickle
-import re
-import signal
-import struct
-import sys
-import textwrap
-import time
-import zipfile
-from abc import ABC, abstractmethod
-from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
-from dataclasses import dataclass
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable
-
-import numpy as np
-from sentencepiece import SentencePieceProcessor
-
-if 'NO_LOCAL_GGUF' not in os.environ:
- sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
-import gguf
-
-if TYPE_CHECKING:
- from typing_extensions import Self, TypeAlias
-
-if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
- faulthandler.register(signal.SIGUSR1)
-
-NDArray: TypeAlias = 'np.ndarray[Any, Any]'
-
-ARCH = gguf.MODEL_ARCH.LLAMA
-
-DEFAULT_CONCURRENCY = 8
-
-ADDED_TOKENS_FILE = 'added_tokens.json'
-FAST_TOKENIZER_FILE = 'tokenizer.json'
-
-#
-# data types
-#
-
-
-@dataclass(frozen=True)
-class DataType:
- name: str
- dtype: np.dtype[Any]
- valid_conversions: list[str]
-
- def elements_to_bytes(self, n_elements: int) -> int:
- return n_elements * self.dtype.itemsize
-
-
-@dataclass(frozen=True)
-class UnquantizedDataType(DataType):
- pass
-
-
-DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
-DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
-DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
-DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
-
-
-@dataclass(frozen=True)
-class QuantizedDataType(DataType):
- block_size: int
- quantized_dtype: np.dtype[Any]
- ggml_type: gguf.GGMLQuantizationType
-
- def quantize(self, arr: NDArray) -> NDArray:
- raise NotImplementedError(f'Quantization for {self.name} not implemented')
-
- def elements_to_bytes(self, n_elements: int) -> int:
- assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
- return self.quantized_dtype.itemsize * (n_elements // self.block_size)
-
-
-@dataclass(frozen=True)
-class Q8_0QuantizedDataType(QuantizedDataType):
- # Mini Q8_0 quantization in Python!
- def quantize(self, arr: NDArray) -> NDArray:
- assert arr.size % self.block_size == 0 and arr.size != 0, f'Bad array size {arr.size}'
- assert arr.dtype == np.float32, f'Bad array type {arr.dtype}'
- n_blocks = arr.size // self.block_size
- blocks = arr.reshape((n_blocks, self.block_size))
- # Much faster implementation of block quantization contributed by @Cebtenzzre
-
- def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
- d = abs(blocks).max(axis = 1) / np.float32(127)
- with np.errstate(divide = 'ignore'):
- qs = (blocks / d[:, None]).round()
- qs[d == 0] = 0
- yield from zip(d, qs)
- return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
-
-
-DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
- dtype = np.dtype(np.float32), valid_conversions = [],
- ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
- quantized_dtype = np.dtype([('d', ' DataType:
- dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
- if dt is None:
- raise ValueError(self)
- # Convert all 1D tensors to F32. Most of the codebase that takes in 1D tensors only handles F32 tensors, and most of the outputs tensors are F32.
- # Also The 1d tensors aren't much of a performance/size issue. So instead of having to have separate F32 and F16 implementations of both, just convert everything to F32 for now.
- return dt if len(tensor.shape) > 1 else DT_F32
-
-
-GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
- GGMLFileType.AllF32 : DT_F32,
- GGMLFileType.MostlyF16 : DT_F16,
- GGMLFileType.MostlyQ8_0: DT_Q8_0,
-}
-
-#
-# hparams loading
-#
-
-
-@dataclass
-class Params:
- n_vocab: int
- n_embd: int
- n_layer: int
- n_ctx: int
- n_ff: int
- n_head: int
- n_head_kv: int
- n_experts: int | None = None
- n_experts_used: int | None = None
- f_norm_eps: float | None = None
-
- rope_scaling_type: gguf.RopeScalingType | None = None
- f_rope_freq_base: float | None = None
- f_rope_scale: float | None = None
- n_orig_ctx: int | None = None
- rope_finetuned: bool | None = None
-
- ftype: GGMLFileType | None = None
-
- # path to the directory containing the model files
- path_model: Path | None = None
-
- @staticmethod
- def guessed(model: LazyModel) -> Params:
- # try transformer naming first
- n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
-
- # try transformer naming first
- if "model.layers.0.self_attn.q_proj.weight" in model:
- n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
- elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
- n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
- else:
- n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
-
- if n_layer < 1:
- msg = """\
- failed to guess 'n_layer'. This model is unknown or unsupported.
- Suggestion: provide 'config.json' of the model in the same directory containing model files."""
- raise KeyError(textwrap.dedent(msg))
-
- n_head = n_embd // 128 # guessed
- n_mult = 256 # guessed
-
- # TODO: verify this
- n_ff = int(2 * (4 * n_embd) / 3)
- n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)
-
- return Params(
- n_vocab = n_vocab,
- n_embd = n_embd,
- n_layer = n_layer,
- n_ctx = -1,
- n_ff = n_ff,
- n_head = n_head,
- n_head_kv = n_head,
- f_norm_eps = 1e-5,
- )
-
- @staticmethod
- def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
- with open(config_path) as f:
- config = json.load(f)
-
- rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
- rope_scaling = config.get("rope_scaling")
-
- if rope_scaling is not None and (typ := rope_scaling.get("type")):
- rope_factor = rope_scaling.get("factor")
- f_rope_scale = rope_factor
- if typ == "linear":
- rope_scaling_type = gguf.RopeScalingType.LINEAR
- elif typ == "yarn":
- rope_scaling_type = gguf.RopeScalingType.YARN
- n_orig_ctx = rope_scaling['original_max_position_embeddings']
- rope_finetuned = rope_scaling['finetuned']
- else:
- raise NotImplementedError(f'Unknown rope scaling type: {typ}')
-
- if "max_sequence_length" in config:
- n_ctx = config["max_sequence_length"]
- elif "max_position_embeddings" in config:
- n_ctx = config["max_position_embeddings"]
- else:
- msg = """\
- failed to guess 'n_ctx'. This model is unknown or unsupported.
- Suggestion: provide 'config.json' of the model in the same directory containing model files."""
- raise KeyError(textwrap.dedent(msg))
-
- n_experts = None
- n_experts_used = None
-
- if "num_local_experts" in config:
- n_experts = config["num_local_experts"]
- n_experts_used = config["num_experts_per_tok"]
-
- return Params(
- n_vocab = config["vocab_size"],
- n_embd = config["hidden_size"],
- n_layer = config["num_hidden_layers"],
- n_ctx = n_ctx,
- n_ff = config["intermediate_size"],
- n_head = (n_head := config["num_attention_heads"]),
- n_head_kv = config.get("num_key_value_heads", n_head),
- n_experts = n_experts,
- n_experts_used = n_experts_used,
- f_norm_eps = config["rms_norm_eps"],
- f_rope_freq_base = config.get("rope_theta"),
- rope_scaling_type = rope_scaling_type,
- f_rope_scale = f_rope_scale,
- n_orig_ctx = n_orig_ctx,
- rope_finetuned = rope_finetuned,
- )
-
- # LLaMA v2 70B params.json
- # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
- @staticmethod
- def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
- with open(config_path) as f:
- config = json.load(f)
-
- n_experts = None
- n_experts_used = None
- f_rope_freq_base = None
-
- # hack to determine LLaMA v1 vs v2 vs CodeLlama
- if config.get("moe"):
- # Mixtral
- n_ctx = 32768
- elif config.get("rope_theta") == 1000000:
- # CodeLlama
- n_ctx = 16384
- elif config["norm_eps"] == 1e-05:
- # LLaMA v2
- n_ctx = 4096
- else:
- # LLaMA v1
- n_ctx = 2048
-
- if "layers.0.feed_forward.w1.weight" in model:
- n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
-
- if config.get("moe"):
- n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
- n_experts = config["moe"]["num_experts"]
- n_experts_used = config["moe"]["num_experts_per_tok"]
- f_rope_freq_base = 1e6
-
- return Params(
- n_vocab = model["tok_embeddings.weight"].shape[0],
- n_embd = config["dim"],
- n_layer = config["n_layers"],
- n_ctx = n_ctx,
- n_ff = n_ff,
- n_head = (n_head := config["n_heads"]),
- n_head_kv = config.get("n_kv_heads", n_head),
- n_experts = n_experts,
- n_experts_used = n_experts_used,
- f_norm_eps = config["norm_eps"],
- f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
- )
-
- @staticmethod
- def load(model_plus: ModelPlus) -> Params:
- hf_config_path = model_plus.paths[0].parent / "config.json"
- orig_config_path = model_plus.paths[0].parent / "params.json"
-
- if hf_config_path.exists():
- params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
- elif orig_config_path.exists():
- params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
- elif model_plus.format != 'none':
- params = Params.guessed(model_plus.model)
- else:
- raise ValueError('Cannot guess params when model format is none')
-
- params.path_model = model_plus.paths[0].parent
-
- return params
-
-
-#
-# vocab
-#
-
-@runtime_checkable
-class BaseVocab(Protocol):
- tokenizer_model: ClassVar[str]
- name: ClassVar[str]
-
-
-class NoVocab(BaseVocab):
- tokenizer_model = "no_vocab"
- name = "no_vocab"
-
- def __repr__(self) -> str:
- return ""
-
-
-@runtime_checkable
-class Vocab(BaseVocab, Protocol):
- vocab_size: int
- added_tokens_dict: dict[str, int]
- added_tokens_list: list[str]
- fname_tokenizer: Path
-
- def __init__(self, base_path: Path): ...
- def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
-
-
-class BpeVocab(Vocab):
- tokenizer_model = "gpt2"
- name = "bpe"
-
- def __init__(self, base_path: Path):
- added_tokens: dict[str, int] = {}
-
- if (fname_tokenizer := base_path / 'vocab.json').exists():
- # "slow" tokenizer
- with open(fname_tokenizer, encoding="utf-8") as f:
- self.vocab = json.load(f)
-
- try:
- # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
- with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
- added_tokens = json.load(f)
- except FileNotFoundError:
- pass
- else:
- # "fast" tokenizer
- fname_tokenizer = base_path / FAST_TOKENIZER_FILE
-
- # if this fails, FileNotFoundError propagates to caller
- with open(fname_tokenizer, encoding="utf-8") as f:
- tokenizer_json = json.load(f)
-
- tokenizer_model: dict[str, Any] = tokenizer_json['model']
- if (
- tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
- or tokenizer_json['decoder']['type'] != 'ByteLevel'
- ):
- raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
-
- self.vocab = tokenizer_model["vocab"]
-
- if (added := tokenizer_json.get('added_tokens')) is not None:
- # Added tokens here can be duplicates of the main vocabulary.
- added_tokens = {item['content']: item['id']
- for item in added
- if item['content'] not in self.vocab}
-
- vocab_size = len(self.vocab)
- expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
- actual_ids = sorted(added_tokens.values())
- if expected_ids != actual_ids:
- expected_end_id = vocab_size + len(actual_ids) - 1
- raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
- f"{vocab_size} - {expected_end_id}; got {actual_ids}")
-
- items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
- self.added_tokens_dict = added_tokens
- self.added_tokens_list = [text for (text, idx) in items]
- self.vocab_size_base = vocab_size
- self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
- self.fname_tokenizer = fname_tokenizer
-
- def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
-
- for i, _ in enumerate(self.vocab):
- yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
-
- def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- for text in self.added_tokens_list:
- score = -1000.0
- yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
-
- def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- yield from self.bpe_tokens()
- yield from self.added_tokens()
-
- def __repr__(self) -> str:
- return f""
-
-
-class SentencePieceVocab(Vocab):
- tokenizer_model = "llama"
- name = "spm"
-
- def __init__(self, base_path: Path):
- added_tokens: dict[str, int] = {}
- if (fname_tokenizer := base_path / 'tokenizer.model').exists():
- # normal location
- try:
- with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
- added_tokens = json.load(f)
- except FileNotFoundError:
- pass
- elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
- # not found in alternate location either
- raise FileNotFoundError('Cannot find tokenizer.model')
-
- self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
- vocab_size = self.sentencepiece_tokenizer.vocab_size()
-
- new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
- expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
- actual_new_ids = sorted(new_tokens.keys())
-
- if expected_new_ids != actual_new_ids:
- raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
-
- # Token pieces that were added to the base vocabulary.
- self.added_tokens_dict = added_tokens
- self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
- self.vocab_size_base = vocab_size
- self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
- self.fname_tokenizer = fname_tokenizer
-
- def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- tokenizer = self.sentencepiece_tokenizer
- for i in range(tokenizer.vocab_size()):
- piece = tokenizer.id_to_piece(i)
- text = piece.encode("utf-8")
- score: float = tokenizer.get_score(i)
-
- toktype = gguf.TokenType.NORMAL
- if tokenizer.is_unknown(i):
- toktype = gguf.TokenType.UNKNOWN
- if tokenizer.is_control(i):
- toktype = gguf.TokenType.CONTROL
-
- # NOTE: I think added_tokens are user defined.
- # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
- # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
-
- if tokenizer.is_unused(i):
- toktype = gguf.TokenType.UNUSED
- if tokenizer.is_byte(i):
- toktype = gguf.TokenType.BYTE
-
- yield text, score, toktype
-
- def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- for text in self.added_tokens_list:
- score = -1000.0
- yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
-
- def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- yield from self.sentencepiece_tokens()
- yield from self.added_tokens()
-
- def __repr__(self) -> str:
- return f""
-
-
-class LlamaHfVocab(Vocab):
- tokenizer_model = "llama"
- name = "hfft"
-
- def __init__(self, base_path: Path):
- fname_tokenizer = base_path / FAST_TOKENIZER_FILE
- # if this fails, FileNotFoundError propagates to caller
- with open(fname_tokenizer, encoding='utf-8') as f:
- tokenizer_json = json.load(f)
-
- # pre-check so we know if we need transformers
- tokenizer_model: dict[str, Any] = tokenizer_json['model']
- is_llama3 = (
- tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
- and not tokenizer_model.get('byte_fallback', True)
- )
- if is_llama3:
- raise TypeError('Llama 3 must be converted with BpeVocab')
-
- if not is_llama3 and (
- tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
- or tokenizer_json['decoder']['type'] != 'Sequence'
- ):
- raise FileNotFoundError('Cannot find Llama BPE tokenizer')
-
- try:
- from transformers import AutoTokenizer
- except ImportError as e:
- raise ImportError(
- "To use LlamaHfVocab, please install the `transformers` package. "
- "You can install it with `pip install transformers`."
- ) from e
-
- # Allow the tokenizer to default to slow or fast versions.
- # Explicitly set tokenizer to use local paths.
- self.tokenizer = AutoTokenizer.from_pretrained(
- base_path,
- cache_dir=base_path,
- local_files_only=True,
- )
- assert self.tokenizer.is_fast # assume tokenizer.json is used
-
- # Initialize lists and dictionaries for added tokens
- self.added_tokens_list = []
- self.added_tokens_dict = dict()
- self.added_tokens_ids = set()
-
- # Process added tokens
- for tok, tokidx in sorted(
- self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
- ):
- # Only consider added tokens that are not in the base vocabulary
- if tokidx >= self.tokenizer.vocab_size:
- self.added_tokens_list.append(tok)
- self.added_tokens_dict[tok] = tokidx
- self.added_tokens_ids.add(tokidx)
-
- # Store special tokens and their IDs
- self.specials = {
- tok: self.tokenizer.get_vocab()[tok]
- for tok in self.tokenizer.all_special_tokens
- }
- self.special_ids = set(self.tokenizer.all_special_ids)
-
- # Set vocabulary sizes
- self.vocab_size_base = self.tokenizer.vocab_size
- self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
-
- self.fname_tokenizer = fname_tokenizer
-
- def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- reverse_vocab = {
- id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
- }
-
- for token_id in range(self.vocab_size_base):
- # Skip processing added tokens here
- if token_id in self.added_tokens_ids:
- continue
-
- # Convert token text to bytes
- token_text = reverse_vocab[token_id].encode("utf-8")
-
- # Yield token text, score, and type
- yield token_text, self.get_token_score(token_id), self.get_token_type(
- token_id, token_text, self.special_ids # Reuse already stored special IDs
- )
-
- def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
- # Special case for byte tokens
- if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
- return gguf.TokenType.BYTE
-
- # Determine token type based on whether it's a special token
- return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
-
- def get_token_score(self, token_id: int) -> float:
- # Placeholder for actual logic to determine the token's score
- # This needs to be implemented based on specific requirements
- return -1000.0 # Default score
-
- def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- for text in self.added_tokens_list:
- if text in self.specials:
- toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
- score = self.get_token_score(self.specials[text])
- else:
- toktype = gguf.TokenType.USER_DEFINED
- score = -1000.0
-
- yield text.encode("utf-8"), score, toktype
-
- def has_newline_token(self):
- return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
-
- def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- yield from self.hf_tokens()
- yield from self.added_tokens()
-
- def __repr__(self) -> str:
- return f""
-
-
-#
-# data loading
-# TODO: reuse (probably move to gguf.py?)
-#
-
-
-def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
- # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
- if n_head_kv is not None and n_head != n_head_kv:
- n_head = n_head_kv
- return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
- .swapaxes(1, 2)
- .reshape(weights.shape))
-
-
-class Tensor(ABC):
- ndarray: NDArray
- data_type: DataType
-
- @abstractmethod
- def astype(self, data_type: DataType) -> Self: ...
- @abstractmethod
- def permute(self, n_head: int, n_head_kv: int) -> Self: ...
- @abstractmethod
- def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> Self: ...
- @abstractmethod
- def part(self, n_part: int) -> Self: ...
- @abstractmethod
- def to_ggml(self) -> GGMLCompatibleTensor: ...
-
-
-def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
- assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
- fp32_arr = bf16_arr.astype(np.uint32) << 16
- return fp32_arr.view(np.float32)
-
-
-class UnquantizedTensor(Tensor):
- def __init__(self, ndarray: NDArray):
- assert isinstance(ndarray, np.ndarray)
- self.ndarray = ndarray
- self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype]
-
- def astype(self, data_type: DataType) -> UnquantizedTensor:
- dtype = data_type.dtype
- if self.data_type == DT_BF16:
- self.ndarray = bf16_to_fp32(self.ndarray)
- return UnquantizedTensor(self.ndarray.astype(dtype))
-
- def to_ggml(self) -> Self:
- return self
-
- def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
- r = self.ndarray.shape[0] // 3
- return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
-
- def part(self, n_part: int) -> UnquantizedTensor:
- r = self.ndarray.shape[0] // 3
- return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
-
- def permute(self, n_head: int, n_head_kv: int) -> UnquantizedTensor:
- return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv))
-
-
-def load_unquantized(lazy_tensor: LazyTensor, expected_dtype: Any = None, convert: bool = False) -> NDArray:
- tensor = lazy_tensor.load()
- assert isinstance(tensor, UnquantizedTensor)
-
- # double-check:
- actual_shape = list(tensor.ndarray.shape)
- assert actual_shape == lazy_tensor.shape, (actual_shape, lazy_tensor.shape)
- if expected_dtype is not None and expected_dtype != tensor.ndarray.dtype:
- if convert:
- tensor.ndarray = tensor.ndarray.astype(expected_dtype)
- else:
- raise ValueError(f'expected this tensor to have dtype {expected_dtype}, got {tensor.ndarray.dtype}')
-
- return tensor.ndarray
-
-
-GGMLCompatibleTensor = UnquantizedTensor
-
-
-@dataclass
-class LazyTensor:
- _load: Callable[[], Tensor]
- shape: list[int]
- data_type: DataType
- description: str
-
- def load(self) -> Tensor:
- ret = self._load()
- # Should be okay if it maps to the same numpy type?
- assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
- (self.data_type, ret.data_type, self.description)
- return ret
-
- def astype(self, data_type: DataType) -> LazyTensor:
- self.validate_conversion_to(data_type)
-
- def load() -> Tensor:
- return self.load().astype(data_type)
- return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
-
- def validate_conversion_to(self, data_type: DataType) -> None:
- if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions:
- raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.')
-
-
-LazyModel: TypeAlias = 'dict[str, LazyTensor]'
-
-
-@dataclass
-class ModelPlus:
- model: LazyModel
- paths: list[Path] # Where this was read from.
- format: Literal['ggml', 'torch', 'safetensors', 'none']
- vocab: BaseVocab | None # For GGML models (which have vocab built in), the vocab.
-
-
-def merge_sharded(models: list[LazyModel]) -> LazyModel:
- # Original LLaMA models have each file contain one part of each tensor.
- # Use a dict instead of a set to preserve order.
- names = {name: None for model in models for name in model}
-
- def convert(name: str) -> LazyTensor:
- lazy_tensors = [model[name] for model in models]
- if len(lazy_tensors) == 1:
- # only one file; don't go through this procedure since there might
- # be quantized tensors
- return lazy_tensors[0]
- if len(lazy_tensors[0].shape) == 1:
- # the tensor is just duplicated in every file
- return lazy_tensors[0]
- if name.startswith('tok_embeddings.') or \
- name.endswith('.attention.wo.weight') or \
- name.endswith('.feed_forward.w2.weight'):
- # split by columns
- axis = 1
- else:
- # split by rows
- axis = 0
- concatenated_shape = list(lazy_tensors[0].shape)
- concatenated_shape[axis] = sum(tensor.shape[axis] for tensor in lazy_tensors)
-
- def load() -> UnquantizedTensor:
- ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors]
- concatenated = np.concatenate(ndarrays, axis=axis)
- return UnquantizedTensor(concatenated)
- description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]'
- return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description)
- return {name: convert(name) for name in names}
-
-
-def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
- formats = set(mp.format for mp in models_plus)
- assert len(formats) == 1, "different formats?"
- format = formats.pop()
- paths = [path for mp in models_plus for path in mp.paths]
- # Use the first non-None vocab, if any.
- try:
- vocab = next(mp.vocab for mp in models_plus if mp.vocab is not None)
- except StopIteration:
- vocab = None
-
- if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
- # Transformers models put different tensors in different files, but
- # don't split individual tensors between files.
- model: LazyModel = {}
- for mp in models_plus:
- model.update(mp.model)
- else:
- model = merge_sharded([mp.model for mp in models_plus])
-
- return ModelPlus(model, paths, format, vocab) # pytype: disable=wrong-arg-types
-
-
-def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
- def load() -> Tensor:
- return lazy_tensor.load().permute(n_head, n_head_kv)
- return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
-
-
-def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
- def load() -> Tensor:
- return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
- s = lazy_tensor.shape.copy()
- s[0] = s[0] // 3
- return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
-
-
-def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
- def load() -> Tensor:
- return lazy_tensor.load().part(n_part)
- s = lazy_tensor.shape.copy()
- s[0] = s[0] // 3
- return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
-
-
-def pack_experts_lazy(lazy_tensors: list[LazyTensor]) -> LazyTensor:
- def load() -> Tensor:
- tensors = [lazy_tensor.load() for lazy_tensor in lazy_tensors]
- return UnquantizedTensor(np.array([tensor.ndarray for tensor in tensors]))
- s = lazy_tensors[0].shape.copy()
- s.insert(0, len(lazy_tensors))
- return LazyTensor(load, s, lazy_tensors[0].data_type, 'pack_experts ' + ' | '.join(lt.description for lt in lazy_tensors))
-
-
-# Functionality that simulates `torch.load` but where individual tensors are
-# only loaded into memory on demand, not all at once.
-# PyTorch can't do this natively as of time of writing:
-# - https://github.com/pytorch/pytorch/issues/64327
-# This allows us to de-shard without multiplying RAM usage, and also
-# conveniently drops the PyTorch dependency (though we still need numpy).
-
-
-@dataclass
-class LazyStorageKind:
- data_type: DataType
-
-
-@dataclass
-class LazyStorage:
- load: Callable[[int, int], NDArray]
- kind: LazyStorageKind
- description: str
-
-
-class LazyUnpickler(pickle.Unpickler):
- def __init__(self, fp: IO[bytes], data_base_path: str, zip_file: zipfile.ZipFile):
- super().__init__(fp)
- self.data_base_path = data_base_path
- self.zip_file = zip_file
-
- def persistent_load(self, pid: Any) -> Any:
- assert pid[0] == 'storage'
- assert isinstance(pid[1], LazyStorageKind)
- data_type = pid[1].data_type
- filename_stem = pid[2]
- filename = f'{self.data_base_path}/{filename_stem}'
- info = self.zip_file.getinfo(filename)
-
- def load(offset: int, elm_count: int) -> NDArray:
- dtype = data_type.dtype
- with self.zip_file.open(info) as fp:
- fp.seek(offset * dtype.itemsize)
- size = elm_count * dtype.itemsize
- data = fp.read(size)
- assert len(data) == size
- return np.frombuffer(data, dtype)
- description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
- return LazyStorage(load=load, kind=pid[1], description=description)
-
- @staticmethod
- def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
- requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
- assert isinstance(storage, LazyStorage)
-
- def load() -> UnquantizedTensor:
- elm_count = stride[0] * size[0]
- return UnquantizedTensor(storage.load(storage_offset, elm_count).reshape(size))
- description = f'pickled storage_offset={storage_offset} in {storage.description}'
- return LazyTensor(load, list(size), storage.kind.data_type, description)
-
- @staticmethod
- def rebuild_from_type_v2(func, new_type, args, state):
- return func(*args)
-
- CLASSES = {
- # getattr used here as a workaround for mypy not being smart enough to determine
- # the staticmethods have a __func__ attribute.
- ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
- ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
- ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
- ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
- ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
- ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
- ('torch', 'Tensor'): LazyTensor,
- }
-
- def find_class(self, module: str, name: str) -> Any:
- if not module.startswith('torch'):
- return super().find_class(module, name)
- return self.CLASSES[(module, name)]
-
-
-def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
- zf = zipfile.ZipFile(outer_fp)
- pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')]
- assert len(pickle_paths) == 1, pickle_paths
- pickle_fp = zf.open(pickle_paths[0], 'r')
- unpickler = LazyUnpickler(pickle_fp,
- data_base_path=pickle_paths[0][:-4],
- zip_file=zf)
- model = unpickler.load()
- if 'model' in model: model = model['model']
- as_dict = dict(model.items())
- return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)
-
-
-def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
- header_size, = struct.unpack(' LazyTensor:
- data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
- numpy_dtype = data_type.dtype
- shape: list[int] = info['shape']
- begin, end = info['data_offsets']
- assert 0 <= begin <= end <= len(byte_buf)
- assert end - begin == math.prod(shape) * numpy_dtype.itemsize
- buf = byte_buf[begin:end]
-
- def load() -> UnquantizedTensor:
- return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
- description = f'safetensors begin={begin} end={end} type={data_type} path={path}'
- return LazyTensor(load, shape, data_type, description)
- model = {name: convert(info) for (name, info) in header.items() if name != '__metadata__'}
- return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None)
-
-
-def must_read(fp: IO[bytes], length: int) -> bytes:
- ret = fp.read(length)
- if len(ret) < length:
- raise EOFError("unexpectedly reached end of file")
- return ret
-
-
-@functools.lru_cache(maxsize=None)
-def lazy_load_file(path: Path) -> ModelPlus:
- fp = open(path, 'rb')
- first8 = fp.read(8)
- fp.seek(0)
- if first8[:2] == b'PK':
- # A zip file, i.e. PyTorch format
- return lazy_load_torch_file(fp, path)
- elif struct.unpack(' Iterable[Out]:
- '''Parallel map, but with backpressure. If the caller doesn't call `next`
- fast enough, this will stop calling `func` at some point rather than
- letting results pile up in memory. Specifically, there is a max of one
- output value buffered per thread.'''
- if concurrency < 2:
- yield from map(func, iterable)
- # Not reached.
- iterable = iter(iterable)
- executor_class: type[ThreadPoolExecutor] | type[ProcessPoolExecutor]
- if use_processpool_executor:
- executor_class = ProcessPoolExecutor
- else:
- executor_class = ThreadPoolExecutor
- with executor_class(max_workers=max_workers) as executor:
- futures: list[concurrent.futures.Future[Out]] = []
- done = False
- for _ in range(concurrency):
- try:
- futures.append(executor.submit(func, next(iterable)))
- except StopIteration:
- done = True
- break
-
- while futures:
- result = futures.pop(0).result()
- while not done and len(futures) < concurrency:
- try:
- futures.append(executor.submit(func, next(iterable)))
- except StopIteration:
- done = True
- break
- yield result
-
-
-def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) -> None:
- # Handle special case where the model's vocab size is not set
- if params.n_vocab == -1:
- raise ValueError(
- "The model's vocab size is set to -1 in params.json. Please update it manually."
- + (f" Maybe {vocab.vocab_size}?" if isinstance(vocab, Vocab) else ""),
- )
- if not isinstance(vocab, Vocab):
- return # model has no vocab
-
- # Check for a vocab size mismatch
- if params.n_vocab == vocab.vocab_size:
- print("Ignoring added_tokens.json since model matches vocab size without it.")
- return
-
- if pad_vocab and params.n_vocab > vocab.vocab_size:
- pad_count = params.n_vocab - vocab.vocab_size
- print(
- f"Padding vocab with {pad_count} token(s) - through "
- )
- for i in range(1, pad_count + 1):
- vocab.added_tokens_dict[f""] = -1
- vocab.added_tokens_list.append(f"")
- vocab.vocab_size = params.n_vocab
- return
-
- msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer} has {vocab.vocab_size})."
- if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
- msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
- if vocab.vocab_size < params.n_vocab:
- msg += " Add the --pad-vocab option and try again."
-
- raise ValueError(msg)
-
-
-class OutputFile:
- def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
- self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
-
- def add_meta_arch(self, params: Params) -> None:
- name = "LLaMA"
-
- # TODO: better logic to determine model name
- if params.n_ctx == 4096:
- name = "LLaMA v2"
- elif params.path_model is not None:
- name = str(params.path_model.parent).split('/')[-1]
-
- self.gguf.add_name (name)
- self.gguf.add_vocab_size (params.n_vocab)
- self.gguf.add_context_length (params.n_ctx)
- self.gguf.add_embedding_length (params.n_embd)
- self.gguf.add_block_count (params.n_layer)
- self.gguf.add_feed_forward_length (params.n_ff)
- self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
- self.gguf.add_head_count (params.n_head)
- self.gguf.add_head_count_kv (params.n_head_kv)
-
- if params.n_experts:
- self.gguf.add_expert_count(params.n_experts)
-
- if params.n_experts_used:
- self.gguf.add_expert_used_count(params.n_experts_used)
-
- if params.f_norm_eps:
- self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
- else:
- raise ValueError('f_norm_eps is None')
-
- if params.f_rope_freq_base is not None:
- self.gguf.add_rope_freq_base(params.f_rope_freq_base)
-
- if params.rope_scaling_type:
- assert params.f_rope_scale is not None
- self.gguf.add_rope_scaling_type(params.rope_scaling_type)
- self.gguf.add_rope_scaling_factor(params.f_rope_scale)
-
- if params.n_orig_ctx is not None:
- self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)
-
- if params.rope_finetuned is not None:
- self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)
-
- if params.ftype is not None:
- self.gguf.add_file_type(params.ftype)
-
- def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
- tokens = []
- scores = []
- toktypes = []
-
- # NOTE: `all_tokens` returns the base vocabulary and added tokens
- for text, score, toktype in vocab.all_tokens():
- tokens.append(text)
- scores.append(score)
- toktypes.append(toktype)
-
- assert len(tokens) == vocab.vocab_size
-
- return tokens, scores, toktypes
-
- def add_meta_vocab(self, vocab: Vocab) -> None:
- # Ensure that tokenizer_model is added to the GGUF model
- self.gguf.add_tokenizer_model(vocab.tokenizer_model)
-
- # Extract model vocabulary for model conversion
- tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
-
- # Add extracted token information for model conversion
- self.gguf.add_token_list(tokens)
- self.gguf.add_token_scores(scores)
- self.gguf.add_token_types(toktypes)
-
- def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None:
- svocab.add_to_gguf(self.gguf)
-
- def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
- n_elements = int(np.prod(tensor.shape))
- raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
- data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
- data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
- self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype)
-
- def write_meta(self) -> None:
- self.gguf.write_header_to_file()
- self.gguf.write_kv_data_to_file()
-
- def write_tensor_info(self) -> None:
- self.gguf.write_ti_data_to_file()
-
- def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None:
- ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
- if ftype == GGMLFileType.MostlyQ8_0:
- ndarrays = bounded_parallel_map(
- OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
- use_processpool_executor=True,
- )
- else:
- ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
-
- start = time.time()
- for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
- elapsed = time.time() - start
- size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
- padi = len(str(len(model)))
- print(
- f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
- )
- self.gguf.write_tensor_data(ndarray)
-
- def close(self) -> None:
- self.gguf.close()
-
- @staticmethod
- def write_vocab_only(
- fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
- endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
- ) -> None:
- check_vocab_size(params, vocab, pad_vocab=pad_vocab)
-
- of = OutputFile(fname_out, endianess=endianess)
-
- # meta data
- of.add_meta_arch(params)
- of.add_meta_vocab(vocab)
- of.add_meta_special_vocab(svocab)
-
- of.write_meta()
-
- of.close()
-
- @staticmethod
- def do_item(item: tuple[str, LazyTensor]) -> tuple[DataType, NDArray]:
- name, lazy_tensor = item
- tensor = lazy_tensor.load().to_ggml()
- return (lazy_tensor.data_type, tensor.ndarray)
-
- @staticmethod
- def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
- dt, arr = item
- if not isinstance(dt, QuantizedDataType):
- return arr
- return dt.quantize(arr)
-
- @staticmethod
- def write_all(
- fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
- concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
- pad_vocab: bool = False,
- ) -> None:
- check_vocab_size(params, vocab, pad_vocab=pad_vocab)
-
- of = OutputFile(fname_out, endianess=endianess)
-
- # meta data
- of.add_meta_arch(params)
- if isinstance(vocab, Vocab):
- of.add_meta_vocab(vocab)
- of.add_meta_special_vocab(svocab)
- else: # NoVocab
- of.gguf.add_tokenizer_model(vocab.tokenizer_model)
-
- # tensor info
- for name, lazy_tensor in model.items():
- of.add_tensor_info(name, lazy_tensor)
-
- of.write_meta()
- of.write_tensor_info()
-
- # tensor data
- of.write_tensor_data(ftype, model, concurrency)
-
- of.close()
-
-
-def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
- wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
-
- if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
- return GGMLFileType.AllF32
- if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
- return GGMLFileType.MostlyF16
- if output_type_str == "q8_0":
- return GGMLFileType.MostlyQ8_0
-
- name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()}
-
- raise ValueError(f"Unexpected combination of types: {name_to_type}")
-
-
-def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
- return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
- for (name, tensor) in model.items()}
-
-
-def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
- tmap = gguf.TensorNameMap(ARCH, params.n_layer)
- should_skip = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
-
- tmp = model
-
- # merge experts into one tensor
- if params.n_experts and params.n_experts > 0:
- for i_l in range(params.n_layer):
- for w in range(1, 4):
- experts = []
- for e in range(params.n_experts):
- if f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight" in model:
- experts.append(model[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"])
- del tmp[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"]
- elif f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight" in model:
- experts.append(model[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"])
- del tmp[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"]
- else:
- raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight")
- tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts)
-
- # HF models permut or pack some of the tensors, so we need to undo that
- for i in itertools.count():
- if f"model.layers.{i}.self_attn.q_proj.weight" in model:
- print(f"Permuting layer {i}")
- tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
- tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
- # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
- elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
- print(f"Unpacking and permuting layer {i}")
- tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
- tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
- tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
- del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
- else:
- break
-
- out: LazyModel = {}
- for name, lazy_tensor in model.items():
- tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
- if name_new is None:
- if skip_unknown:
- print(f"Unexpected tensor name: {name} - skipping")
- continue
- raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
-
- if tensor_type in should_skip:
- print(f"skipping tensor {name_new}")
- continue
-
- print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
- out[name_new] = lazy_tensor
-
- return out
-
-
-def nth_multifile_path(path: Path, n: int) -> Path | None:
- '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
- the nth path in the model.
- '''
- # Support the following patterns:
- patterns = [
- # - x.00.pth, x.01.pth, etc.
- (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
- # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
- (r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'),
- # x.bin, x.bin.1, etc.
- (r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}')
- ]
- for regex, replacement in patterns:
- if re.search(regex, path.name):
- new_path = path.with_name(re.sub(regex, replacement, path.name))
- if new_path.exists():
- return new_path
- return None
-
-
-def find_multifile_paths(path: Path) -> list[Path]:
- '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
- the whole list of paths in the model.
- '''
- ret: list[Path] = []
- for i in itertools.count():
- nth_path = nth_multifile_path(path, i)
- if nth_path is None:
- break
- ret.append(nth_path)
- if not ret:
- # No matches. This should only happen if the file was named, e.g.,
- # foo.0, and there was no file named foo. Oh well, try to process it
- # as a single file.
- return [path]
- return ret
-
-
-def load_some_model(path: Path) -> ModelPlus:
- '''Load a model of any supported format.'''
- # Be extra-friendly and accept either a file or a directory:
- if path.is_dir():
- # Check if it's a set of safetensors files first
- globs = ["model-00001-of-*.safetensors", "model.safetensors", "consolidated.safetensors"]
- files = [file for glob in globs for file in path.glob(glob)]
- if not files:
- # Try the PyTorch patterns too, with lower priority
- globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
- files = [file for glob in globs for file in path.glob(glob)]
- if not files:
- raise FileNotFoundError(f"Can't find model in directory {path}")
- if len(files) > 1:
- raise ValueError(f"Found multiple models in {path}, not sure which to pick: {files}")
- path = files[0]
-
- paths = find_multifile_paths(path)
- models_plus: list[ModelPlus] = []
- for path in paths:
- print(f"Loading model file {path}")
- models_plus.append(lazy_load_file(path))
-
- model_plus = merge_multifile_models(models_plus)
- return model_plus
-
-
-class VocabFactory:
- _VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, LlamaHfVocab]
-
- def __init__(self, path: Path):
- self.path = path
-
- def _create_special_vocab(self, vocab: BaseVocab, model_parent_path: Path) -> gguf.SpecialVocab:
- load_merges = vocab.name == "bpe"
- n_vocab = vocab.vocab_size if isinstance(vocab, Vocab) else None
- return gguf.SpecialVocab(
- model_parent_path,
- load_merges=load_merges,
- special_token_types=None, # Predetermined or passed as a parameter
- n_vocab=n_vocab,
- )
-
- def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
- vocab_classes: dict[str, type[Vocab]] = {cls.name: cls for cls in self._VOCAB_CLASSES}
- selected_vocabs: dict[str, type[Vocab]] = {}
- for vtype in vocab_types:
- try:
- selected_vocabs[vtype] = vocab_classes[vtype]
- except KeyError:
- raise ValueError(f"Unsupported vocabulary type {vtype}") from None
-
- for vtype, cls in selected_vocabs.items():
- try:
- vocab = cls(self.path)
- break
- except FileNotFoundError:
- pass # ignore unavailable tokenizers
- else:
- raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
-
- print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
- return vocab
-
- def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
- vocab: BaseVocab
- if vocab_types is None:
- vocab = NoVocab()
- else:
- vocab = self._create_vocab_by_path(vocab_types)
- # FIXME: Respect --vocab-dir?
- special_vocab = self._create_special_vocab(
- vocab,
- model_parent_path,
- )
- return vocab, special_vocab
-
-
-def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
- namestr = {
- GGMLFileType.AllF32: "f32",
- GGMLFileType.MostlyF16: "f16",
- GGMLFileType.MostlyQ8_0:"q8_0",
- }[file_type]
- ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
- if ret in model_paths:
- sys.stderr.write(
- f"Error: Default output path ({ret}) would overwrite the input. "
- "Please explicitly specify a path using --outfile.\n")
- sys.exit(1)
- return ret
-
-
-def do_dump_model(model_plus: ModelPlus) -> None:
- print(f"model_plus.paths = {model_plus.paths!r}")
- print(f"model_plus.format = {model_plus.format!r}")
- print(f"model_plus.vocab = {model_plus.vocab!r}")
- for name, lazy_tensor in model_plus.model.items():
- print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
-
-
-def main(args_in: list[str] | None = None) -> None:
- output_choices = ["f32", "f16"]
- if np.uint32(1) == np.uint32(1).newbyteorder("<"):
- # We currently only support Q8_0 output on little endian systems.
- output_choices.append("q8_0")
- parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file")
- parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
- parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
- parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
- parser.add_argument("--no-vocab", action="store_true", help="store model without the vocab")
- parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
- parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
- parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
- parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
- parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
- parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
- parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
- parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
- parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
- parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
-
- args = parser.parse_args(args_in)
- if args.no_vocab and args.vocab_only:
- raise ValueError("--vocab-only does not make sense with --no-vocab")
-
- if args.dump_single:
- model_plus = lazy_load_file(args.model)
- do_dump_model(model_plus)
- return
-
- if not args.vocab_only:
- model_plus = load_some_model(args.model)
- else:
- model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
-
- if args.dump:
- do_dump_model(model_plus)
- return
- endianess = gguf.GGUFEndian.LITTLE
- if args.big_endian:
- endianess = gguf.GGUFEndian.BIG
-
- params = Params.load(model_plus)
- if params.n_ctx == -1:
- if args.ctx is None:
- msg = """\
- The model doesn't have a context size, and you didn't specify one with --ctx
- Please specify one with --ctx:
- - LLaMA v1: --ctx 2048
- - LLaMA v2: --ctx 4096"""
- parser.error(textwrap.dedent(msg))
- params.n_ctx = args.ctx
-
- if args.outtype:
- params.ftype = {
- "f32": GGMLFileType.AllF32,
- "f16": GGMLFileType.MostlyF16,
- "q8_0": GGMLFileType.MostlyQ8_0,
- }[args.outtype]
-
- print(f"params = {params}")
-
- model_parent_path = model_plus.paths[0].parent
- vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
- vocab_factory = VocabFactory(vocab_path)
- vocab_types = None if args.no_vocab else args.vocab_type.split(",")
- vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path)
-
- if args.vocab_only:
- assert isinstance(vocab, Vocab)
- if not args.outfile:
- raise ValueError("need --outfile if using --vocab-only")
- outfile = args.outfile
- OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
- endianess=endianess, pad_vocab=args.pad_vocab)
- print(f"Wrote {outfile}")
- return
-
- if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
- vocab = model_plus.vocab
-
- print(f"Vocab info: {vocab}")
- print(f"Special vocab info: {special_vocab}")
-
- model = model_plus.model
- model = convert_model_names(model, params, args.skip_unknown)
- ftype = pick_output_type(model, args.outtype)
- model = convert_to_output_type(model, ftype)
- outfile = args.outfile or default_outfile(model_plus.paths, ftype)
-
- params.ftype = ftype
- print(f"Writing {outfile}, format {ftype}")
-
- OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
- concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
- print(f"Wrote {outfile}")
-
-
-if __name__ == '__main__':
- main()
diff --git a/ggml-alloc.c b/ggml-alloc.c
deleted file mode 100644
index 1fbd376edf4..00000000000
--- a/ggml-alloc.c
+++ /dev/null
@@ -1,985 +0,0 @@
-#include "ggml-alloc.h"
-#include "ggml-backend-impl.h"
-#include "ggml.h"
-#include "ggml-impl.h"
-#include
-#include
-#include
-#include
-#include
-#include
-
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-#define MAX_FREE_BLOCKS 256
-
-//#define GGML_ALLOCATOR_DEBUG
-
-//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
-#define AT_PRINTF(...)
-
-
-static bool ggml_is_view(const struct ggml_tensor * t) {
- return t->view_src != NULL;
-}
-
-static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
- if (a->type != b->type) {
- return false;
- }
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
- if (a->ne[i] != b->ne[i]) {
- return false;
- }
- if (a->nb[i] != b->nb[i]) {
- return false;
- }
- }
- return true;
-}
-
-static bool ggml_op_can_inplace(enum ggml_op op) {
- switch (op) {
- case GGML_OP_SCALE:
- case GGML_OP_DIAG_MASK_ZERO:
- case GGML_OP_DIAG_MASK_INF:
- case GGML_OP_ADD:
- case GGML_OP_ADD1:
- case GGML_OP_SUB:
- case GGML_OP_MUL:
- case GGML_OP_DIV:
- case GGML_OP_SQR:
- case GGML_OP_SQRT:
- case GGML_OP_LOG:
- case GGML_OP_UNARY:
- case GGML_OP_ROPE:
- case GGML_OP_RMS_NORM:
- case GGML_OP_SOFT_MAX:
- return true;
-
- default:
- return false;
- }
-}
-
-static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
- assert(alignment && !(alignment & (alignment - 1))); // power of 2
- size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
- return offset + align;
-}
-
-// tallocr
-
-struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer) {
- void * base = ggml_backend_buffer_get_base(buffer);
- size_t align = ggml_backend_buffer_get_alignment(buffer);
-
- assert(align && !(align & (align - 1))); // power of 2
-
- struct ggml_tallocr talloc = (struct ggml_tallocr) {
- /*.buffer = */ buffer,
- /*.base = */ base,
- /*.alignment = */ align,
- /*.offset = */ aligned_offset(base, 0, align),
- };
- return talloc;
-}
-
-void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor) {
- size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
- size = GGML_PAD(size, talloc->alignment);
-
- if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
- fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
- __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
- GGML_ASSERT(!"not enough space in the buffer");
- return;
- }
-
- void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
- talloc->offset += size;
-
- assert(((uintptr_t)addr % talloc->alignment) == 0);
-
- ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
-}
-
-// dynamic tensor allocator
-
-struct free_block {
- size_t offset;
- size_t size;
-};
-
-struct ggml_dyn_tallocr {
- size_t alignment;
- int n_free_blocks;
- struct free_block free_blocks[MAX_FREE_BLOCKS];
- size_t max_size;
-
-#ifdef GGML_ALLOCATOR_DEBUG
- struct {
- const struct ggml_tensor * tensor;
- size_t offset;
- } allocated_tensors[1024];
-#endif
-};
-
-#ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
- for (int i = 0; i < 1024; i++) {
- if (alloc->allocated_tensors[i].tensor == NULL) {
- alloc->allocated_tensors[i].tensor = tensor;
- alloc->allocated_tensors[i].offset = offset;
- return;
- }
- }
- GGML_ASSERT(!"out of allocated_tensors");
-}
-static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
- for (int i = 0; i < 1024; i++) {
- if (alloc->allocated_tensors[i].offset == offset) {
- alloc->allocated_tensors[i].tensor = NULL;
- return;
- }
- }
- fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
- GGML_ASSERT(!"tensor not found");
-}
-#endif
-
-static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
- size = aligned_offset(NULL, size, alloc->alignment);
-
- AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
-
- size_t max_avail = 0;
-
- // find the best fitting free block besides the last block
- int best_fit_block = -1;
- size_t best_fit_size = SIZE_MAX;
- for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
- struct free_block * block = &alloc->free_blocks[i];
- max_avail = MAX(max_avail, block->size);
- if (block->size >= size && block->size <= best_fit_size) {
- best_fit_block = i;
- best_fit_size = block->size;
- }
- }
-
- if (best_fit_block == -1) {
- // the last block is our last resort
- struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
- max_avail = MAX(max_avail, block->size);
- if (block->size >= size) {
- best_fit_block = alloc->n_free_blocks - 1;
- } else {
- // this should never happen
- fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
- __func__, size, max_avail);
- GGML_ASSERT(!"not enough space in the buffer");
- GGML_UNREACHABLE();
- }
- }
-
- struct free_block * block = &alloc->free_blocks[best_fit_block];
- size_t offset = block->offset;
- block->offset = offset + size;
- block->size -= size;
- if (block->size == 0) {
- // remove block if empty
- alloc->n_free_blocks--;
- for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
- alloc->free_blocks[j] = alloc->free_blocks[j+1];
- }
- }
-
- AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
-
-#ifdef GGML_ALLOCATOR_DEBUG
- add_allocated_tensor(alloc, offset, tensor);
- size_t cur_max = offset + size;
- if (cur_max > alloc->max_size) {
- // sort allocated_tensors by offset
- for (int i = 0; i < 1024; i++) {
- for (int j = i + 1; j < 1024; j++) {
- if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
- const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
- size_t tmp_offset = alloc->allocated_tensors[i].offset;
- alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
- alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
- alloc->allocated_tensors[j].tensor = tmp_tensor;
- alloc->allocated_tensors[j].offset = tmp_offset;
- }
- }
- }
- fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
- for (int i = 0; i < 1024; i++) {
- if (alloc->allocated_tensors[i].tensor) {
- fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
- alloc->allocated_tensors[i].offset,
- alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
- ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
- }
- }
- fprintf(stderr, "\n");
- }
-#endif
-
- alloc->max_size = MAX(alloc->max_size, offset + size);
-
- return offset;
-
- GGML_UNUSED(tensor);
-}
-
-// this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
- size = aligned_offset(NULL, size, alloc->alignment);
-
- AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
-
-#ifdef GGML_ALLOCATOR_DEBUG
- remove_allocated_tensor(alloc, offset, tensor);
-#endif
-
- // see if we can merge with an existing block
- for (int i = 0; i < alloc->n_free_blocks; i++) {
- struct free_block * block = &alloc->free_blocks[i];
- // check if ptr is at the end of the block
- if (block->offset + block->size == offset) {
- block->size += size;
- // check if we can merge with the next block
- if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
- block->size += alloc->free_blocks[i+1].size;
- alloc->n_free_blocks--;
- for (int j = i+1; j < alloc->n_free_blocks; j++) {
- alloc->free_blocks[j] = alloc->free_blocks[j+1];
- }
- }
- return;
- }
- // check if ptr is at the beginning of the block
- if (offset + size == block->offset) {
- block->offset = offset;
- block->size += size;
- // check if we can merge with the previous block
- if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
- alloc->free_blocks[i-1].size += block->size;
- alloc->n_free_blocks--;
- for (int j = i; j < alloc->n_free_blocks; j++) {
- alloc->free_blocks[j] = alloc->free_blocks[j+1];
- }
- }
- return;
- }
- }
- // otherwise, add a new block
- GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
- // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
- int insert_pos = 0;
- while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
- insert_pos++;
- }
- // shift all blocks from insert_pos onward to make room for the new block
- for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
- alloc->free_blocks[i] = alloc->free_blocks[i-1];
- }
- // insert the new block
- alloc->free_blocks[insert_pos].offset = offset;
- alloc->free_blocks[insert_pos].size = size;
- alloc->n_free_blocks++;
-
- GGML_UNUSED(tensor);
-}
-
-static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
- alloc->n_free_blocks = 1;
- alloc->free_blocks[0].offset = 0;
- alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
- alloc->max_size = 0;
-}
-
-static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
- struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
-
- *alloc = (struct ggml_dyn_tallocr) {
- /*.alignment = */ alignment,
- /*.n_free_blocks = */ 0,
- /*.free_blocks = */ {{0}},
- /*.max_size = */ 0,
-#ifdef GGML_ALLOCATOR_DEBUG
- /*.allocated_tensors = */ {{0}},
-#endif
- };
-
- ggml_dyn_tallocr_reset(alloc);
-
- return alloc;
-}
-
-static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
- free(alloc);
-}
-
-static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
- return alloc->max_size;
-}
-
-
-/////////////////////////////////////
-
-// graph allocator
-
-struct hash_node {
- int n_children;
- int n_views;
- int buffer_id;
- size_t offset; // offset within the buffer
- bool allocated;
-};
-
-struct tensor_alloc {
- size_t offset;
- size_t size_max; // 0 = pre-allocated, unused, or view
-};
-
-struct leaf_alloc {
- int buffer_id;
- struct tensor_alloc leaf;
-};
-
-struct node_alloc {
- int buffer_id;
- struct tensor_alloc dst;
- struct tensor_alloc src[GGML_MAX_SRC];
-};
-
-struct ggml_gallocr {
- ggml_backend_buffer_type_t * bufts; // [n_buffers]
- ggml_backend_buffer_t * buffers; // [n_buffers]
- struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
- int n_buffers;
-
- struct ggml_hash_set hash_set;
- struct hash_node * hash_values; // [hash_set.size]
-
- struct node_alloc * node_allocs; // [n_nodes]
- int n_nodes;
-
- struct leaf_alloc * leaf_allocs; // [n_leafs]
- int n_leafs;
-};
-
-ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
- ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr));
- GGML_ASSERT(galloc != NULL);
-
- galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
- GGML_ASSERT(galloc->bufts != NULL);
-
- galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs);
- GGML_ASSERT(galloc->buffers != NULL);
-
- galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
- GGML_ASSERT(galloc->buf_tallocs != NULL);
-
- for (int i = 0; i < n_bufs; i++) {
- galloc->bufts[i] = bufts[i];
- galloc->buffers[i] = NULL;
- size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
- galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
- }
- galloc->n_buffers = n_bufs;
-
- return galloc;
-}
-
-ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
- return ggml_gallocr_new_n(&buft, 1);
-}
-
-void ggml_gallocr_free(ggml_gallocr_t galloc) {
- if (galloc == NULL) {
- return;
- }
-
- for (int i = 0; i < galloc->n_buffers; i++) {
- if (galloc->buffers != NULL) {
- ggml_backend_buffer_free(galloc->buffers[i]);
- }
- if (galloc->buf_tallocs != NULL) {
- ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
- }
- }
-
- free(galloc->hash_set.keys);
- free(galloc->hash_values);
- free(galloc->bufts);
- free(galloc->buffers);
- free(galloc->buf_tallocs);
- free(galloc->node_allocs);
- free(galloc->leaf_allocs);
- free(galloc);
-}
-
-typedef struct ggml_gallocr * ggml_gallocr_t;
-
-static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
- size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
- return &galloc->hash_values[i];
-}
-
-static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
- return ggml_gallocr_hash_get(galloc, t)->allocated;
-}
-
-static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
- struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
- hn->buffer_id = buffer_id;
- hn->offset = offset;
- hn->allocated = true;
-}
-
-static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
- return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
-}
-
-static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
- struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-
- if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
- hn->allocated = true;
- assert(hn->offset == 0);
-
- // try to reuse a parent's buffer (inplace)
- if (ggml_op_can_inplace(node->op)) {
- for (int i = 0; i < GGML_MAX_SRC; i++) {
- struct ggml_tensor * parent = node->src[i];
- if (parent == NULL) {
- continue;
- }
-
- // if the node's data is external, then we cannot re-use it
- if (!ggml_gallocr_is_own(galloc, parent)) {
- AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
- continue;
- }
-
- // outputs cannot be reused
- if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
- AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
- continue;
- }
-
- if (!ggml_are_same_layout(node, parent)) {
- AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
- continue;
- }
-
- struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
- if (p_hn->n_children == 1 && p_hn->n_views == 0) {
- if (ggml_is_view(parent)) {
- struct ggml_tensor * view_src = parent->view_src;
- struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
- if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
- AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
- assert(view_src_hn->offset == p_hn->offset);
- hn->buffer_id = p_hn->buffer_id;
- hn->offset = p_hn->offset;
- p_hn->allocated = false; // avoid freeing the parent
- view_src_hn->allocated = false;
- return;
- }
- } else {
- AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
- hn->buffer_id = p_hn->buffer_id;
- hn->offset = p_hn->offset;
- p_hn->allocated = false; // avoid freeing the parent
- return;
- }
- }
- }
- }
- // allocate tensor from the buffer
- struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
- ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
- size_t size = ggml_backend_buft_get_alloc_size(buft, node);
- size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
- hn->buffer_id = buffer_id;
- hn->offset = offset;
- return;
- }
-}
-
-static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
- // graph outputs are never freed
- if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
- AT_PRINTF("not freeing output %s\n", node->name);
- return;
- }
-
- struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
- ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
- struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
- size_t offset = hn->offset;
- size_t size = ggml_backend_buft_get_alloc_size(buft, node);
- ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
- hn->allocated = false;
-}
-
-static int get_node_buffer_id(const int * node_buffer_ids, int i) {
- return node_buffer_ids ? node_buffer_ids[i] : 0;
-}
-
-static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
- // clear hash tables
- memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
- memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
-
- // allocate leafs
- // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
- for (int i = 0; i < graph->n_leafs; i++) {
- struct ggml_tensor * leaf = graph->leafs[i];
- ggml_gallocr_allocate_node(galloc, leaf, get_node_buffer_id(leaf_buffer_ids, i));
- }
-
- // count number of children and views
- // allocate other graph inputs and leafs first to avoid overwriting them
- for (int i = 0; i < graph->n_nodes; i++) {
- struct ggml_tensor * node = graph->nodes[i];
-
- // TODO: better way to add external dependencies
- // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
- // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
- // itself is never used and should not be considered a dependency
- if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
- struct ggml_tensor * view_src = node->view_src;
- ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
- }
-
- if (node->flags & GGML_TENSOR_FLAG_INPUT) {
- ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
- }
-
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * src = node->src[j];
- if (src == NULL) {
- continue;
- }
-
- ggml_gallocr_hash_get(galloc, src)->n_children += 1;
-
- // allocate explicit inputs
- if (src->flags & GGML_TENSOR_FLAG_INPUT) {
- ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
- }
- }
- }
-
- // allocate tensors
- for (int i = 0; i < graph->n_nodes; i++) {
- struct ggml_tensor * node = graph->nodes[i];
- int buffer_id = get_node_buffer_id(node_buffer_ids, i);
-
- // allocate parents (only leafs need to be allocated at this point)
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- continue;
- }
- ggml_gallocr_allocate_node(galloc, parent, buffer_id);
- }
-
- // allocate node
- ggml_gallocr_allocate_node(galloc, node, buffer_id);
-
- AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- continue;
- }
- AT_PRINTF("%s", parent->name);
- if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
- AT_PRINTF(", ");
- }
- }
- AT_PRINTF("\n");
-
- // update parents
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- continue;
- }
- struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
- p_hn->n_children -= 1;
-
- AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
- parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
-
- if (p_hn->n_children == 0 && p_hn->n_views == 0) {
- if (ggml_is_view(parent)) {
- struct ggml_tensor * view_src = parent->view_src;
- struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
- view_src_hn->n_views -= 1;
- AT_PRINTF("view_src %s: %d children, %d views\n",
- view_src->name, view_src_hn->n_children, view_src_hn->n_views);
- if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
- ggml_gallocr_free_node(galloc, view_src, buffer_id);
- }
- }
- else if (p_hn->allocated) {
- ggml_gallocr_free_node(galloc, parent, buffer_id);
- }
- }
- AT_PRINTF("\n");
- }
- }
-}
-
-bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
- size_t hash_size = graph->visited_hash_table.size;
-
- // initialize hash table
- if (galloc->hash_set.size < hash_size) {
- free(galloc->hash_set.keys);
- free(galloc->hash_values);
- galloc->hash_set.size = hash_size;
- galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
- galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
- GGML_ASSERT(galloc->hash_set.keys != NULL);
- GGML_ASSERT(galloc->hash_values != NULL);
- } else {
- // reset hash table
- memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
- memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
- }
-
- // reset allocators
- for (int i = 0; i < galloc->n_buffers; i++) {
- ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
- }
-
- // allocate in hash table
- ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids);
-
- // set the node_allocs from the hash table
- if (galloc->n_nodes < graph->n_nodes) {
- free(galloc->node_allocs);
- galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc));
- GGML_ASSERT(galloc->node_allocs != NULL);
- }
- galloc->n_nodes = graph->n_nodes;
- for (int i = 0; i < graph->n_nodes; i++) {
- struct ggml_tensor * node = graph->nodes[i];
- struct node_alloc * node_alloc = &galloc->node_allocs[i];
- node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
- if (node->view_src || node->data) {
- node_alloc->dst.offset = SIZE_MAX;
- node_alloc->dst.size_max = 0;
- } else {
- struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
- node_alloc->dst.offset = hn->offset;
- node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
- }
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * src = node->src[j];
- if (!src || src->view_src || src->data) {
- node_alloc->src[j].offset = SIZE_MAX;
- node_alloc->src[j].size_max = 0;
- } else {
- struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
- node_alloc->src[j].offset = hn->offset;
- node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
- }
- }
- }
- if (galloc->n_leafs < graph->n_leafs) {
- free(galloc->leaf_allocs);
- galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0]));
- GGML_ASSERT(galloc->leaf_allocs != NULL);
- }
- galloc->n_leafs = graph->n_leafs;
- for (int i = 0; i < graph->n_leafs; i++) {
- struct ggml_tensor * leaf = graph->leafs[i];
- struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
- galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
- if (leaf->view_src || leaf->data) {
- galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
- galloc->leaf_allocs[i].leaf.size_max = 0;
- } else {
- galloc->leaf_allocs[i].leaf.offset = hn->offset;
- galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
- }
- }
-
- // reallocate buffers if needed
- for (int i = 0; i < galloc->n_buffers; i++) {
- size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
- size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
-
- // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
- if (new_size > cur_size || galloc->buffers[i] == NULL) {
-#ifndef NDEBUG
- fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
-#endif
- ggml_backend_buffer_free(galloc->buffers[i]);
- galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
- if (galloc->buffers[i] == NULL) {
- fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
- return false;
- }
- }
- }
-
- return true;
-}
-
-bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
- return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
-}
-
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
- assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
-
- if (tensor->view_src != NULL) {
- if (tensor->buffer == NULL) {
- assert(tensor_alloc->offset == SIZE_MAX);
- if (tensor->view_src->buffer == NULL) {
- // this tensor was allocated without ggml-backend
- return;
- }
- ggml_backend_view_init(galloc->buffers[buffer_id], tensor);
- }
- } else {
- if (tensor->data == NULL) {
- assert(tensor_alloc->offset != SIZE_MAX);
- assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
- void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
- void * addr = (char *)base + tensor_alloc->offset;
- ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
- } else {
- if (tensor->buffer == NULL) {
- // this tensor was allocated without ggml-backend
- return;
- }
- }
- }
-}
-
-static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
- ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
- size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
- return talloc->size_max >= node_size;
-}
-
-static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
- if (galloc->n_nodes != graph->n_nodes) {
-#ifndef NDEBUG
- fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
-#endif
- return true;
- }
-
- if (galloc->n_leafs != graph->n_leafs) {
-#ifndef NDEBUG
- fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
-#endif
- return true;
- }
-
- for (int i = 0; i < graph->n_nodes; i++) {
- struct ggml_tensor * node = graph->nodes[i];
- struct node_alloc * node_alloc = &galloc->node_allocs[i];
-
- if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
-#ifndef NDEBUG
- fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
-#endif
- return true;
- }
-
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * src = node->src[j];
- if (src == NULL) {
- continue;
- }
- if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
-#ifndef NDEBUG
- fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
-#endif
- return true;
- }
- }
- }
-
- return false;
-}
-
-bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
- if (ggml_gallocr_needs_realloc(galloc, graph)) {
- if (galloc->n_buffers == 1) {
-#ifndef NDEBUG
- fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
-#endif
- if (!ggml_gallocr_reserve(galloc, graph)) {
- return false;
- }
- } else {
-#ifndef NDEBUG
- fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
-#endif
- return false;
- }
- }
-
- // reset buffers
- for (int i = 0; i < galloc->n_buffers; i++) {
- if (galloc->buffers[i] != NULL) {
- ggml_backend_buffer_reset(galloc->buffers[i]);
- }
- }
-
- // allocate the graph tensors from the previous assignments
- // leafs
- for (int i = 0; i < graph->n_leafs; i++) {
- struct ggml_tensor * leaf = graph->leafs[i];
- struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
- ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
- }
- // nodes
- for (int i = 0; i < graph->n_nodes; i++) {
- struct ggml_tensor * node = graph->nodes[i];
- struct node_alloc * node_alloc = &galloc->node_allocs[i];
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * src = node->src[j];
- if (src == NULL) {
- continue;
- }
- ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
- }
- ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
- }
-
- return true;
-}
-
-size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
- GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
-
- if (galloc->buffers[buffer_id] == NULL) {
- return 0;
- }
- return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
-}
-
-// utils
-
-static bool alloc_tensor_range(struct ggml_context * ctx,
- struct ggml_tensor * first, struct ggml_tensor * last,
- ggml_backend_buffer_type_t buft, size_t size,
- ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
- ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
- if (buffer == NULL) {
-#ifndef NDEBUG
- fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
-#endif
- for (size_t i = 0; i < *n_buffers; i++) {
- ggml_backend_buffer_free(*buffers[i]);
- }
- free(*buffers);
- return false;
- }
-
- struct ggml_tallocr tallocr = ggml_tallocr_new(buffer);
-
- for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
- if (t->data == NULL) {
- if (t->view_src == NULL) {
- ggml_tallocr_alloc(&tallocr, t);
- } else if (t->buffer == NULL) {
- ggml_backend_view_init(buffer, t);
- }
- } else {
- if (t->view_src != NULL && t->buffer == NULL) {
- // view of a pre-allocated tensor
- ggml_backend_view_init(buffer, t);
- }
- }
- }
-
- *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
- (*buffers)[(*n_buffers)++] = buffer;
-
- return true;
-}
-
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
- GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
-
- size_t alignment = ggml_backend_buft_get_alignment(buft);
- size_t max_size = ggml_backend_buft_get_max_size(buft);
-
- ggml_backend_buffer_t * buffers = NULL;
- size_t n_buffers = 0;
-
- size_t cur_buf_size = 0;
- struct ggml_tensor * first = ggml_get_first_tensor(ctx);
- for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
- size_t this_size = 0;
- if (t->data == NULL && t->view_src == NULL) {
- this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
- }
-
- if (this_size > max_size) {
- fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
- __func__, t->name,
- ggml_backend_buft_name(buft),
- this_size, max_size);
- for (size_t i = 0; i < n_buffers; i++) {
- ggml_backend_buffer_free(buffers[i]);
- }
- free(buffers);
- return NULL;
- }
-
- if ((cur_buf_size + this_size) > max_size) {
- // allocate tensors in the current buffer
- if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
- return NULL;
- }
- first = t;
- cur_buf_size = this_size;
- } else {
- cur_buf_size += this_size;
- }
- }
-
- // allocate remaining tensors
- if (cur_buf_size > 0) {
- if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
- return NULL;
- }
- }
-
- if (n_buffers == 0) {
-#ifndef NDEBUG
- fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
-#endif
- return NULL;
- }
-
- ggml_backend_buffer_t buffer;
- if (n_buffers == 1) {
- buffer = buffers[0];
- } else {
- buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
- }
- free(buffers);
- return buffer;
-}
-
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
- return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
-}
diff --git a/ggml-alloc.h b/ggml-alloc.h
deleted file mode 100644
index 434c13b34a9..00000000000
--- a/ggml-alloc.h
+++ /dev/null
@@ -1,76 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-typedef struct ggml_backend * ggml_backend_t;
-
-// Tensor allocator
-struct ggml_tallocr {
- ggml_backend_buffer_t buffer;
- void * base;
- size_t alignment;
- size_t offset;
-};
-
-GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
-GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
-
-// Graph allocator
-/*
- Example usage:
- ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
-
- // optional: create a worst-case graph and reserve the buffers to avoid reallocations
- ggml_gallocr_reserve(galloc, build_graph(max_batch));
-
- // allocate the graph
- struct ggml_cgraph * graph = build_graph(batch);
- ggml_gallocr_alloc_graph(galloc, graph);
-
- printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
-
- // evaluate the graph
- ggml_backend_graph_compute(backend, graph);
-*/
-
-// special tensor flags for use with the graph allocator:
-// ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
-// ggml_set_output(): output tensors are never freed and never overwritten
-
-typedef struct ggml_gallocr * ggml_gallocr_t;
-
-GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
-GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
-GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
-
-// pre-allocate buffers from a measure graph - does not allocate or modify the graph
-// call with a worst-case graph to avoid buffer reallocations
-// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
-// returns false if the buffer allocation failed
-GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-GGML_API bool ggml_gallocr_reserve_n(
- ggml_gallocr_t galloc,
- struct ggml_cgraph * graph,
- const int * node_buffer_ids,
- const int * leaf_buffer_ids);
-
-// automatic reallocation if the topology changes when using a single buffer
-// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
-GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-
-GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
-
-// Utils
-// Create a buffer and allocate all the tensors in a ggml_context
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h
deleted file mode 100644
index f121e1de420..00000000000
--- a/ggml-backend-impl.h
+++ /dev/null
@@ -1,141 +0,0 @@
-#pragma once
-
-// ggml-backend internal header
-
-#include "ggml-backend.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- //
- // Backend buffer
- //
-
- // buffer type
- typedef void * ggml_backend_buffer_type_context_t;
-
- struct ggml_backend_buffer_type_i {
- const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
- ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
- size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
- size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
- size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
- bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
- // check if tensor data is in host memory
- // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
- bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
- };
-
- struct ggml_backend_buffer_type {
- struct ggml_backend_buffer_type_i iface;
- ggml_backend_buffer_type_context_t context;
- };
-
- // buffer
- typedef void * ggml_backend_buffer_context_t;
-
- struct ggml_backend_buffer_i {
- const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
- void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
- void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
- void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
- void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
- bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
- void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
- void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
- };
-
- struct ggml_backend_buffer {
- struct ggml_backend_buffer_i iface;
- ggml_backend_buffer_type_t buft;
- ggml_backend_buffer_context_t context;
- size_t size;
- enum ggml_backend_buffer_usage usage;
- };
-
- GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
- ggml_backend_buffer_type_t buft,
- struct ggml_backend_buffer_i iface,
- ggml_backend_buffer_context_t context,
- size_t size);
-
- // do not use directly, use ggml_backend_tensor_copy instead
- bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
-
- // buffer that contains a collection of buffers
- GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
- GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
- GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-
- //
- // Backend
- //
-
- typedef void * ggml_backend_context_t;
-
- struct ggml_backend_i {
- const char * (*GGML_CALL get_name)(ggml_backend_t backend);
-
- void (*GGML_CALL free)(ggml_backend_t backend);
-
- // buffer allocation
- ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
-
- // (optional) asynchronous tensor data access
- void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
- bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
-
- // (optional) complete all pending operations
- void (*GGML_CALL synchronize)(ggml_backend_t backend);
-
- // compute graph with a plan (not used currently)
- ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
- void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
- // compute graph with a plan
- enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
- // compute graph without a plan (async)
- enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
- // check if the backend supports an operation
- bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-
- // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
- // these should be expensive operations with large batch sizes that may benefit from running on this backend
- // even if the weight has to be copied from the CPU temporarily
- bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-
- // (optional) event synchronization
- ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
- void (*GGML_CALL event_free) (ggml_backend_event_t event);
- void (*GGML_CALL event_record) (ggml_backend_event_t event);
- void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
- void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
- };
-
- struct ggml_backend {
- ggml_guid_t guid;
-
- struct ggml_backend_i iface;
- ggml_backend_context_t context;
- };
-
- struct ggml_backend_event {
- ggml_backend_t backend;
- void * context;
- };
-
- //
- // Backend registry
- //
-
- typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
-
- GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/ggml-backend.c b/ggml-backend.c
deleted file mode 100644
index f5bdcf07838..00000000000
--- a/ggml-backend.c
+++ /dev/null
@@ -1,2101 +0,0 @@
-#include "ggml-backend-impl.h"
-#include "ggml-alloc.h"
-#include "ggml-impl.h"
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-// backend buffer type
-
-const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
- return buft->iface.get_name(buft);
-}
-
-GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
- return buft->iface.alloc_buffer(buft, size);
-}
-
-size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
- return buft->iface.get_alignment(buft);
-}
-
-size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
- // get_max_size is optional, defaults to SIZE_MAX
- if (buft->iface.get_max_size) {
- return buft->iface.get_max_size(buft);
- }
- return SIZE_MAX;
-}
-
-GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
- // get_alloc_size is optional, defaults to ggml_nbytes
- if (buft->iface.get_alloc_size) {
- size_t size = buft->iface.get_alloc_size(buft, tensor);
- assert(size >= ggml_nbytes(tensor));
- return size;
- }
- return ggml_nbytes(tensor);
-}
-
-bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- return buft->iface.supports_backend(buft, backend);
-}
-
-bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
- if (buft->iface.is_host) {
- return buft->iface.is_host(buft);
- }
- return false;
-}
-
-// backend buffer
-
-GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
- ggml_backend_buffer_type_t buft,
- struct ggml_backend_buffer_i iface,
- ggml_backend_buffer_context_t context,
- size_t size) {
- ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
-
- (*buffer) = (struct ggml_backend_buffer) {
- /* .interface = */ iface,
- /* .buft = */ buft,
- /* .context = */ context,
- /* .size = */ size,
- /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
- };
-
- return buffer;
-}
-
-const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
- return buffer->iface.get_name(buffer);
-}
-
-void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
- if (buffer == NULL) {
- return;
- }
-
- if (buffer->iface.free_buffer != NULL) {
- buffer->iface.free_buffer(buffer);
- }
- free(buffer);
-}
-
-size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
- return buffer->size;
-}
-
-void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
- void * base = buffer->iface.get_base(buffer);
-
- GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
-
- return base;
-}
-
-GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
- // init_tensor is optional
- if (buffer->iface.init_tensor) {
- buffer->iface.init_tensor(buffer, tensor);
- }
-}
-
-size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
- return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
-}
-
-size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
- return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
-}
-
-size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
- return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
-}
-
-void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
- buffer->iface.clear(buffer, value);
-}
-
-bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
- return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
-}
-
-void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
- buffer->usage = usage;
-
- // FIXME: add a generic callback to the buffer interface
- if (ggml_backend_buffer_is_multi_buffer(buffer)) {
- ggml_backend_multi_buffer_set_usage(buffer, usage);
- }
-}
-
-ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
- return buffer->buft;
-}
-
-void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
- if (buffer->iface.reset) {
- buffer->iface.reset(buffer);
- }
-}
-
-bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
- ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
- if (dst_buf->iface.cpy_tensor) {
- return src->buffer->iface.cpy_tensor(dst_buf, src, dst);
- }
- return false;
-}
-
-// backend
-
-ggml_guid_t ggml_backend_guid(ggml_backend_t backend) {
- if (backend == NULL) {
- return NULL;
- }
- return backend->guid;
-}
-
-const char * ggml_backend_name(ggml_backend_t backend) {
- if (backend == NULL) {
- return "NULL";
- }
- return backend->iface.get_name(backend);
-}
-
-void ggml_backend_free(ggml_backend_t backend) {
- if (backend == NULL) {
- return;
- }
-
- backend->iface.free(backend);
-}
-
-ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
- return backend->iface.get_default_buffer_type(backend);
-}
-
-ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
- return ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), size);
-}
-
-size_t ggml_backend_get_alignment(ggml_backend_t backend) {
- return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
-}
-
-size_t ggml_backend_get_max_size(ggml_backend_t backend) {
- return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
-}
-
-void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-
- if (backend->iface.set_tensor_async == NULL) {
- ggml_backend_tensor_set(tensor, data, offset, size);
- } else {
- backend->iface.set_tensor_async(backend, tensor, data, offset, size);
- }
-}
-
-void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-
- if (backend->iface.get_tensor_async == NULL) {
- ggml_backend_tensor_get(tensor, data, offset, size);
- } else {
- backend->iface.get_tensor_async(backend, tensor, data, offset, size);
- }
-}
-
-GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
- ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
-
- GGML_ASSERT(buf != NULL && "tensor buffer not set");
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-
- if (!size) {
- return;
- }
-
- buf->iface.set_tensor(buf, tensor, data, offset, size);
-}
-
-GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
- ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
-
- GGML_ASSERT(buf != NULL && "tensor buffer not set");
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-
- if (!size) {
- return;
- }
-
- buf->iface.get_tensor(buf, tensor, data, offset, size);
-}
-
-void ggml_backend_synchronize(ggml_backend_t backend) {
- if (backend->iface.synchronize == NULL) {
- return;
- }
-
- backend->iface.synchronize(backend);
-}
-
-ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
- GGML_ASSERT(backend->iface.graph_plan_create != NULL);
-
- return backend->iface.graph_plan_create(backend, cgraph);
-}
-
-void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
- GGML_ASSERT(backend->iface.graph_plan_free != NULL);
-
- backend->iface.graph_plan_free(backend, plan);
-}
-
-enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
- GGML_ASSERT(backend->iface.graph_plan_compute != NULL);
-
- return backend->iface.graph_plan_compute(backend, plan);
-}
-
-enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
- enum ggml_status err = ggml_backend_graph_compute_async(backend, cgraph);
- ggml_backend_synchronize(backend);
- return err;
-}
-
-enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
- return backend->iface.graph_compute(backend, cgraph);
-}
-
-bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
- return backend->iface.supports_op(backend, op);
-}
-
-bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
- if (backend->iface.offload_op != NULL) {
- return backend->iface.offload_op(backend, op);
- }
- return false;
-}
-
-// backend copy
-
-static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
- if (a->type != b->type) {
- return false;
- }
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
- if (a->ne[i] != b->ne[i]) {
- return false;
- }
- if (a->nb[i] != b->nb[i]) {
- return false;
- }
- }
- return true;
-}
-
-void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
- GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
-
- if (src == dst) {
- return;
- }
-
- if (ggml_backend_buffer_is_host(src->buffer)) {
- ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
- } else if (ggml_backend_buffer_is_host(dst->buffer)) {
- ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
- } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
-#ifndef NDEBUG
- fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
-#endif
- size_t nbytes = ggml_nbytes(src);
- void * data = malloc(nbytes);
- ggml_backend_tensor_get(src, data, 0, nbytes);
- ggml_backend_tensor_set(dst, data, 0, nbytes);
- free(data);
- }
-}
-
-void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst) {
- GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
-
- if (src == dst) {
- return;
- }
-
- if (backend_dst->iface.cpy_tensor_async != NULL) {
- if (backend_dst->iface.cpy_tensor_async(backend_src, backend_dst, src, dst)) {
- return;
- }
- }
-
- // an async copy would normally happen after all the queued operations on both backends are completed
- // sync src, set_async dst
- if (ggml_backend_buffer_is_host(src->buffer)) {
- ggml_backend_synchronize(backend_src);
- ggml_backend_tensor_set_async(backend_dst, dst, src->data, 0, ggml_nbytes(src));
- } else {
- ggml_backend_synchronize(backend_src);
- ggml_backend_tensor_copy(src, dst);
- ggml_backend_synchronize(backend_dst);
- }
-}
-
-// events
-
-ggml_backend_event_t ggml_backend_event_new(ggml_backend_t backend) {
- if (backend->iface.event_new == NULL) {
- return NULL;
- }
- return backend->iface.event_new(backend);
-}
-
-void ggml_backend_event_free(ggml_backend_event_t event) {
- if (event == NULL) {
- return;
- }
- event->backend->iface.event_free(event);
-}
-
-void ggml_backend_event_record(ggml_backend_event_t event) {
- GGML_ASSERT(event->backend->iface.event_record != NULL);
-
- event->backend->iface.event_record(event);
-}
-
-void ggml_backend_event_synchronize(ggml_backend_event_t event) {
- GGML_ASSERT(event->backend->iface.event_synchronize != NULL);
-
- event->backend->iface.event_synchronize(event);
-}
-
-void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
- GGML_ASSERT(backend->iface.event_wait != NULL);
-
- backend->iface.event_wait(backend, event);
-}
-
-// backend registry
-
-#define GGML_REG_MAX_BACKENDS 16
-
-struct ggml_backend_reg {
- char name[128];
- ggml_backend_init_fn init_fn;
- ggml_backend_buffer_type_t default_buffer_type;
- void * user_data;
-};
-
-static struct ggml_backend_reg ggml_backend_registry[GGML_REG_MAX_BACKENDS];
-static size_t ggml_backend_registry_count = 0;
-
-GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
-
-GGML_CALL static void ggml_backend_registry_init(void) {
- static bool initialized = false;
-
- if (initialized) {
- return;
- }
-
- initialized = true;
-
- ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL);
-
- // add forward decls here to avoid including the backend headers
-#ifdef GGML_USE_CUDA
- extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
- ggml_backend_cuda_reg_devices();
-#endif
-
-#ifdef GGML_USE_SYCL
- extern void ggml_backend_sycl_reg_devices(void);
- ggml_backend_sycl_reg_devices();
-#endif
-
-#ifdef GGML_USE_METAL
- extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
- extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
- ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
-#endif
-
-#ifdef GGML_USE_VULKAN
- extern GGML_CALL int ggml_backend_vk_reg_devices(void);
- ggml_backend_vk_reg_devices();
-#endif
-
-#ifdef GGML_USE_KOMPUTE
- extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
- ggml_backend_kompute_reg_devices();
-#endif
-}
-
-GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
- GGML_ASSERT(ggml_backend_registry_count < GGML_REG_MAX_BACKENDS);
-
- size_t id = ggml_backend_registry_count;
-
- ggml_backend_registry[id] = (struct ggml_backend_reg) {
- /* .name = */ {0},
- /* .fn = */ init_fn,
- /* .default_buffer_type = */ default_buffer_type,
- /* .user_data = */ user_data,
- };
-
- snprintf(ggml_backend_registry[id].name, sizeof(ggml_backend_registry[id].name), "%s", name);
-
-#ifndef NDEBUG
- fprintf(stderr, "%s: registered backend %s\n", __func__, name);
-#endif
-
- ggml_backend_registry_count++;
-}
-
-size_t ggml_backend_reg_get_count(void) {
- ggml_backend_registry_init();
-
- return ggml_backend_registry_count;
-}
-
-size_t ggml_backend_reg_find_by_name(const char * name) {
- ggml_backend_registry_init();
-
- for (size_t i = 0; i < ggml_backend_registry_count; i++) {
- // TODO: case insensitive in a portable way
- if (strcmp(ggml_backend_registry[i].name, name) == 0) {
- return i;
- }
- }
-
- // not found
- return SIZE_MAX;
-}
-
-// init from backend:params string
-ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str) {
- ggml_backend_registry_init();
-
- const char * params = strchr(backend_str, ':');
- char backend_name[128];
- if (params == NULL) {
- snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
- params = "";
- } else {
- snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
- params++;
- }
-
- size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
-
- if (backend_i == SIZE_MAX) {
- fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
- return NULL;
- }
-
- return ggml_backend_reg_init_backend(backend_i, params);
-}
-
-const char * ggml_backend_reg_get_name(size_t i) {
- ggml_backend_registry_init();
-
- GGML_ASSERT(i < ggml_backend_registry_count);
- return ggml_backend_registry[i].name;
-}
-
-ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params) {
- ggml_backend_registry_init();
-
- GGML_ASSERT(i < ggml_backend_registry_count);
- return ggml_backend_registry[i].init_fn(params, ggml_backend_registry[i].user_data);
-}
-
-ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i) {
- ggml_backend_registry_init();
-
- GGML_ASSERT(i < ggml_backend_registry_count);
- return ggml_backend_registry[i].default_buffer_type;
-}
-
-ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
- ggml_backend_registry_init();
-
- GGML_ASSERT(i < ggml_backend_registry_count);
- return ggml_backend_buft_alloc_buffer(ggml_backend_registry[i].default_buffer_type, size);
-}
-
-// backend CPU
-
-static const size_t TENSOR_ALIGNMENT = 32; // required for mmap as gguf only guarantees 32-byte alignment
-
-GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
- return "CPU";
-
- GGML_UNUSED(buffer);
-}
-
-GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
- uintptr_t data = (uintptr_t)buffer->context;
-
- // align the buffer
- if (data % TENSOR_ALIGNMENT != 0) {
- data = GGML_PAD(data, TENSOR_ALIGNMENT);
- }
-
- return (void *)data;
-}
-
-GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
- free(buffer->context);
-}
-
-GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
- memcpy((char *)tensor->data + offset, data, size);
-
- GGML_UNUSED(buffer);
-}
-
-GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
- memcpy(data, (const char *)tensor->data + offset, size);
-
- GGML_UNUSED(buffer);
-}
-
-GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
- if (ggml_backend_buffer_is_host(src->buffer)) {
- memcpy(dst->data, src->data, ggml_nbytes(src));
- return true;
- }
- return false;
-
- GGML_UNUSED(buffer);
-}
-
-GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
- memset(buffer->context, value, buffer->size);
-}
-
-static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
- /* .get_name = */ ggml_backend_cpu_buffer_name,
- /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
- /* .get_base = */ ggml_backend_cpu_buffer_get_base,
- /* .init_tensor = */ NULL, // no initialization required
- /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
- /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
- /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
- /* .clear = */ ggml_backend_cpu_buffer_clear,
- /* .reset = */ NULL,
-};
-
-// for buffers from ptr, free is not called
-static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
- /* .get_name = */ ggml_backend_cpu_buffer_name,
- /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
- /* .get_base = */ ggml_backend_cpu_buffer_get_base,
- /* .init_tensor = */ NULL, // no initialization required
- /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
- /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
- /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
- /* .clear = */ ggml_backend_cpu_buffer_clear,
- /* .reset = */ NULL,
-};
-
-GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
- return "CPU";
-
- GGML_UNUSED(buft);
-}
-
-GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
- size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
- void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
- if (data == NULL) {
- fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
- return NULL;
- }
-
- return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
-}
-
-GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
- return TENSOR_ALIGNMENT;
-
- GGML_UNUSED(buft);
-}
-
-GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- return ggml_backend_is_cpu(backend);
-
- GGML_UNUSED(buft);
-}
-
-GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
- return true;
-
- GGML_UNUSED(buft);
-}
-
-GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
- static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
- /* .iface = */ {
- /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
- /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
- /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
- /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
- /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
- },
- /* .context = */ NULL,
- };
-
- return &ggml_backend_cpu_buffer_type;
-}
-
-#ifdef GGML_USE_CPU_HBM
-
-// buffer type HBM
-
-#include
-
-GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
- return "CPU_HBM";
-
- GGML_UNUSED(buft);
-}
-
-GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
- return "CPU_HBM";
-
- GGML_UNUSED(buf);
-}
-
-GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
- hbw_free(buffer->context);
-}
-
-GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
- //void * ptr = hbw_malloc(size);
- void * ptr;
- int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
- if (result != 0) {
- fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
- return NULL;
- }
-
- ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
- buffer->buft = buft;
- buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
- buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
-
- return buffer;
-}
-
-ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
- static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
- /* .iface = */ {
- /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
- /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
- /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
- /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
- /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
- /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
- },
- /* .context = */ NULL,
- };
-
- return &ggml_backend_cpu_buffer_type_hbm;
-}
-#endif
-
-struct ggml_backend_cpu_context {
- int n_threads;
- void * work_data;
- size_t work_size;
-
- ggml_abort_callback abort_callback;
- void * abort_callback_data;
-};
-
-GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
- return "CPU";
-
- GGML_UNUSED(backend);
-}
-
-GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
- free(cpu_ctx->work_data);
- free(cpu_ctx);
- free(backend);
-}
-
-GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
- return ggml_backend_cpu_buffer_type();
-
- GGML_UNUSED(backend);
-}
-
-struct ggml_backend_plan_cpu {
- struct ggml_cplan cplan;
- struct ggml_cgraph cgraph;
-};
-
-GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
-
- struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
-
- cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
- cpu_plan->cgraph = *cgraph; // FIXME: deep copy
-
- if (cpu_plan->cplan.work_size > 0) {
- cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
- if (cpu_plan->cplan.work_data == NULL) {
- free(cpu_plan);
- return NULL;
- }
- }
-
- cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
- cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
-
- return cpu_plan;
-}
-
-GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
- struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
-
- free(cpu_plan->cplan.work_data);
- free(cpu_plan);
-
- GGML_UNUSED(backend);
-}
-
-GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
- struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
-
- return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
-
- GGML_UNUSED(backend);
-}
-
-GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
- struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
-
- struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
-
- if (cpu_ctx->work_size < cplan.work_size) {
- free(cpu_ctx->work_data);
- cpu_ctx->work_data = malloc(cplan.work_size);
- if (cpu_ctx->work_data == NULL) {
- cpu_ctx->work_size = 0;
- return GGML_STATUS_ALLOC_FAILED;
- }
- cpu_ctx->work_size = cplan.work_size;
- }
- cplan.work_data = cpu_ctx->work_data;
-
- cplan.abort_callback = cpu_ctx->abort_callback;
- cplan.abort_callback_data = cpu_ctx->abort_callback_data;
-
- return ggml_graph_compute(cgraph, &cplan);
-}
-
-GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
- switch (op->op) {
- case GGML_OP_CPY:
- return
- op->type != GGML_TYPE_IQ2_XXS &&
- op->type != GGML_TYPE_IQ2_XS &&
- op->type != GGML_TYPE_IQ1_S &&
- op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
- case GGML_OP_MUL_MAT:
- return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
- default:
- return true;
- }
-
- GGML_UNUSED(backend);
-}
-
-static struct ggml_backend_i cpu_backend_i = {
- /* .get_name = */ ggml_backend_cpu_name,
- /* .free = */ ggml_backend_cpu_free,
- /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
- /* .set_tensor_async = */ NULL,
- /* .get_tensor_async = */ NULL,
- /* .cpy_tensor_async = */ NULL,
- /* .synchronize = */ NULL,
- /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
- /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
- /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
- /* .graph_compute = */ ggml_backend_cpu_graph_compute,
- /* .supports_op = */ ggml_backend_cpu_supports_op,
- /* .offload_op = */ NULL,
- /* .event_new = */ NULL,
- /* .event_free = */ NULL,
- /* .event_record = */ NULL,
- /* .event_wait = */ NULL,
- /* .event_synchronize = */ NULL,
-};
-
-static ggml_guid_t ggml_backend_cpu_guid(void) {
- static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
- return &guid;
-}
-
-ggml_backend_t ggml_backend_cpu_init(void) {
- struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
- if (ctx == NULL) {
- return NULL;
- }
-
- ctx->n_threads = GGML_DEFAULT_N_THREADS;
- ctx->work_data = NULL;
- ctx->work_size = 0;
- ctx->abort_callback = NULL;
- ctx->abort_callback_data = NULL;
-
- ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
- if (cpu_backend == NULL) {
- free(ctx);
- return NULL;
- }
-
- *cpu_backend = (struct ggml_backend) {
- /* .guid = */ ggml_backend_cpu_guid(),
- /* .interface = */ cpu_backend_i,
- /* .context = */ ctx
- };
- return cpu_backend;
-}
-
-GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
- return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
-}
-
-void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
- GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
-
- struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
- ctx->n_threads = n_threads;
-}
-
-void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
- GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
-
- struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
- ctx->abort_callback = abort_callback;
- ctx->abort_callback_data = abort_callback_data;
-}
-
-GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
- GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
- return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
-}
-
-GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
- return ggml_backend_cpu_init();
-
- GGML_UNUSED(params);
- GGML_UNUSED(user_data);
-}
-
-// multi-buffer buffer
-
-struct ggml_backend_multi_buffer_context {
- ggml_backend_buffer_t * buffers;
- size_t n_buffers;
-};
-
-typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
-
-GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
-
- return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
-}
-
-GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
- for (size_t i = 0; i < ctx->n_buffers; i++) {
- ggml_backend_buffer_free(ctx->buffers[i]);
- }
-
- free(ctx->buffers);
- free(ctx);
-}
-
-GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
- for (size_t i = 0; i < ctx->n_buffers; i++) {
- ggml_backend_buffer_clear(ctx->buffers[i], value);
- }
-}
-
-static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
- static struct ggml_backend_buffer_i multi_backend_buffer_i = {
- /* .get_name = */ ggml_backend_multi_buffer_get_name,
- /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
- /* .get_base = */ NULL,
- /* .init_tensor = */ NULL,
- /* .set_tensor = */ NULL,
- /* .get_tensor = */ NULL,
- /* .cpy_tensor = */ NULL,
- /* .clear = */ ggml_backend_multi_buffer_clear,
- /* .reset = */ NULL,
- };
-
- return multi_backend_buffer_i;
-}
-
-GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
- ctx->n_buffers = n_buffers;
- ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
-
- GGML_ASSERT(ctx->buffers != NULL);
-
- size_t total_size = 0;
- for (size_t i = 0; i < n_buffers; i++) {
- ctx->buffers[i] = buffers[i];
- total_size += ggml_backend_buffer_get_size(buffers[i]);
- }
-
- return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
-}
-
-GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
- return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
-}
-
-GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
- GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
- ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
- for (size_t i = 0; i < ctx->n_buffers; i++) {
- ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
- }
-}
-
-// creates a copy of the tensor with the same memory layout
-static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
- struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
- dup->nb[i] = tensor->nb[i];
- }
- return dup;
-}
-
-static bool ggml_is_view_op(enum ggml_op op) {
- return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
-}
-
-// scheduler
-
-#ifndef GGML_SCHED_MAX_BACKENDS
-#define GGML_SCHED_MAX_BACKENDS 16
-#endif
-
-#ifndef GGML_SCHED_MAX_SPLITS
-#define GGML_SCHED_MAX_SPLITS 2048
-#endif
-
-#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
-#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
-#endif
-
-#ifndef GGML_SCHED_MAX_COPIES
-#define GGML_SCHED_MAX_COPIES 4
-#endif
-
-struct ggml_backend_sched_split {
- int backend_id;
- int i_start;
- int i_end;
- struct ggml_tensor * inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
- int n_inputs;
- // graph view of this split
- struct ggml_cgraph graph;
-};
-
-struct ggml_backend_sched {
- bool is_reset; // true if the scheduler has been reset since the last graph split
- bool is_alloc;
-
- int n_backends;
-
- ggml_backend_t backends[GGML_SCHED_MAX_BACKENDS];
- ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
- ggml_gallocr_t galloc;
-
- // hash keys of the nodes in the graph
- struct ggml_hash_set hash_set;
- // hash values
- int * tensor_backend_id;
- struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
-
- int * node_backend_ids; // [graph_size]
- int * leaf_backend_ids; // [graph_size]
-
- // copy of the graph with modified inputs
- struct ggml_cgraph * graph;
-
- // graph splits
- struct ggml_backend_sched_split * splits;
- int n_splits;
- int splits_capacity;
-
- // pipeline parallelism support
- int n_copies;
- int cur_copy;
- ggml_backend_event_t events[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
- struct ggml_tensor * graph_inputs[GGML_SCHED_MAX_SPLIT_INPUTS];
- int n_graph_inputs;
-
- struct ggml_context * ctx;
-
- ggml_backend_sched_eval_callback callback_eval;
- void * callback_eval_user_data;
-
- // align context_buffer to GGML_MEM_ALIGN
-#ifdef _MSC_VER
- __declspec(align(GGML_MEM_ALIGN))
-#else
- __attribute__((aligned(GGML_MEM_ALIGN)))
-#endif
- char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
-};
-
-#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor)
-#define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)]
-
-// returns the priority of the backend, lower id is higher priority
-static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
- for (int i = 0; i < sched->n_backends; i++) {
- if (sched->backends[i] == backend) {
- return i;
- }
- }
- return -1;
-}
-
-static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
- ggml_backend_buffer_t buffer = tensor->buffer;
- if (buffer == NULL) {
- return -1;
- }
-
- // find highest prio backend that supports the buffer type
- for (int i = 0; i < sched->n_backends; i++) {
- if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
- return i;
- }
- }
-
- fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
- __func__, ggml_backend_buffer_name(buffer), tensor->name);
- GGML_ASSERT(false);
-
- return -1;
-}
-
-#if 0
-static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
-#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
-#define GET_CAUSE(node) causes[hash_id(node)]
-#else
-#define SET_CAUSE(node, ...)
-#define GET_CAUSE(node) ""
-#endif
-
-// returns the backend that should be used for the node based on the current locations
-static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
- // TODO: use supports_op to check if the backend supports the op
-
- // assign pre-allocated nodes to their backend
- int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
- if (cur_backend_id != -1) {
- SET_CAUSE(tensor, "1.dst");
- return cur_backend_id;
- }
-
- // view_src
- if (tensor->view_src != NULL) {
- cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
- if (cur_backend_id != -1) {
- SET_CAUSE(tensor, "1.vsrc");
- return cur_backend_id;
- }
- }
-
- // graph input
- if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
- cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
- SET_CAUSE(tensor, "1.inp");
- return cur_backend_id;
- }
-
- // assign nodes that use weights to the backend of the weights
- // operations with weights are preferably run on the same backend as the weights
- for (int i = 0; i < GGML_MAX_SRC; i++) {
- const struct ggml_tensor * src = tensor->src[i];
- if (src == NULL) {
- continue;
- }
- if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
- int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
- // check if a backend with higher prio wants to offload the op
- if (src_backend_id == sched->n_backends - 1) {
- for (int b = 0; b < src_backend_id; b++) {
- if (ggml_backend_offload_op(sched->backends[b], tensor)) {
- SET_CAUSE(tensor, "1.off");
- return b;
- }
- }
- }
- SET_CAUSE(tensor, "1.wgt%d", i);
- return src_backend_id;
- }
- }
-
- return -1;
-}
-
-static char * fmt_size(size_t size) {
- static char buffer[128];
- if (size >= 1024*1024) {
- sprintf(buffer, "%zuM", size/1024/1024);
- } else {
- sprintf(buffer, "%zuK", size/1024);
- }
- return buffer;
-}
-
-static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
- int cur_split = 0;
- for (int i = 0; i < graph->n_nodes; i++) {
- if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
- ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
- fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
- sched->splits[cur_split].n_inputs);
- for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
- fprintf(stderr, "[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
- fmt_size(ggml_nbytes(sched->splits[cur_split].inputs[j])));
- }
- fprintf(stderr, "\n");
- cur_split++;
- }
- struct ggml_tensor * node = graph->nodes[i];
- if (ggml_is_view_op(node->op)) {
- continue;
- }
- ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
- fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
- fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * src = node->src[j];
- if (src == NULL) {
- continue;
- }
- ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
- fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
- fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
- }
- fprintf(stderr, "\n");
- }
-}
-
-//#define DEBUG_PASS1
-//#define DEBUG_PASS2
-//#define DEBUG_PASS3
-//#define DEBUG_PASS4
-
-// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
-static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
- // reset splits
- sched->n_splits = 0;
- sched->n_graph_inputs = 0;
- sched->is_reset = false;
-
- struct ggml_init_params params = {
- /* .mem_size = */ sizeof(sched->context_buffer),
- /* .mem_buffer = */ sched->context_buffer,
- /* .no_alloc = */ true
- };
-
- ggml_free(sched->ctx);
-
- sched->ctx = ggml_init(params);
- if (sched->ctx == NULL) {
- fprintf(stderr, "%s: failed to initialize context\n", __func__);
- GGML_ASSERT(false);
- }
-
- // pass 1: assign backends to ops with pre-allocated inputs
- for (int i = 0; i < graph->n_leafs; i++) {
- struct ggml_tensor * leaf = graph->leafs[i];
- int * leaf_backend_id = &tensor_backend_id(leaf);
- if (*leaf_backend_id != -1) {
- // do not overwrite user assignments
- continue;
- }
- *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
- }
-
- for (int i = 0; i < graph->n_nodes; i++) {
- struct ggml_tensor * node = graph->nodes[i];
- int * node_backend_id = &tensor_backend_id(node);
- if (*node_backend_id != -1) {
- // do not overwrite user assignments
- continue;
- }
- *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
- // src
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * src = node->src[j];
- if (src == NULL) {
- continue;
- }
- int * src_backend_id = &tensor_backend_id(src);
- if (*src_backend_id == -1) {
- *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
- }
- }
- }
-#ifdef DEBUG_PASS1
- fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
-
- // pass 2: expand current backend assignments
- // assign the same backend to adjacent nodes
- // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
- // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
-
-
- // pass 2.2 expand gpu down
- {
- int cur_backend_id = -1;
- for (int i = 0; i < graph->n_nodes; i++) {
- struct ggml_tensor * node = graph->nodes[i];
- if (ggml_is_view_op(node->op)) {
- continue;
- }
- int * node_backend_id = &tensor_backend_id(node);
- if (*node_backend_id != -1) {
- if (*node_backend_id == sched->n_backends - 1) {
- // skip cpu (lowest prio backend)
- cur_backend_id = -1;
- } else {
- cur_backend_id = *node_backend_id;
- }
- } else {
- *node_backend_id = cur_backend_id;
- SET_CAUSE(node, "2.2");
- }
- }
- }
- // pass 2.1 expand gpu up
- {
- int cur_backend_id = -1;
- for (int i = graph->n_nodes - 1; i >= 0; i--) {
- struct ggml_tensor * node = graph->nodes[i];
- if (ggml_is_view_op(node->op)) {
- continue;
- }
- int * node_backend_id = &tensor_backend_id(node);
- if (*node_backend_id != -1) {
- if (*node_backend_id == sched->n_backends - 1) {
- // skip cpu (lowest prio backend)
- cur_backend_id = -1;
- } else {
- cur_backend_id = *node_backend_id;
- }
- } else {
- *node_backend_id = cur_backend_id;
- SET_CAUSE(node, "2.1");
- }
- }
- }
- // pass 2.4 expand rest down
- {
- int cur_backend_id = -1;
- for (int i = 0; i < graph->n_nodes; i++) {
- struct ggml_tensor * node = graph->nodes[i];
- if (ggml_is_view_op(node->op)) {
- continue;
- }
- int * node_backend_id = &tensor_backend_id(node);
- if (*node_backend_id != -1) {
- cur_backend_id = *node_backend_id;
- } else {
- *node_backend_id = cur_backend_id;
- SET_CAUSE(node, "2.4");
- }
- }
- }
- // pass 2.3 expand rest up
- {
- int cur_backend_id = -1;
- for (int i = graph->n_nodes - 1; i >= 0; i--) {
- struct ggml_tensor * node = graph->nodes[i];
- if (ggml_is_view_op(node->op)) {
- continue;
- }
- int * node_backend_id = &tensor_backend_id(node);
- if (*node_backend_id != -1) {
- cur_backend_id = *node_backend_id;
- } else {
- *node_backend_id = cur_backend_id;
- SET_CAUSE(node, "2.3");
- }
- }
- }
-
-#ifdef DEBUG_PASS2
- fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
-
- // pass 3: assign backends to remaining src from dst and view_src
- for (int i = 0; i < graph->n_nodes; i++) {
- struct ggml_tensor * node = graph->nodes[i];
- int * cur_backend_id = &tensor_backend_id(node);
- if (node->view_src != NULL && *cur_backend_id == -1) {
- *cur_backend_id = tensor_backend_id(node->view_src);
- SET_CAUSE(node, "3.vsrc");
- }
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * src = node->src[j];
- if (src == NULL) {
- continue;
- }
- int * src_backend_id = &tensor_backend_id(src);
- if (*src_backend_id == -1) {
- if (src->view_src != NULL) {
- // views are always on the same backend as the source
- *src_backend_id = tensor_backend_id(src->view_src);
- SET_CAUSE(src, "3.vsrc");
- } else {
- *src_backend_id = *cur_backend_id;
- SET_CAUSE(src, "3.cur");
- }
- }
- }
- }
-#ifdef DEBUG_PASS3
- fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
-
- // pass 4: split graph, find tensors that need to be copied
- {
- int i_split = 0;
- struct ggml_backend_sched_split * split = &sched->splits[0];
- // find the backend of the first split, skipping view ops
- for (int i = 0; i < graph->n_nodes; i++) {
- struct ggml_tensor * node = graph->nodes[i];
- if (!ggml_is_view_op(node->op)) {
- split->backend_id = tensor_backend_id(node);
- break;
- }
- }
- split->i_start = 0;
- split->n_inputs = 0;
- memset(split->inputs, 0, sizeof(split->inputs)); //HACK
- int cur_backend_id = split->backend_id;
- for (int i = 0; i < graph->n_nodes; i++) {
- struct ggml_tensor * node = graph->nodes[i];
-
- if (ggml_is_view_op(node->op)) {
- continue;
- }
-
- const int node_backend_id = tensor_backend_id(node);
-
- GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
-
- // check if we should start a new split based on the sources of the current node
- bool need_new_split = false;
- if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * src = node->src[j];
- if (src == NULL) {
- continue;
- }
- // check if a weight is on a different backend
- // by starting a new split, the memory of the previously offloaded weights can be reused
- if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
- int src_backend_id = tensor_backend_id(src);
- if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
- need_new_split = true;
- break;
- }
- }
- // check if the split has too many inputs
- if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
- const size_t id = hash_id(src);
- int src_backend_id = sched->tensor_backend_id[id];
- if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
- //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
- need_new_split = true;
- break;
- }
- }
- }
- }
-
- if (node_backend_id != cur_backend_id || need_new_split) {
- split->i_end = i;
- i_split++;
- if (i_split >= sched->splits_capacity) {
- sched->splits_capacity *= 2;
- sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
- GGML_ASSERT(sched->splits != NULL);
- }
- GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
- split = &sched->splits[i_split];
- split->backend_id = node_backend_id;
- split->i_start = i;
- split->n_inputs = 0;
- cur_backend_id = node_backend_id;
- }
-
- // find inputs that are not on the same backend
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * src = node->src[j];
- if (src == NULL) {
- continue;
- }
-
- const int src_backend_id = tensor_backend_id(src);
- assert(src_backend_id != -1); // all inputs should be assigned by now
-
- if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
- size_t id = hash_id(src);
- if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
- ggml_backend_t backend = sched->backends[src_backend_id];
- for (int c = 0; c < sched->n_copies; c++) {
- struct ggml_tensor * tensor_copy;
- if (c == sched->cur_copy) {
- tensor_copy = src; // use the original tensor as the current copy
- } else {
- tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
- ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
- }
- if (sched->n_copies > 1) {
- ggml_set_input(tensor_copy);
- ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
- }
- sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
- SET_CAUSE(tensor_copy, "4.cpy");
- }
- int n_graph_inputs = sched->n_graph_inputs++;
- GGML_ASSERT(n_graph_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
- sched->graph_inputs[n_graph_inputs] = src;
- }
- }
-
- if (src_backend_id != node_backend_id) {
- // create a copy of the input in the split's backend
- const size_t id = hash_id(src);
- if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
- ggml_backend_t backend = sched->backends[cur_backend_id];
- for (int c = 0; c < sched->n_copies; c++) {
- struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
- ggml_format_name(tensor_copy, "%s#%s#%d", ggml_backend_name(backend), src->name, c);
- if (sched->n_copies > 1) {
- ggml_set_input(tensor_copy);
- ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
- }
- sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
- SET_CAUSE(tensor_copy, "4.cpy");
- }
- int n_inputs = split->n_inputs++;
- GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
- split->inputs[n_inputs] = src;
- }
- node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
- }
- }
- }
- split->i_end = graph->n_nodes;
- sched->n_splits = i_split + 1;
- }
-#ifdef DEBUG_PASS4
- fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
-
- // create copies of the graph for each split
- // TODO: avoid this copy
- struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
- for (int i = 0; i < sched->n_splits; i++) {
- struct ggml_backend_sched_split * split = &sched->splits[i];
- split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
-
- // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
- for (int j = 0; j < split->n_inputs; j++) {
- assert(graph_copy->size > (graph_copy->n_nodes + 1));
-
- struct ggml_tensor * input = split->inputs[j];
- const size_t input_id = hash_id(input);
- struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
-
- // add a dependency to the input source so that it is not freed before the copy is done
- struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
- input_dep->src[0] = input;
- sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
- graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
-
- // add a dependency to the input copy so that it is allocated at the start of the split
- sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
- graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
- }
-
- for (int j = split->i_start; j < split->i_end; j++) {
- assert(graph_copy->size > graph_copy->n_nodes);
- sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
- graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
- }
- }
-
- if (sched->n_copies > 1) {
- // add input copies as leafs so that they are allocated first
- for (int i = 0; i < sched->n_graph_inputs; i++) {
- struct ggml_tensor * input = sched->graph_inputs[i];
- size_t id = hash_id(input);
- int backend_id = tensor_backend_id(input);
- for (int c = 0; c < sched->n_copies; c++) {
- struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
- sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
- graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
- }
- }
-
- for (int i = 0; i < sched->n_splits; i++) {
- struct ggml_backend_sched_split * split = &sched->splits[i];
- int backend_id = split->backend_id;
- for (int j = 0; j < split->n_inputs; j++) {
- struct ggml_tensor * input = split->inputs[j];
- size_t id = hash_id(input);
- for (int c = 0; c < sched->n_copies; c++) {
- struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c];
- sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
- graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
- }
- }
- }
- }
-
- // add leafs from the original graph
- for (int i = 0; i < graph->n_leafs; i++) {
- struct ggml_tensor * leaf = graph->leafs[i];
- sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
- graph_copy->leafs[graph_copy->n_leafs++] = leaf;
- }
-
- sched->graph = graph_copy;
-}
-
-static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
- // allocate graph
- if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
- // the re-allocation may cause the split inputs to be moved to a different address
- ggml_backend_sched_synchronize(sched);
-#ifndef NDEBUG
- fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__);
-#endif
- ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
- if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
- fprintf(stderr, "%s: failed to allocate graph\n", __func__);
- return false;
- }
- }
-
- return true;
-}
-
-static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
- struct ggml_backend_sched_split * splits = sched->splits;
-
- for (int i = 0; i < sched->n_splits; i++) {
- struct ggml_backend_sched_split * split = &splits[i];
- int split_backend_id = split->backend_id;
- ggml_backend_t split_backend = sched->backends[split_backend_id];
-
- // copy the input tensors to the split backend
- for (int j = 0; j < split->n_inputs; j++) {
- ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
- struct ggml_tensor * input = split->inputs[j];
- struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy];
-
- if (input->flags & GGML_TENSOR_FLAG_INPUT) {
- // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
- ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
- } else {
- ggml_backend_synchronize(split_backend);
- }
- ggml_backend_tensor_copy(input, input_cpy);
- } else {
- // wait for the split backend to finish using the input before overwriting it
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
- ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
- } else {
- ggml_backend_synchronize(split_backend);
- }
- ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
- }
- }
-
- if (!sched->callback_eval) {
- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph);
- if (ec != GGML_STATUS_SUCCESS) {
- return ec;
- }
- } else {
- // similar to ggml_backend_compare_graph_backend
- for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
- struct ggml_tensor * t = split->graph.nodes[j0];
-
- // check if the user needs data from this node
- bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
-
- int j1 = j0;
-
- // determine the range [j0, j1] of nodes that can be computed together
- while (!need && j1 < split->graph.n_nodes - 1) {
- t = split->graph.nodes[++j1];
- need = sched->callback_eval(t, true, sched->callback_eval_user_data);
- }
-
- struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
-
- enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &gv);
- if (ec != GGML_STATUS_SUCCESS) {
- return ec;
- }
-
- // TODO: pass backend to the callback, then the user can decide if they want to synchronize
- ggml_backend_synchronize(split_backend);
-
- if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
- break;
- }
-
- j0 = j1;
- }
- }
-
- // record the event of this copy
- if (split->n_inputs > 0) {
- if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
- ggml_backend_event_record(sched->events[split_backend_id][sched->cur_copy]);
- }
- }
- }
-
- sched->cur_copy = (sched->cur_copy + 1) % sched->n_copies;
-
- return GGML_STATUS_SUCCESS;
-}
-
-ggml_backend_sched_t ggml_backend_sched_new(
- ggml_backend_t * backends,
- ggml_backend_buffer_type_t * bufts,
- int n_backends,
- size_t graph_size,
- bool parallel) {
- GGML_ASSERT(n_backends > 0);
- GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
- GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
-
- struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
-
- // initialize hash table
- sched->hash_set = ggml_hash_set_new(graph_size);
- sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
- sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0]));
-
- const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
- sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
- sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
-
- sched->n_backends = n_backends;
-
- sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
-
- const int initial_splits_capacity = 16;
- sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
- sched->splits_capacity = initial_splits_capacity;
-
- for (int b = 0; b < n_backends; b++) {
- sched->backends[b] = backends[b];
- sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
- GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
- if (sched->n_copies > 1) {
- for (int c = 0; c < sched->n_copies; c++) {
- sched->events[b][c] = ggml_backend_event_new(backends[b]);
- }
- }
- }
-
- sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
-
- ggml_backend_sched_reset(sched);
-
- return sched;
-}
-
-void ggml_backend_sched_free(ggml_backend_sched_t sched) {
- if (sched == NULL) {
- return;
- }
- for (int b = 0; b < sched->n_backends; b++) {
- for (int c = 0; c < sched->n_copies; c++) {
- ggml_backend_event_free(sched->events[b][c]);
- }
- }
- ggml_gallocr_free(sched->galloc);
- ggml_free(sched->ctx);
- free(sched->splits);
- free(sched->hash_set.keys);
- free(sched->tensor_backend_id);
- free(sched->tensor_copies);
- free(sched->node_backend_ids);
- free(sched->leaf_backend_ids);
- free(sched);
-}
-
-void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
- // reset state for the next run
- if (!sched->is_reset) {
- size_t hash_size = sched->hash_set.size;
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
- memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
- memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
-
- sched->is_reset = true;
- }
- sched->is_alloc = false;
-}
-
-bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
- GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
-
- ggml_backend_sched_split_graph(sched, measure_graph);
-
- // TODO: extract this to a separate function
- if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
- return false;
- }
-
- ggml_backend_sched_reset(sched);
- ggml_backend_sched_synchronize(sched);
-
- return true;
-}
-
-bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
- GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
-
- ggml_backend_sched_split_graph(sched, graph);
-
- if (!ggml_backend_sched_alloc_splits(sched)) {
- return false;
- }
-
- sched->is_alloc = true;
-
- return true;
-}
-
-enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
- enum ggml_status err = ggml_backend_sched_graph_compute_async(sched, graph);
- ggml_backend_sched_synchronize(sched);
- return err;
-}
-
-enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
- if (!sched->is_reset && !sched->is_alloc) {
- ggml_backend_sched_reset(sched);
- }
-
- if (!sched->is_alloc) {
- if (!ggml_backend_sched_alloc_graph(sched, graph)) {
- return GGML_STATUS_ALLOC_FAILED;
- }
- }
-
- return ggml_backend_sched_compute_splits(sched);
-}
-
-void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
- for (int i = 0; i < sched->n_backends; i++) {
- ggml_backend_synchronize(sched->backends[i]);
- }
-}
-
-void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
- sched->callback_eval = callback;
- sched->callback_eval_user_data = user_data;
-}
-
-int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
- return sched->n_splits;
-}
-
-int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched) {
- return sched->n_copies;
-}
-
-size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
- int backend_index = ggml_backend_sched_backend_id(sched, backend);
- GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
-
- return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
-}
-
-void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
- int backend_index = ggml_backend_sched_backend_id(sched, backend);
- GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
- tensor_backend_id(node) = backend_index;
-}
-
-ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
- int backend_index = tensor_backend_id(node);
- if (backend_index == -1) {
- return NULL;
- }
- return sched->backends[backend_index];
-}
-
-// utils
-
-void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
- GGML_ASSERT(tensor->buffer == NULL);
- GGML_ASSERT(tensor->view_src != NULL);
- GGML_ASSERT(tensor->view_src->buffer != NULL);
- GGML_ASSERT(tensor->view_src->data != NULL);
-
- tensor->buffer = buffer;
- tensor->data = (char *)tensor->view_src->data + tensor->view_offs;
- tensor->backend = tensor->view_src->backend;
- ggml_backend_buffer_init_tensor(buffer, tensor);
-}
-
-void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr) {
- GGML_ASSERT(tensor->buffer == NULL);
- GGML_ASSERT(tensor->data == NULL);
- GGML_ASSERT(tensor->view_src == NULL);
- GGML_ASSERT(addr >= ggml_backend_buffer_get_base(buffer));
- GGML_ASSERT((char *)addr + ggml_backend_buffer_get_alloc_size(buffer, tensor) <=
- (char *)ggml_backend_buffer_get_base(buffer) + ggml_backend_buffer_get_size(buffer));
-
- tensor->buffer = buffer;
- tensor->data = addr;
- ggml_backend_buffer_init_tensor(buffer, tensor);
-}
-
-static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
- struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
-
- GGML_ASSERT(src != NULL);
- GGML_ASSERT(src->data && "graph must be allocated");
-
- size_t id = ggml_hash_insert(hash_set, src);
- if (id == GGML_HASHTABLE_ALREADY_EXISTS) {
- return node_copies[ggml_hash_find(hash_set, src)];
- }
-
- struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
- if (src->view_src != NULL) {
- dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
- dst->view_offs = src->view_offs;
- }
- dst->op = src->op;
- memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
- ggml_set_name(dst, src->name);
-
- // copy src
- for (int i = 0; i < GGML_MAX_SRC; i++) {
- struct ggml_tensor * s = src->src[i];
- if (s == NULL) {
- continue;
- }
- dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
- }
-
- node_copies[id] = dst;
- return dst;
-}
-
-static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
- size_t id = ggml_hash_find(hash_set, src);
- if (node_init[id]) {
- return;
- }
- node_init[id] = true;
-
- struct ggml_tensor * dst = node_copies[id];
- if (dst->view_src != NULL) {
- graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
- ggml_backend_view_init(dst->view_src->buffer, dst);
- }
- else {
- ggml_backend_tensor_copy(src, dst);
- }
-
- // init src
- for (int i = 0; i < GGML_MAX_SRC; i++) {
- struct ggml_tensor * s = src->src[i];
- if (s == NULL) {
- continue;
- }
- graph_copy_init_tensor(hash_set, node_copies, node_init, s);
- }
-}
-
-struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
- struct ggml_hash_set hash_set = {
- /* .size = */ graph->visited_hash_table.size,
- /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
- };
- struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
- bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
-
- struct ggml_init_params params = {
- /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
- /* .mem_buffer = */ NULL,
- /* .no_alloc = */ true
- };
-
- struct ggml_context * ctx_allocated = ggml_init(params);
- struct ggml_context * ctx_unallocated = ggml_init(params);
-
- if (ctx_allocated == NULL || ctx_unallocated == NULL) {
- fprintf(stderr, "failed to allocate context for graph copy\n");
- free(hash_set.keys);
- free(node_copies);
- free(node_init);
- ggml_free(ctx_allocated);
- ggml_free(ctx_unallocated);
- return (struct ggml_backend_graph_copy) {
- /* .buffer = */ NULL,
- /* .ctx_allocated = */ NULL,
- /* .ctx_unallocated = */ NULL,
- /* .graph = */ NULL,
- };
- }
-
- // dup nodes
- for (int i = 0; i < graph->n_nodes; i++) {
- struct ggml_tensor * node = graph->nodes[i];
- graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
- }
-
- // allocate nodes
- ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
- if (buffer == NULL) {
- fprintf(stderr, "failed to allocate buffer for graph copy\n");
- free(hash_set.keys);
- free(node_copies);
- free(node_init);
- ggml_free(ctx_allocated);
- ggml_free(ctx_unallocated);
- return (struct ggml_backend_graph_copy) {
- /* .buffer = */ NULL,
- /* .ctx_allocated = */ NULL,
- /* .ctx_unallocated = */ NULL,
- /* .graph = */ NULL,
- };
- }
-
- //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
-
- // copy data and init views
- for (int i = 0; i < graph->n_nodes; i++) {
- struct ggml_tensor * node = graph->nodes[i];
- graph_copy_init_tensor(hash_set, node_copies, node_init, node);
- }
-
- // build graph copy
- struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
- for (int i = 0; i < graph->n_nodes; i++) {
- struct ggml_tensor * node = graph->nodes[i];
- struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)];
- graph_copy->nodes[i] = node_copy;
- }
- graph_copy->n_nodes = graph->n_nodes;
-
- free(hash_set.keys);
- free(node_copies);
- free(node_init);
-
- return (struct ggml_backend_graph_copy) {
- /* .buffer = */ buffer,
- /* .ctx_allocated = */ ctx_allocated,
- /* .ctx_unallocated = */ ctx_unallocated,
- /* .graph = */ graph_copy,
- };
-}
-
-void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) {
- ggml_backend_buffer_free(copy.buffer);
- ggml_free(copy.ctx_allocated);
- ggml_free(copy.ctx_unallocated);
-}
-
-bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
- struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
- if (copy.buffer == NULL) {
- return false;
- }
-
- struct ggml_cgraph * g1 = graph;
- struct ggml_cgraph * g2 = copy.graph;
-
- assert(g1->n_nodes == g2->n_nodes);
-
- for (int i = 0; i < g1->n_nodes; i++) {
- //printf("eval %d/%d\n", i, g1->n_nodes);
- struct ggml_tensor * t1 = g1->nodes[i];
- struct ggml_tensor * t2 = g2->nodes[i];
-
- assert(t1->op == t2->op && ggml_are_same_layout(t1, t2));
-
- struct ggml_cgraph g1v = ggml_graph_view(g1, i, i + 1);
- struct ggml_cgraph g2v = ggml_graph_view(g2, i, i + 1);
-
- ggml_backend_graph_compute(backend1, &g1v);
- ggml_backend_graph_compute(backend2, &g2v);
-
- if (ggml_is_view_op(t1->op)) {
- continue;
- }
-
- // compare results, calculate rms etc
- if (!callback(i, t1, t2, user_data)) {
- break;
- }
- }
-
- ggml_backend_graph_copy_free(copy);
-
- return true;
-}
diff --git a/ggml-backend.h b/ggml-backend.h
deleted file mode 100644
index 744b6a77457..00000000000
--- a/ggml-backend.h
+++ /dev/null
@@ -1,233 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-alloc.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
- typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
- typedef struct ggml_backend_event * ggml_backend_event_t;
- typedef struct ggml_backend * ggml_backend_t;
- typedef void * ggml_backend_graph_plan_t;
-
- //
- // Backend buffer
- //
-
- // buffer type
- GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
- GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
- GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
- GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
- GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
- GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
- GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
-
- // buffer
- enum ggml_backend_buffer_usage {
- GGML_BACKEND_BUFFER_USAGE_ANY = 0,
- GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
- };
-
- GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
- GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
- GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
- GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
- GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
- GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
- GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
- GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
- GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
- GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
- GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
- GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
- GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
-
- //
- // Backend
- //
-
- GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
- GGML_API const char * ggml_backend_name(ggml_backend_t backend);
- GGML_API void ggml_backend_free(ggml_backend_t backend);
-
- GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
- GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
- GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
- GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
-
- GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-
- GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-
- GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
-
- GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
- GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
- GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
- GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
- GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
- GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
- GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
-
- // tensor copy between different backends
- GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
-
- // asynchronous copy
- // the copy is performed after all the currently queued operations in backend_src
- // backend_dst will wait for the copy to complete before performing other operations
- // automatic fallback to sync copy if async is not supported
- GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
-
- // events
- GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
- GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
- GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
- GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
- GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
-
- //
- // CPU backend
- //
-
- GGML_API ggml_backend_t ggml_backend_cpu_init(void);
-
- GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
- GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
- GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
-
- // Create a backend buffer from an existing pointer
- GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
-
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
-
-#ifdef GGML_USE_CPU_HBM
- GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
-#endif
-
- //
- // Backend registry
- //
-
- // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
-
- GGML_API size_t ggml_backend_reg_get_count(void);
- GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
- GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
- GGML_API const char * ggml_backend_reg_get_name(size_t i);
- GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
- GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
- GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
-
- //
- // Backend scheduler
- //
-
- // The backend scheduler allows for multiple backends to be used together
- // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
- // The backends are selected based on:
- // - the backend that supports the operation
- // - the location of the pre-allocated tensors (e.g. the weights)
- /*
- Example usage:
-
- // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
- // preferrably to run on the same backend as the buffer
- ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-
- sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
-
- // initialize buffers from a max size graph (optional)
- reserve_graph = build_graph(sched, max_batch_size);
-
- // manually assign nodes to a backend (optional, should not be needed in most cases)
- struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
- ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
-
- ggml_backend_sched_reserve(sched, reserve_graph);
-
- // compute
- graph = build_graph(sched);
- ggml_backend_sched_graph_compute(sched, graph);
-
- // if there are graph inputs:
- ggml_backend_sched_reset(sched);
- ggml_backend_sched_alloc_graph(sched, graph);
- ggml_backend_tensor_set(input_tensor, ...);
- ggml_backend_sched_graph_compute(sched, graph);
- }
- */
-
- struct ggml_backend_sched;
- typedef struct ggml_backend_sched * ggml_backend_sched_t;
-
- // when ask == true, the scheduler wants to know if the user wants to observe this node
- // this allows the scheduler to batch nodes together in order to evaluate them in a single call
- //
- // when ask == false, the scheduler is passing the node tensor to the user for observation
- // if the user returns false, the scheduler will cancel the graph compute
- //
- typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
-
- // Initialize a backend scheduler
- GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
- GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
-
- // Initialize backend buffers from a measure graph
- GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
-
- // Get the number of splits of the last graph
- GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
- GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
-
- GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
-
- GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
- GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
-
- // Allocate and compute graph on the backend scheduler
- GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
- GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
- GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
- GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
-
- // Reset all assignments and allocators - must be called before changing the node backends
- GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
-
- // Set a callback to be called for each resulting node during graph compute
- GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
-
- //
- // Utils
- //
-
- struct ggml_backend_graph_copy {
- ggml_backend_buffer_t buffer;
- struct ggml_context * ctx_allocated;
- struct ggml_context * ctx_unallocated;
- struct ggml_cgraph * graph;
- };
-
- // Copy a graph to a different backend
- GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
- GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
-
- typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
-
- // Compare the output of two backends
- GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
-
- // Tensor initialization
- GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
- GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/ggml-common.h b/ggml-common.h
deleted file mode 100644
index 43c7978a098..00000000000
--- a/ggml-common.h
+++ /dev/null
@@ -1,1853 +0,0 @@
-#ifndef GGML_COMMON_DECL
-
-#if defined(GGML_COMMON_DECL_C)
-#include
-
-typedef uint16_t ggml_half;
-typedef uint32_t ggml_half2;
-
-#define GGML_COMMON_AGGR
-
-#define GGML_COMMON_DECL
-#elif defined(GGML_COMMON_DECL_METAL)
-#include
-
-typedef half ggml_half;
-typedef half2 ggml_half2;
-
-#define GGML_COMMON_AGGR
-
-#define GGML_COMMON_DECL
-#elif defined(GGML_COMMON_DECL_CUDA)
-#include
-#include
-
-typedef half ggml_half;
-typedef half2 ggml_half2;
-
-#define GGML_COMMON_AGGR data
-
-#define GGML_COMMON_DECL
-#elif defined(GGML_COMMON_DECL_HIP)
-#include
-#include
-
-typedef half ggml_half;
-typedef half2 ggml_half2;
-
-#define GGML_COMMON_AGGR data
-
-#define GGML_COMMON_DECL
-#elif defined(GGML_COMMON_DECL_SYCL)
-#include
-#include
-
-typedef sycl::half ggml_half;
-typedef sycl::half2 ggml_half2;
-
-#define GGML_COMMON_AGGR data
-
-#define GGML_COMMON_DECL
-#endif
-
-#if defined(GGML_COMMON_DECL)
-
-#ifndef __cplusplus
-#ifndef static_assert
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#else
-#define static_assert(cond, msg) struct global_scope_noop_trick
-#endif
-#endif
-#endif // __cplusplus
-
-// QK = number of values after dequantization
-// QK_K = super-block size
-
-#ifdef GGML_QKK_64
-#define QK_K 64
-#define K_SCALE_SIZE 4
-#else
-#define QK_K 256
-#define K_SCALE_SIZE 12
-#endif // GGML_QKK_64
-
-#if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
-// QR = QK / number of values before dequantization
-// QI = number of 32 bit integers before dequantization
-
-#define QI4_0 (QK4_0 / (4 * QR4_0))
-#define QR4_0 2
-
-#define QI4_1 (QK4_1 / (4 * QR4_1))
-#define QR4_1 2
-
-#define QI5_0 (QK5_0 / (4 * QR5_0))
-#define QR5_0 2
-
-#define QI5_1 (QK5_1 / (4 * QR5_1))
-#define QR5_1 2
-
-#define QI8_0 (QK8_0 / (4 * QR8_0))
-#define QR8_0 1
-
-#define QI8_1 (QK8_1 / (4 * QR8_1))
-#define QR8_1 1
-
-#define QI2_K (QK_K / (4*QR2_K))
-#define QR2_K 4
-
-#define QI3_K (QK_K / (4*QR3_K))
-#define QR3_K 4
-
-#define QI4_K (QK_K / (4*QR4_K))
-#define QR4_K 2
-
-#define QI5_K (QK_K / (4*QR5_K))
-#define QR5_K 2
-
-#define QI6_K (QK_K / (4*QR6_K))
-#define QR6_K 2
-
-#define QI2_XXS (QK_K / (4*QR2_XXS))
-#define QR2_XXS 8
-
-#define QI2_XS (QK_K / (4*QR2_XS))
-#define QR2_XS 8
-
-#define QI2_S (QK_K / (4*QR2_S))
-#define QR2_S 8
-
-#define QI3_XXS (QK_K / (4*QR3_XXS))
-#define QR3_XXS 8
-
-#define QI3_XS (QK_K / (4*QR3_XS))
-#define QR3_XS 8
-
-#define QI1_S (QK_K / (4*QR1_S))
-#define QR1_S 8
-
-#define QI4_NL (QK4_NL / (4*QR4_NL))
-#define QR4_NL 2
-
-#if QK_K == 64
-#define QI4_XS QI4_NL
-#define QR4_XS QR4_NL
-#else
-#define QI4_XS (QK_K / (4*QR4_XS))
-#define QR4_XS 8
-#endif
-
-#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
-
-#define QK4_0 32
-typedef struct {
- ggml_half d; // delta
- uint8_t qs[QK4_0 / 2]; // nibbles / quants
-} block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(ggml_half) + QK4_0 / 2, "wrong q4_0 block size/padding");
-
-#define QK4_1 32
-typedef struct {
- union {
- struct {
- ggml_half d; // delta
- ggml_half m; // min
- } GGML_COMMON_AGGR;
- ggml_half2 dm;
- };
- uint8_t qs[QK4_1 / 2]; // nibbles / quants
-} block_q4_1;
-static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
-
-#define QK5_0 32
-typedef struct {
- ggml_half d; // delta
- uint8_t qh[4]; // 5-th bit of quants
- uint8_t qs[QK5_0 / 2]; // nibbles / quants
-} block_q5_0;
-static_assert(sizeof(block_q5_0) == sizeof(ggml_half) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
-
-#define QK5_1 32
-typedef struct {
- union {
- struct {
- ggml_half d; // delta
- ggml_half m; // min
- } GGML_COMMON_AGGR;
- ggml_half2 dm;
- };
- uint8_t qh[4]; // 5-th bit of quants
- uint8_t qs[QK5_1 / 2]; // nibbles / quants
-} block_q5_1;
-static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_half) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
-
-#define QK8_0 32
-typedef struct {
- ggml_half d; // delta
- int8_t qs[QK8_0]; // quants
-} block_q8_0;
-static_assert(sizeof(block_q8_0) == sizeof(ggml_half) + QK8_0, "wrong q8_0 block size/padding");
-
-#define QK8_1 32
-typedef struct {
- union {
- struct {
- ggml_half d; // delta
- ggml_half s; // d * sum(qs[i])
- } GGML_COMMON_AGGR;
- ggml_half2 ds;
- };
- int8_t qs[QK8_1]; // quants
-} block_q8_1;
-static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
-
-//
-// Super-block quantization structures
-//
-
-// 2-bit quantization
-// weight is represented as x = a * q + b
-// 16 blocks of 16 elements each
-// Effectively 2.625 bits per weight
-typedef struct {
- uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
- uint8_t qs[QK_K/4]; // quants
- union {
- struct {
- ggml_half d; // super-block scale for quantized scales
- ggml_half dmin; // super-block scale for quantized mins
- } GGML_COMMON_AGGR;
- ggml_half2 dm;
- };
-} block_q2_K;
-static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
-
-// 3-bit quantization
-// weight is represented as x = a * q
-// 16 blocks of 16 elements each
-// Effectively 3.4375 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
- uint8_t hmask[QK_K/8]; // quants - high bit
- uint8_t qs[QK_K/4]; // quants - low 2 bits
- uint8_t scales[2];
- ggml_half d; // super-block scale
-} block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
-#else
-typedef struct {
- uint8_t hmask[QK_K/8]; // quants - high bit
- uint8_t qs[QK_K/4]; // quants - low 2 bits
- uint8_t scales[12]; // scales, quantized with 6 bits
- ggml_half d; // super-block scale
-} block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
-#endif
-
-// 4-bit quantization
-// 8 blocks of 32 elements each
-// weight is represented as x = a * q + b
-// Effectively 4.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
- ggml_half d[2]; // super-block scales/mins
- uint8_t scales[2]; // 4-bit block scales/mins
- uint8_t qs[QK_K/2]; // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
-#else
-typedef struct {
- union {
- struct {
- ggml_half d; // super-block scale for quantized scales
- ggml_half dmin; // super-block scale for quantized mins
- } GGML_COMMON_AGGR;
- ggml_half2 dm;
- };
- uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
- uint8_t qs[QK_K/2]; // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
-#endif
-
-// 5-bit quantization
-// 8 blocks of 32 elements each
-// weight is represented as x = a * q + b
-// Effectively 5.5 bits per weight
-#ifdef GGML_QKK_64
-typedef struct {
- ggml_half d; // super-block scale
- int8_t scales[QK_K/16]; // 8-bit block scales
- uint8_t qh[QK_K/8]; // quants, high bit
- uint8_t qs[QK_K/2]; // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
-#else
-typedef struct {
- union {
- struct {
- ggml_half d; // super-block scale for quantized scales
- ggml_half dmin; // super-block scale for quantized mins
- } GGML_COMMON_AGGR;
- ggml_half2 dm;
- };
- uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
- uint8_t qh[QK_K/8]; // quants, high bit
- uint8_t qs[QK_K/2]; // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
-#endif
-
-// 6-bit quantization
-// weight is represented as x = a * q
-// 16 blocks of 16 elements each
-// Effectively 6.5625 bits per weight
-typedef struct {
- uint8_t ql[QK_K/2]; // quants, lower 4 bits
- uint8_t qh[QK_K/4]; // quants, upper 2 bits
- int8_t scales[QK_K/16]; // scales, quantized with 8 bits
- ggml_half d; // super-block scale
-} block_q6_K;
-static_assert(sizeof(block_q6_K) == sizeof(ggml_half) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
-
-// This is only used for intermediate quantization and dot products
-typedef struct {
- float d; // delta
- int8_t qs[QK_K]; // quants
- int16_t bsums[QK_K/16]; // sum of quants in groups of 16
-} block_q8_K;
-static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
-
-// (Almost) "true" 2-bit quantization.
-// Due to the need to use blocks as per ggml design, it ends up using
-// 2.0625 bpw because of the 16-bit scale for each block of 256.
-typedef struct {
- ggml_half d;
- uint16_t qs[QK_K/8];
-} block_iq2_xxs;
-static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_half) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
-
-// 2.3125 bpw quants
-typedef struct {
- ggml_half d;
- uint16_t qs[QK_K/8];
- uint8_t scales[QK_K/32];
-} block_iq2_xs;
-static_assert(sizeof(block_iq2_xs) == sizeof(ggml_half) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
-
-// 2.5625 bpw quants
-typedef struct {
- ggml_half d;
- uint8_t qs[QK_K/4];
- uint8_t qh[QK_K/32];
- uint8_t scales[QK_K/32];
-} block_iq2_s;
-static_assert(sizeof(block_iq2_s) == sizeof(ggml_half) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
-
-// (Almost) "true" 3-bit quantization.
-// Due to the need to use blocks as per ggml design, it ends up using
-// 3.0625 bpw because of the 16-bit scale for each block of 256.
-typedef struct {
- ggml_half d;
- uint8_t qs[3*QK_K/8];
-} block_iq3_xxs;
-static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
-
-// 3.4375 bpw
-#if QK_K == 64
-#define IQ3S_N_SCALE 2
-#else
-#define IQ3S_N_SCALE QK_K/64
-#endif
-typedef struct {
- ggml_half d;
- uint8_t qs[QK_K/4];
- uint8_t qh[QK_K/32];
- uint8_t signs[QK_K/8];
- uint8_t scales[IQ3S_N_SCALE];
-} block_iq3_s;
-static_assert(sizeof(block_iq3_s) == sizeof(ggml_half) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
-
-typedef struct {
- ggml_half d;
- uint8_t qs[QK_K/8];
- uint16_t qh[QK_K/32];
-} block_iq1_s;
-static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
-
-// 1.75 bpw
-typedef struct {
- uint8_t qs[QK_K/8]; // grid index, low 8 bits
- uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
-#if QK_K == 64
- ggml_half d;
-#endif
- uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64)
-} block_iq1_m;
-#if QK_K == 64
-static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
-#else
-static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
-#endif
-
-// Used by IQ1_M quants
-typedef union {
- ggml_half f16;
- uint16_t u16;
-} iq1m_scale_t;
-
-// Non-linear quants
-#define QK4_NL 32
-typedef struct {
- ggml_half d;
- uint8_t qs[QK4_NL/2];
-} block_iq4_nl;
-static_assert(sizeof(block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
-
-#if QK_K == 64
-#define block_iq4_xs block_iq4_nl
-#else
-typedef struct {
- ggml_half d;
- uint16_t scales_h;
- uint8_t scales_l[QK_K/64];
- uint8_t qs[QK_K/2];
-} block_iq4_xs;
-static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
-#endif
-
-#endif // GGML_COMMON_DECL
-#endif // GGML_COMMON_DECL
-
-////////////////////////////////////////////////////////////////////////////////
-
-#ifndef GGML_COMMON_IMPL
-
-#if defined(GGML_COMMON_IMPL_C)
-#include
-
-#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
-#define GGML_TABLE_END() };
-
-#define GGML_COMMON_IMPL
-#elif defined(GGML_COMMON_IMPL_METAL)
-#include
-
-#define GGML_TABLE_BEGIN(type, name, size) static const constant type name[size] = {
-#define GGML_TABLE_END() };
-
-#define GGML_COMMON_IMPL
-#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP)
-#include
-
-#define GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = {
-#define GGML_TABLE_END() };
-
-#define GGML_COMMON_IMPL
-#elif defined(GGML_COMMON_IMPL_SYCL)
-
-#include
-
-#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
-#define GGML_TABLE_END() };
-
-#define GGML_COMMON_IMPL
-#endif
-
-#if defined(GGML_COMMON_IMPL)
-
-GGML_TABLE_BEGIN(uint8_t, kmask_iq2xs, 8)
- 1, 2, 4, 8, 16, 32, 64, 128
-GGML_TABLE_END()
-
-GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128)
- 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
- 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
- 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175,
- 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63,
- 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207,
- 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95,
- 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
- 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
-GGML_TABLE_END()
-
-//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-GGML_TABLE_BEGIN(uint64_t, ksigns64, 128)
- 0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff,
- 0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff,
- 0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff,
- 0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff,
- 0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff,
- 0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff,
- 0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff,
- 0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff,
- 0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff,
- 0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff,
- 0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff,
- 0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff,
- 0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff,
- 0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff,
- 0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff,
- 0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff,
- 0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff,
- 0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff,
- 0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff,
- 0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff,
- 0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff,
- 0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff,
- 0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff,
- 0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff,
- 0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff,
- 0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff,
- 0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff,
- 0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff,
- 0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff,
- 0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff,
- 0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff,
- 0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff,
-GGML_TABLE_END()
-//#endif
-
-
-GGML_TABLE_BEGIN(uint64_t, iq2xxs_grid, 256)
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
- 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
- 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
- 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
- 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
- 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
- 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
- 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
- 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
- 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
- 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
- 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
- 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
- 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
- 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
- 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
- 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
- 0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
- 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
- 0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
- 0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
- 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
- 0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
- 0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
- 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
- 0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
- 0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
- 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
- 0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
- 0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
- 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
- 0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
- 0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
- 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
- 0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
- 0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
- 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
- 0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
- 0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
- 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
- 0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
- 0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
- 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
- 0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
- 0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
- 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
- 0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
- 0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
- 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
- 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
- 0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
- 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
- 0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
- 0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
- 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
- 0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
- 0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
- 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
- 0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
- 0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
- 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
- 0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
- 0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
- 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
-GGML_TABLE_END()
-
-GGML_TABLE_BEGIN(uint64_t, iq2xs_grid, 512)
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
- 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
- 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
- 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
- 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
- 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
- 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
- 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
- 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
- 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
- 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
- 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
- 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
- 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
- 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
- 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
- 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
- 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
- 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
- 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
- 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
- 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
- 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
- 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
- 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
- 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
- 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
- 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
- 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
- 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
- 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
- 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
- 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
- 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
- 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
- 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
- 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
- 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
- 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
- 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
- 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
- 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
- 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
- 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
- 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
- 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
- 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
- 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
- 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
- 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
- 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
- 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
- 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
- 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
- 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
- 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
- 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
- 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
- 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
- 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
- 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
- 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
- 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
- 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
- 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
- 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
- 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
- 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
- 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
- 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
- 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
- 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
- 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
- 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
- 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
- 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
- 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
- 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
- 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
- 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
- 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
- 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
- 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
- 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
- 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
- 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
- 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
- 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
- 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
- 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
- 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
- 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
- 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
- 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
- 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
- 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
- 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
- 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
- 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
- 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
- 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
- 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
- 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
- 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
- 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
- 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
- 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
- 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
- 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
- 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
- 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
- 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
- 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
- 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
- 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
- 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
- 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
- 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
- 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
- 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
- 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
- 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
- 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
- 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
- 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
- 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
- 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
- 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
-GGML_TABLE_END()
-
-GGML_TABLE_BEGIN(uint64_t, iq2s_grid, 1024)
- 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
- 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
- 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
- 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
- 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
- 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b,
- 0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919,
- 0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808,
- 0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908,
- 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b,
- 0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908,
- 0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08,
- 0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19,
- 0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819,
- 0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919,
- 0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b,
- 0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908,
- 0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908,
- 0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919,
- 0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908,
- 0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b,
- 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919,
- 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b,
- 0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919,
- 0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908,
- 0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b,
- 0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b,
- 0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08,
- 0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b,
- 0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819,
- 0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808,
- 0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908,
- 0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b,
- 0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908,
- 0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08,
- 0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819,
- 0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808,
- 0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08,
- 0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819,
- 0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b,
- 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908,
- 0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919,
- 0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b,
- 0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919,
- 0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808,
- 0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819,
- 0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919,
- 0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919,
- 0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808,
- 0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819,
- 0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b,
- 0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908,
- 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b,
- 0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908,
- 0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919,
- 0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b,
- 0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919,
- 0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b,
- 0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819,
- 0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919,
- 0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908,
- 0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b,
- 0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908,
- 0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b,
- 0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908,
- 0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08,
- 0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908,
- 0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819,
- 0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819,
- 0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808,
- 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08,
- 0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19,
- 0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819,
- 0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808,
- 0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819,
- 0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919,
- 0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808,
- 0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19,
- 0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08,
- 0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b,
- 0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908,
- 0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808,
- 0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819,
- 0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908,
- 0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819,
- 0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808,
- 0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808,
- 0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819,
- 0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908,
- 0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08,
- 0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819,
- 0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b,
- 0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b,
- 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08,
- 0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19,
- 0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819,
- 0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919,
- 0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908,
- 0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808,
- 0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808,
- 0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908,
- 0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808,
- 0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08,
- 0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08,
- 0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908,
- 0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919,
- 0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808,
- 0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819,
- 0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908,
- 0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08,
- 0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819,
- 0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808,
- 0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808,
- 0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819,
- 0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808,
- 0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908,
- 0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b,
- 0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819,
- 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808,
- 0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b,
- 0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808,
- 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b,
- 0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19,
- 0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819,
- 0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08,
- 0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b,
- 0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908,
- 0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b,
- 0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b,
- 0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919,
- 0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808,
- 0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819,
- 0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908,
- 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08,
- 0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08,
- 0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819,
- 0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919,
- 0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908,
- 0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b,
- 0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908,
- 0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b,
- 0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908,
- 0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08,
- 0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819,
- 0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808,
- 0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819,
- 0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919,
- 0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808,
- 0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808,
- 0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08,
- 0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819,
- 0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919,
- 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808,
- 0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819,
- 0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919,
- 0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808,
- 0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b,
- 0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908,
- 0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808,
- 0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908,
- 0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b,
- 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908,
- 0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b,
- 0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908,
- 0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b,
- 0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908,
- 0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08,
- 0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908,
- 0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b,
- 0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908,
- 0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08,
- 0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819,
- 0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919,
- 0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808,
- 0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19,
- 0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b,
- 0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919,
- 0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808,
- 0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819,
- 0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908,
- 0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919,
- 0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808,
- 0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808,
- 0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b,
- 0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919,
- 0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808,
- 0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b,
- 0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808,
- 0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919,
- 0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b,
- 0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08,
- 0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919,
- 0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808,
- 0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b,
- 0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908,
- 0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808,
- 0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808,
- 0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808,
- 0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908,
- 0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808,
- 0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808,
- 0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b,
- 0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908,
- 0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808,
- 0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808,
- 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819,
- 0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919,
- 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b,
- 0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808,
- 0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819,
- 0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b,
- 0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908,
- 0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08,
- 0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908,
- 0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919,
- 0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819,
- 0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908,
- 0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b,
- 0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808,
- 0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819,
- 0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908,
- 0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919,
- 0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808,
- 0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808,
- 0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808,
- 0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919,
- 0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908,
- 0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908,
- 0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08,
- 0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819,
- 0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b,
- 0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808,
- 0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819,
- 0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908,
- 0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819,
- 0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808,
- 0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808,
- 0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b,
- 0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908,
- 0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808,
- 0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908,
- 0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819,
- 0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819,
- 0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808,
- 0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b,
- 0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b,
- 0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819,
- 0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b,
- 0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b,
- 0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b,
- 0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819,
- 0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19,
- 0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819,
- 0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908,
- 0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808,
- 0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b,
-GGML_TABLE_END()
-
-GGML_TABLE_BEGIN(uint32_t, iq3xxs_grid, 256)
- 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
- 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
- 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
- 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
- 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
- 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
- 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
- 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
- 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
- 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
- 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
- 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
- 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
- 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
- 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
- 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
- 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
- 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
- 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
- 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
- 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
- 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
- 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
- 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
- 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
- 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
- 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
- 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
- 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
- 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
- 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
- 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
-GGML_TABLE_END()
-
-GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
- 0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
- 0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
- 0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
- 0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
- 0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
- 0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
- 0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
- 0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
- 0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
- 0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
- 0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
- 0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
- 0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
- 0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
- 0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
- 0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
- 0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
- 0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
- 0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
- 0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
- 0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
- 0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
- 0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
- 0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
- 0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
- 0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
- 0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
- 0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
- 0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
- 0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
- 0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
- 0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
- 0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
- 0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
- 0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
- 0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
- 0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
- 0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
- 0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
- 0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
- 0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
- 0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
- 0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
- 0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
- 0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
- 0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
- 0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
- 0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
- 0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
- 0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
- 0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
- 0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
- 0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
- 0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
- 0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
- 0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
- 0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
- 0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
- 0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
- 0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
- 0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
- 0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
- 0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
- 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
-GGML_TABLE_END()
-
-#define NGRID_IQ1S 2048
-#define IQ1S_DELTA 0.125f
-#define IQ1M_DELTA 0.125f
-#if defined(GGML_COMMON_IMPL_C)
-GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S)
- 0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff,
- 0xffffffffffff0101, 0xffffffffff00ff00, 0xffffffffff000000, 0xffffffffff01ffff,
- 0xffffffffff01ff01, 0xffffffffff0101ff, 0xffffffffff010101, 0xffffffff00ff0000,
- 0xffffffff0000ff00, 0xffffffff000000ff, 0xffffffff00000001, 0xffffffff00010000,
- 0xffffffff01ffffff, 0xffffffff01ffff01, 0xffffffff01ff01ff, 0xffffffff01ff0101,
- 0xffffffff01000000, 0xffffffff0101ffff, 0xffffffff0101ff01, 0xffffffff010101ff,
- 0xffffffff01010101, 0xffffff00ffff00ff, 0xffffff00ffff0000, 0xffffff00ff00ff00,
- 0xffffff00ff0000ff, 0xffffff00ff000001, 0xffffff00ff000100, 0xffffff00ff000101,
- 0xffffff00ff010000, 0xffffff0000ffff00, 0xffffff0000ff0001, 0xffffff0000ff0100,
- 0xffffff000000ff01, 0xffffff0000000000, 0xffffff0000000101, 0xffffff000001ff00,
- 0xffffff00000100ff, 0xffffff0000010001, 0xffffff00000101ff, 0xffffff0001ff0000,
- 0xffffff000100ff00, 0xffffff00010000ff, 0xffffff0001000001, 0xffffff0001010000,
- 0xffffff01ffffffff, 0xffffff01ffffff01, 0xffffff01ffff01ff, 0xffffff01ffff0101,
- 0xffffff01ff000000, 0xffffff01ff01ffff, 0xffffff01ff01ff01, 0xffffff01ff0101ff,
- 0xffffff01ff010101, 0xffffff0100ff0000, 0xffffff010000ff00, 0xffffff0100000100,
- 0xffffff01000100ff, 0xffffff0100010100, 0xffffff0101ffffff, 0xffffff0101ffff01,
- 0xffffff0101ff01ff, 0xffffff0101ff0101, 0xffffff010100ff00, 0xffffff0101000000,
- 0xffffff0101000100, 0xffffff010101ffff, 0xffffff010101ff01, 0xffffff01010101ff,
- 0xffffff0101010101, 0xffff00ffff00ff00, 0xffff00ffff0000ff, 0xffff00ffff000001,
- 0xffff00ffff010000, 0xffff00ff00ffff00, 0xffff00ff00ff0100, 0xffff00ff00000000,
- 0xffff00ff00000101, 0xffff00ff000100ff, 0xffff00ff00010000, 0xffff00ff0100ff00,
- 0xffff00ff01000100, 0xffff00ff01010000, 0xffff0000ffffff00, 0xffff0000ffff00ff,
- 0xffff0000ffff0000, 0xffff0000ffff0001, 0xffff0000ff000000, 0xffff0000ff0001ff,
- 0xffff0000ff000101, 0xffff0000ff010100, 0xffff000000ffffff, 0xffff000000ff0000,
- 0xffff000000ff0101, 0xffff00000000ffff, 0xffff00000000ff00, 0xffff0000000000ff,
- 0xffff000000000000, 0xffff000000000001, 0xffff000000000100, 0xffff00000001ffff,
- 0xffff00000001ff01, 0xffff000000010000, 0xffff0000000101ff, 0xffff000000010101,
- 0xffff000001ffff00, 0xffff00000100ff00, 0xffff000001000000, 0xffff0000010001ff,
- 0xffff000001000101, 0xffff00000101ff00, 0xffff0000010100ff, 0xffff000001010000,
- 0xffff000001010001, 0xffff000001010100, 0xffff0001ff0000ff, 0xffff0001ff000100,
- 0xffff000100ffff00, 0xffff000100ff00ff, 0xffff00010000ffff, 0xffff00010000ff01,
- 0xffff000100000000, 0xffff0001000001ff, 0xffff00010001ffff, 0xffff00010001ff00,
- 0xffff000100010001, 0xffff000100010100, 0xffff000101ff0000, 0xffff00010100ff00,
- 0xffff0001010000ff, 0xffff000101000100, 0xffff01ffffffffff, 0xffff01ffffffff01,
- 0xffff01ffffff01ff, 0xffff01ffffff0101, 0xffff01ffff000000, 0xffff01ffff01ffff,
- 0xffff01ffff01ff01, 0xffff01ffff0101ff, 0xffff01ffff010101, 0xffff01ff00ff0000,
- 0xffff01ff0000ff00, 0xffff01ff00000001, 0xffff01ff00010000, 0xffff01ff01ffffff,
- 0xffff01ff01ffff01, 0xffff01ff01ff01ff, 0xffff01ff01ff0101, 0xffff01ff01000000,
- 0xffff01ff0101ffff, 0xffff01ff0101ff01, 0xffff01ff010101ff, 0xffff01ff01010101,
- 0xffff0100ffff0000, 0xffff0100ff00ff00, 0xffff0100ff0000ff, 0xffff0100ff000100,
- 0xffff0100ff0100ff, 0xffff0100ff010000, 0xffff010000ffff00, 0xffff01000000ffff,
- 0xffff01000000ff00, 0xffff010000000000, 0xffff01000001ff00, 0xffff0100000100ff,
- 0xffff010000010100, 0xffff01000100ff00, 0xffff0100010000ff, 0xffff010001000001,
- 0xffff010001000100, 0xffff010001010000, 0xffff0101ffffffff, 0xffff0101ffffff01,
- 0xffff0101ffff01ff, 0xffff0101ffff0101, 0xffff0101ff000000, 0xffff0101ff01ffff,
- 0xffff0101ff01ff01, 0xffff0101ff0101ff, 0xffff0101ff010101, 0xffff010100ff0000,
- 0xffff01010000ff00, 0xffff010100000100, 0xffff01010001ff00, 0xffff010100010000,
- 0xffff010101ffffff, 0xffff010101ffff01, 0xffff010101ff0000, 0xffff010101ff01ff,
- 0xffff010101ff0101, 0xffff010101000000, 0xffff01010101ffff, 0xffff01010101ff01,
- 0xffff0101010101ff, 0xffff010101010101, 0xff00ffffff00ffff, 0xff00ffffff00ff00,
- 0xff00ffffff0000ff, 0xff00ffffff000100, 0xff00ffffff0100ff, 0xff00ffffff010000,
- 0xff00ffff00ffff00, 0xff00ffff00ff00ff, 0xff00ffff0000ffff, 0xff00ffff00000000,
- 0xff00ffff000001ff, 0xff00ffff0001ff00, 0xff00ffff000100ff, 0xff00ffff00010000,
- 0xff00ffff00010100, 0xff00ffff0100ff00, 0xff00ffff010000ff, 0xff00ffff01000001,
- 0xff00ffff0101ff00, 0xff00ffff01010000, 0xff00ff00ffffff00, 0xff00ff00ffff00ff,
- 0xff00ff00ffff0001, 0xff00ff00ffff0100, 0xff00ff00ff00ffff, 0xff00ff00ff00ff01,
- 0xff00ff00ff000000, 0xff00ff00ff0001ff, 0xff00ff00ff01ff00, 0xff00ff00ff0100ff,
- 0xff00ff00ff010100, 0xff00ff0000ff0000, 0xff00ff0000ff0101, 0xff00ff000000ffff,
- 0xff00ff000000ff00, 0xff00ff000000ff01, 0xff00ff00000000ff, 0xff00ff0000000000,
- 0xff00ff0000000001, 0xff00ff0000000100, 0xff00ff000001ffff, 0xff00ff0000010000,
- 0xff00ff0001ff00ff, 0xff00ff000100ff01, 0xff00ff0001000000, 0xff00ff000101ff00,
- 0xff00ff00010100ff, 0xff00ff01ff00ff00, 0xff00ff01ff0000ff, 0xff00ff01ff000001,
- 0xff00ff01ff010000, 0xff00ff0100ffffff, 0xff00ff0100ff0001, 0xff00ff0100ff0100,
- 0xff00ff010000ff01, 0xff00ff0100000000, 0xff00ff01000001ff, 0xff00ff0100000101,
- 0xff00ff01000100ff, 0xff00ff0100010001, 0xff00ff0101ff0000, 0xff00ff010100ff00,
- 0xff00ff01010000ff, 0xff00ff0101000001, 0xff00ff0101010000, 0xff0000ffffffff00,
- 0xff0000ffffff0001, 0xff0000ffffff0100, 0xff0000ffff0000ff, 0xff0000ffff000000,
- 0xff0000ffff0001ff, 0xff0000ffff000100, 0xff0000ffff01ff00, 0xff0000ffff010001,
- 0xff0000ff00ffff00, 0xff0000ff00ff0000, 0xff0000ff00ff0001, 0xff0000ff00ff01ff,
- 0xff0000ff00ff0101, 0xff0000ff0000ff00, 0xff0000ff000000ff, 0xff0000ff00000000,
- 0xff0000ff00000001, 0xff0000ff00000100, 0xff0000ff0001ff01, 0xff0000ff00010000,
- 0xff0000ff000101ff, 0xff0000ff01ff00ff, 0xff0000ff01ff0100, 0xff0000ff0100ffff,
- 0xff0000ff010000ff, 0xff0000ff01000000, 0xff0000ff010001ff, 0xff0000ff01000100,
- 0xff0000ff01000101, 0xff0000ff0101ff00, 0xff0000ff010100ff, 0xff0000ff01010000,
- 0xff0000ff01010100, 0xff000000ffffff01, 0xff000000ffff0000, 0xff000000ffff0101,
- 0xff000000ff00ff00, 0xff000000ff0000ff, 0xff000000ff000000, 0xff000000ff000001,
- 0xff000000ff000100, 0xff000000ff01ffff, 0xff000000ff01ff01, 0xff000000ff010000,
- 0xff000000ff0101ff, 0xff000000ff010101, 0xff00000000ffff00, 0xff00000000ff00ff,
- 0xff00000000ff0000, 0xff00000000ff0001, 0xff0000000000ff00, 0xff0000000000ff01,
- 0xff000000000000ff, 0xff00000000000000, 0xff00000000000001, 0xff00000000000100,
- 0xff00000000000101, 0xff0000000001ff00, 0xff000000000100ff, 0xff00000000010000,
- 0xff00000000010001, 0xff00000000010100, 0xff00000001ffffff, 0xff00000001ffff01,
- 0xff00000001ff00ff, 0xff00000001ff0000, 0xff00000001ff01ff, 0xff00000001ff0101,
- 0xff0000000100ffff, 0xff0000000100ff00, 0xff000000010000ff, 0xff00000001000000,
- 0xff00000001000001, 0xff00000001000100, 0xff00000001000101, 0xff0000000101ffff,
- 0xff0000000101ff01, 0xff00000001010000, 0xff000001ffffff00, 0xff000001ffff00ff,
- 0xff000001ffff0000, 0xff000001ffff0001, 0xff000001ff000000, 0xff000001ff000001,
- 0xff000001ff0001ff, 0xff000001ff000101, 0xff000001ff01ff00, 0xff000001ff010001,
- 0xff00000100ffffff, 0xff00000100ffff01, 0xff00000100ff00ff, 0xff00000100ff0000,
- 0xff00000100ff01ff, 0xff00000100ff0101, 0xff0000010000ff00, 0xff00000100000000,
- 0xff00000100000001, 0xff000001000001ff, 0xff00000100000100, 0xff0000010001ff00,
- 0xff000001000100ff, 0xff00000100010000, 0xff000001000101ff, 0xff00000100010100,
- 0xff00000100010101, 0xff00000101ff0001, 0xff00000101ff0101, 0xff0000010100ff01,
- 0xff00000101000000, 0xff000001010100ff, 0xff00000101010100, 0xff0001ffff00ff00,
- 0xff0001ffff000001, 0xff0001ffff010000, 0xff0001ff00ffff00, 0xff0001ff00ff00ff,
- 0xff0001ff00ff0001, 0xff0001ff00ff0100, 0xff0001ff0000ffff, 0xff0001ff00000000,
- 0xff0001ff000001ff, 0xff0001ff00000101, 0xff0001ff0001ffff, 0xff0001ff0001ff00,
- 0xff0001ff000100ff, 0xff0001ff00010001, 0xff0001ff00010100, 0xff0001ff01ff0000,
- 0xff0001ff0100ff00, 0xff0001ff010000ff, 0xff0001ff01010000, 0xff000100ff00ffff,
- 0xff000100ff00ff01, 0xff000100ff000000, 0xff000100ff000101, 0xff000100ff01ff00,
- 0xff000100ff010000, 0xff00010000ffff01, 0xff00010000ff00ff, 0xff00010000ff0000,
- 0xff00010000ff01ff, 0xff0001000000ff00, 0xff000100000000ff, 0xff00010000000000,
- 0xff00010000000001, 0xff00010000000100, 0xff00010000000101, 0xff0001000001ffff,
- 0xff00010000010000, 0xff00010000010101, 0xff00010001ff0100, 0xff0001000100ff00,
- 0xff0001000100ff01, 0xff00010001000000, 0xff000100010001ff, 0xff0001000101ff00,
- 0xff00010001010001, 0xff00010001010100, 0xff000101ffff0100, 0xff000101ff000001,
- 0xff000101ff0100ff, 0xff000101ff010001, 0xff00010100ff00ff, 0xff00010100ff0001,
- 0xff00010100ff0100, 0xff0001010000ffff, 0xff0001010000ff01, 0xff00010100000000,
- 0xff000101000001ff, 0xff0001010001ff00, 0xff00010100010001, 0xff00010100010100,
- 0xff00010101ff0000, 0xff0001010100ff00, 0xff00010101000001, 0xff00010101000101,
- 0xff01ffffffffffff, 0xff01ffffffffff01, 0xff01ffffffff01ff, 0xff01ffffffff0101,
- 0xff01ffffff000000, 0xff01ffffff01ffff, 0xff01ffffff01ff01, 0xff01ffffff010000,
- 0xff01ffffff0101ff, 0xff01ffffff010101, 0xff01ffff00ff0000, 0xff01ffff0000ff00,
- 0xff01ffff00000100, 0xff01ffff0001ff00, 0xff01ffff00010000, 0xff01ffff01ffffff,
- 0xff01ffff01ffff01, 0xff01ffff01ff01ff, 0xff01ffff01ff0101, 0xff01ffff01000000,
- 0xff01ffff0101ffff, 0xff01ffff0101ff01, 0xff01ffff01010000, 0xff01ffff010101ff,
- 0xff01ffff01010101, 0xff01ff00ffff0000, 0xff01ff00ff00ff00, 0xff01ff00ff0000ff,
- 0xff01ff00ff000100, 0xff01ff00ff010000, 0xff01ff0000ffff01, 0xff01ff0000ff00ff,
- 0xff01ff0000ff0100, 0xff01ff0000000000, 0xff01ff00000001ff, 0xff01ff0000000101,
- 0xff01ff000001ff00, 0xff01ff00000100ff, 0xff01ff0000010000, 0xff01ff0000010001,
- 0xff01ff0001ff0000, 0xff01ff000100ffff, 0xff01ff0001000001, 0xff01ff0001000100,
- 0xff01ff0001010000, 0xff01ff01ffffff00, 0xff01ff01ffff01ff, 0xff01ff01ffff0101,
- 0xff01ff01ff00ff00, 0xff01ff01ff000000, 0xff01ff01ff01ffff, 0xff01ff01ff01ff01,
- 0xff01ff01ff0101ff, 0xff01ff01ff010101, 0xff01ff0100ff0000, 0xff01ff010000ff00,
- 0xff01ff0100000001, 0xff01ff0100000100, 0xff01ff0100010000, 0xff01ff0101ffff00,
- 0xff01ff0101ff01ff, 0xff01ff0101ff0101, 0xff01ff010100ff00, 0xff01ff0101000000,
- 0xff01ff010101ffff, 0xff01ff010101ff01, 0xff01ff01010101ff, 0xff01ff0101010101,
- 0xff0100ffffff0000, 0xff0100ffff0000ff, 0xff0100ffff000001, 0xff0100ffff000100,
- 0xff0100ffff010000, 0xff0100ff00ff00ff, 0xff0100ff00ff0000, 0xff0100ff00ff0001,
- 0xff0100ff00ff0100, 0xff0100ff0000ff01, 0xff0100ff00000000, 0xff0100ff000001ff,
- 0xff0100ff00000101, 0xff0100ff00010001, 0xff0100ff01ff0000, 0xff0100ff0100ff00,
- 0xff0100ff010000ff, 0xff0100ff01000100, 0xff0100ff0101ff00, 0xff0100ff01010000,
- 0xff010000ffff0100, 0xff010000ff000000, 0xff010000ff01ff00, 0xff010000ff010100,
- 0xff01000000ffffff, 0xff01000000ff0000, 0xff01000000ff01ff, 0xff0100000000ff00,
- 0xff010000000000ff, 0xff01000000000000, 0xff01000000000100, 0xff0100000001ff01,
- 0xff01000000010000, 0xff010000000101ff, 0xff01000001ff0100, 0xff0100000100ffff,
- 0xff010000010000ff, 0xff01000001000000, 0xff010000010001ff, 0xff01000001000101,
- 0xff0100000101ff00, 0xff010000010100ff, 0xff01000001010001, 0xff01000001010100,
- 0xff010001ffff0000, 0xff010001ff00ffff, 0xff010001ff00ff01, 0xff010001ff000100,
- 0xff010001ff010000, 0xff01000100ffff00, 0xff01000100ff0100, 0xff01000100000000,
- 0xff0100010001ffff, 0xff0100010001ff00, 0xff01000100010100, 0xff01000101ff00ff,
- 0xff01000101ff0001, 0xff0100010100ffff, 0xff01000101000101, 0xff0101ffffffffff,
- 0xff0101ffffffff01, 0xff0101ffffff01ff, 0xff0101ffffff0101, 0xff0101ffff000000,
- 0xff0101ffff01ffff, 0xff0101ffff01ff01, 0xff0101ffff0101ff, 0xff0101ffff010101,
- 0xff0101ff00ff0000, 0xff0101ff0000ff00, 0xff0101ff000000ff, 0xff0101ff00010000,
- 0xff0101ff01ffffff, 0xff0101ff01ffff01, 0xff0101ff01ff01ff, 0xff0101ff01ff0101,
- 0xff0101ff0101ffff, 0xff0101ff0101ff01, 0xff0101ff010101ff, 0xff0101ff01010101,
- 0xff010100ffff0100, 0xff010100ff00ff00, 0xff010100ff0000ff, 0xff010100ff000100,
- 0xff010100ff010000, 0xff01010000ff0001, 0xff01010000ff0100, 0xff0101000000ff01,
- 0xff01010000000000, 0xff0101000001ff00, 0xff010100000100ff, 0xff01010000010001,
- 0xff01010000010100, 0xff01010001ff0000, 0xff0101000100ffff, 0xff01010001000001,
- 0xff01010001000100, 0xff010100010100ff, 0xff01010001010000, 0xff010101ffffffff,
- 0xff010101ffffff01, 0xff010101ffff01ff, 0xff010101ffff0101, 0xff010101ff01ffff,
- 0xff010101ff01ff01, 0xff010101ff0101ff, 0xff010101ff010101, 0xff01010100ff0000,
- 0xff0101010000ff00, 0xff01010100000001, 0xff01010100000100, 0xff01010100010000,
- 0xff01010101ffffff, 0xff01010101ffff01, 0xff01010101ff01ff, 0xff01010101ff0101,
- 0xff01010101000000, 0xff0101010101ffff, 0xff0101010101ff01, 0xff010101010101ff,
- 0xff01010101010101, 0x00ffffffffff0000, 0x00ffffffff00ff00, 0x00ffffffff000001,
- 0x00ffffffff010000, 0x00ffffff00ff0100, 0x00ffffff0000ff01, 0x00ffffff00000000,
- 0x00ffffff000001ff, 0x00ffffff00000101, 0x00ffffff0001ff00, 0x00ffffff000100ff,
- 0x00ffffff00010001, 0x00ffffff010000ff, 0x00ffffff01000100, 0x00ffffff0101ff00,
- 0x00ffffff01010001, 0x00ffff00ffffffff, 0x00ffff00ffffff00, 0x00ffff00ffff00ff,
- 0x00ffff00ffff0001, 0x00ffff00ffff0100, 0x00ffff00ff00ff01, 0x00ffff00ff000000,
- 0x00ffff00ff000001, 0x00ffff00ff0001ff, 0x00ffff00ff000101, 0x00ffff00ff01ff00,
- 0x00ffff00ff010001, 0x00ffff00ff010100, 0x00ffff0000ff0000, 0x00ffff0000ff01ff,
- 0x00ffff0000ff0101, 0x00ffff000000ff00, 0x00ffff00000000ff, 0x00ffff0000000000,
- 0x00ffff0000000001, 0x00ffff0000000100, 0x00ffff0000000101, 0x00ffff0000010000,
- 0x00ffff00000101ff, 0x00ffff0000010101, 0x00ffff0001ffff00, 0x00ffff0001ff00ff,
- 0x00ffff0001ff0001, 0x00ffff000100ffff, 0x00ffff000100ff01, 0x00ffff0001000000,
- 0x00ffff000101ffff, 0x00ffff000101ff00, 0x00ffff000101ff01, 0x00ffff01ffff0000,
- 0x00ffff01ff00ff00, 0x00ffff01ff0000ff, 0x00ffff01ff000001, 0x00ffff01ff010000,
- 0x00ffff0100ffff00, 0x00ffff010000ff01, 0x00ffff0100000000, 0x00ffff0100000101,
- 0x00ffff01000100ff, 0x00ffff0100010100, 0x00ffff0101ff0100, 0x00ffff01010000ff,
- 0x00ffff0101010000, 0x00ff00ffffffff00, 0x00ff00ffff000000, 0x00ff00ffff000100,
- 0x00ff00ffff010100, 0x00ff00ff00ff0000, 0x00ff00ff00ff01ff, 0x00ff00ff00ff0101,
- 0x00ff00ff0000ff00, 0x00ff00ff000000ff, 0x00ff00ff00000000, 0x00ff00ff00000001,
- 0x00ff00ff0001ff00, 0x00ff00ff0001ff01, 0x00ff00ff00010000, 0x00ff00ff000101ff,
- 0x00ff00ff00010101, 0x00ff00ff01ffff00, 0x00ff00ff01ff0001, 0x00ff00ff01ff0100,
- 0x00ff00ff0100ffff, 0x00ff00ff0100ff01, 0x00ff00ff01000000, 0x00ff00ff0101ffff,
- 0x00ff00ff0101ff00, 0x00ff00ff01010100, 0x00ff0000ffffff00, 0x00ff0000ffffff01,
- 0x00ff0000ffff0000, 0x00ff0000ffff0101, 0x00ff0000ff00ff00, 0x00ff0000ff0000ff,
- 0x00ff0000ff000000, 0x00ff0000ff000001, 0x00ff0000ff000100, 0x00ff0000ff01ffff,
- 0x00ff0000ff010000, 0x00ff0000ff010101, 0x00ff000000ffff00, 0x00ff000000ff00ff,
- 0x00ff000000ff0000, 0x00ff000000ff0001, 0x00ff000000ff0100, 0x00ff00000000ffff,
- 0x00ff00000000ff00, 0x00ff0000000000ff, 0x00ff000000000000, 0x00ff000000000001,
- 0x00ff0000000001ff, 0x00ff000000000100, 0x00ff00000001ff00, 0x00ff0000000100ff,
- 0x00ff000000010000, 0x00ff000000010001, 0x00ff000000010100, 0x00ff000001ffff01,
- 0x00ff000001ff00ff, 0x00ff000001ff0000, 0x00ff000001ff01ff, 0x00ff00000100ff00,
- 0x00ff0000010000ff, 0x00ff000001000000, 0x00ff000001000001, 0x00ff000001000100,
- 0x00ff000001000101, 0x00ff000001010000, 0x00ff0000010101ff, 0x00ff000001010101,
- 0x00ff0001ffffff00, 0x00ff0001ffff0000, 0x00ff0001ffff0100, 0x00ff0001ff0000ff,
- 0x00ff0001ff000000, 0x00ff0001ff0001ff, 0x00ff0001ff000101, 0x00ff0001ff01ff00,
- 0x00ff0001ff0100ff, 0x00ff0001ff010100, 0x00ff000100ffffff, 0x00ff000100ffff01,
- 0x00ff000100ff0000, 0x00ff000100ff01ff, 0x00ff00010000ffff, 0x00ff00010000ff00,
- 0x00ff00010000ff01, 0x00ff000100000000, 0x00ff000100000001, 0x00ff000100000100,
- 0x00ff00010001ff01, 0x00ff000100010000, 0x00ff0001000101ff, 0x00ff000101ffff00,
- 0x00ff000101ff0000, 0x00ff000101ff0101, 0x00ff0001010000ff, 0x00ff000101000000,
- 0x00ff00010101ff00, 0x00ff0001010100ff, 0x00ff000101010001, 0x00ff01ffffff0000,
- 0x00ff01ffff00ff00, 0x00ff01ffff000000, 0x00ff01ffff000101, 0x00ff01ffff010000,
- 0x00ff01ff00ffff01, 0x00ff01ff00ff0100, 0x00ff01ff0000ffff, 0x00ff01ff00000000,
- 0x00ff01ff000001ff, 0x00ff01ff0001ff00, 0x00ff01ff000100ff, 0x00ff01ff00010001,
- 0x00ff01ff00010100, 0x00ff01ff01ff0000, 0x00ff01ff0100ff00, 0x00ff01ff010000ff,
- 0x00ff01ff01000001, 0x00ff01ff01000100, 0x00ff01ff01010000, 0x00ff0100ffffff00,
- 0x00ff0100ffff0000, 0x00ff0100ffff0001, 0x00ff0100ffff0101, 0x00ff0100ff00ffff,
- 0x00ff0100ff0000ff, 0x00ff0100ff000000, 0x00ff0100ff0001ff, 0x00ff0100ff01ff00,
- 0x00ff0100ff0100ff, 0x00ff0100ff010001, 0x00ff010000ffffff, 0x00ff010000ff0000,
- 0x00ff010000ff0101, 0x00ff01000000ff00, 0x00ff01000000ff01, 0x00ff0100000000ff,
- 0x00ff010000000000, 0x00ff010000000001, 0x00ff010000000100, 0x00ff01000001ffff,
- 0x00ff01000001ff01, 0x00ff010000010000, 0x00ff010000010001, 0x00ff010000010101,
- 0x00ff010001ff0001, 0x00ff010001ff0100, 0x00ff01000100ff01, 0x00ff010001000000,
- 0x00ff010001000001, 0x00ff0100010001ff, 0x00ff01000101ff00, 0x00ff0100010100ff,
- 0x00ff010001010001, 0x00ff010001010100, 0x00ff0101ff000001, 0x00ff010100ff00ff,
- 0x00ff010100ff0001, 0x00ff010100ff0100, 0x00ff010100000000, 0x00ff0101000001ff,
- 0x00ff010100000101, 0x00ff0101000100ff, 0x00ff010100010100, 0x00ff0101010000ff,
- 0x00ff010101010000, 0x0000ffffffffff00, 0x0000ffffffff00ff, 0x0000ffffffff0000,
- 0x0000ffffffff0001, 0x0000ffffffff0100, 0x0000ffffff00ff01, 0x0000ffffff000000,
- 0x0000ffffff000101, 0x0000ffffff01ff00, 0x0000ffffff0100ff, 0x0000ffffff010100,
- 0x0000ffff00ffffff, 0x0000ffff00ff0000, 0x0000ffff00ff01ff, 0x0000ffff0000ff00,
- 0x0000ffff000000ff, 0x0000ffff00000000, 0x0000ffff00000001, 0x0000ffff00000100,
- 0x0000ffff00010000, 0x0000ffff000101ff, 0x0000ffff01ff0001, 0x0000ffff01ff0100,
- 0x0000ffff01000000, 0x0000ffff010001ff, 0x0000ffff0101ffff, 0x0000ffff0101ff00,
- 0x0000ffff01010001, 0x0000ffff01010100, 0x0000ff00ffff0000, 0x0000ff00ffff01ff,
- 0x0000ff00ffff0100, 0x0000ff00ffff0101, 0x0000ff00ff00ff00, 0x0000ff00ff0000ff,
- 0x0000ff00ff000000, 0x0000ff00ff000001, 0x0000ff00ff0001ff, 0x0000ff00ff000100,
- 0x0000ff00ff01ffff, 0x0000ff00ff010000, 0x0000ff00ff010001, 0x0000ff00ff0101ff,
- 0x0000ff00ff010101, 0x0000ff0000ffff00, 0x0000ff0000ff00ff, 0x0000ff0000ff0000,
- 0x0000ff0000ff0001, 0x0000ff0000ff0100, 0x0000ff000000ffff, 0x0000ff000000ff00,
- 0x0000ff000000ff01, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001,
- 0x0000ff00000001ff, 0x0000ff0000000100, 0x0000ff0000000101, 0x0000ff000001ff00,
- 0x0000ff00000100ff, 0x0000ff0000010000, 0x0000ff0000010001, 0x0000ff0000010100,
- 0x0000ff0001ffff01, 0x0000ff0001ff0000, 0x0000ff000100ff00, 0x0000ff00010000ff,
- 0x0000ff0001000000, 0x0000ff0001000001, 0x0000ff0001000100, 0x0000ff000101ffff,
- 0x0000ff0001010000, 0x0000ff0001010101, 0x0000ff01ffffff00, 0x0000ff01ffff0001,
- 0x0000ff01ff00ff01, 0x0000ff01ff000000, 0x0000ff01ff000101, 0x0000ff01ff01ff00,
- 0x0000ff01ff0100ff, 0x0000ff0100ffff01, 0x0000ff0100ff0000, 0x0000ff0100ff0101,
- 0x0000ff010000ff00, 0x0000ff01000000ff, 0x0000ff0100000000, 0x0000ff0100000001,
- 0x0000ff0100000100, 0x0000ff010001ff01, 0x0000ff0100010000, 0x0000ff0101ff0000,
- 0x0000ff010100ffff, 0x0000ff010100ff01, 0x0000ff0101000000, 0x0000ff0101000100,
- 0x0000ff0101000101, 0x0000ff01010100ff, 0x000000ffffff00ff, 0x000000ffffff0000,
- 0x000000ffff00ff00, 0x000000ffff0000ff, 0x000000ffff000000, 0x000000ffff000001,
- 0x000000ffff0001ff, 0x000000ffff000100, 0x000000ffff01ff00, 0x000000ffff010000,
- 0x000000ffff0101ff, 0x000000ffff010101, 0x000000ff00ffff00, 0x000000ff00ff00ff,
- 0x000000ff00ff0000, 0x000000ff00ff0001, 0x000000ff00ff0100, 0x000000ff00ff0101,
- 0x000000ff0000ffff, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000,
- 0x000000ff00000001, 0x000000ff000001ff, 0x000000ff00000100, 0x000000ff00000101,
- 0x000000ff0001ff00, 0x000000ff0001ff01, 0x000000ff000100ff, 0x000000ff00010000,
- 0x000000ff00010001, 0x000000ff00010100, 0x000000ff01ffffff, 0x000000ff01ff01ff,
- 0x000000ff01ff0101, 0x000000ff0100ff00, 0x000000ff010000ff, 0x000000ff01000000,
- 0x000000ff01000001, 0x000000ff01000100, 0x000000ff0101ff00, 0x000000ff010100ff,
- 0x000000ff01010000, 0x000000ff01010101, 0x00000000ffffff00, 0x00000000ffffff01,
- 0x00000000ffff00ff, 0x00000000ffff0000, 0x00000000ffff0001, 0x00000000ffff0100,
- 0x00000000ff00ffff, 0x00000000ff00ff00, 0x00000000ff00ff01, 0x00000000ff0000ff,
- 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff000101,
- 0x00000000ff01ff00, 0x00000000ff0100ff, 0x00000000ff010000, 0x00000000ff010001,
- 0x00000000ff010100, 0x0000000000ffffff, 0x0000000000ffff00, 0x0000000000ffff01,
- 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001, 0x0000000000ff01ff,
- 0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01,
- 0x00000000000000ff, 0x0000000000000000, 0x0000000000000001, 0x00000000000001ff,
- 0x0000000000000100, 0x0000000000000101, 0x000000000001ffff, 0x000000000001ff00,
- 0x00000000000100ff, 0x0000000000010000, 0x0000000000010001, 0x00000000000101ff,
- 0x0000000000010100, 0x0000000000010101, 0x0000000001ffff00, 0x0000000001ff00ff,
- 0x0000000001ff0000, 0x0000000001ff0100, 0x0000000001ff0101, 0x000000000100ffff,
- 0x000000000100ff00, 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001,
- 0x00000000010001ff, 0x0000000001000100, 0x000000000101ff00, 0x00000000010100ff,
- 0x0000000001010000, 0x0000000001010001, 0x0000000001010100, 0x00000001ffffffff,
- 0x00000001ffffff00, 0x00000001ffffff01, 0x00000001ffff00ff, 0x00000001ffff0001,
- 0x00000001ffff01ff, 0x00000001ffff0100, 0x00000001ff00ff00, 0x00000001ff0000ff,
- 0x00000001ff000000, 0x00000001ff0001ff, 0x00000001ff000100, 0x00000001ff01ffff,
- 0x00000001ff01ff00, 0x00000001ff01ff01, 0x00000001ff0100ff, 0x00000001ff010000,
- 0x00000001ff010001, 0x00000001ff0101ff, 0x00000001ff010100, 0x0000000100ffff00,
- 0x0000000100ff0000, 0x0000000100ff0001, 0x0000000100ff01ff, 0x0000000100ff0100,
- 0x0000000100ff0101, 0x000000010000ffff, 0x000000010000ff00, 0x000000010000ff01,
- 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001, 0x00000001000001ff,
- 0x0000000100000100, 0x0000000100000101, 0x000000010001ff00, 0x00000001000100ff,
- 0x0000000100010000, 0x0000000100010100, 0x0000000101ffff01, 0x0000000101ff0000,
- 0x0000000101ff0001, 0x0000000101ff01ff, 0x0000000101ff0100, 0x0000000101ff0101,
- 0x000000010100ff00, 0x0000000101000000, 0x0000000101000101, 0x000000010101ff01,
- 0x0000000101010000, 0x0000000101010001, 0x00000001010101ff, 0x0000000101010100,
- 0x000001ffffff00ff, 0x000001ffffff0000, 0x000001ffffff0001, 0x000001ffffff0100,
- 0x000001ffff00ffff, 0x000001ffff000000, 0x000001ffff0001ff, 0x000001ffff01ff00,
- 0x000001ffff010101, 0x000001ff00ff0000, 0x000001ff00ff01ff, 0x000001ff00ff0101,
- 0x000001ff0000ff00, 0x000001ff000000ff, 0x000001ff00000000, 0x000001ff00000001,
- 0x000001ff000001ff, 0x000001ff00000100, 0x000001ff0001ffff, 0x000001ff0001ff01,
- 0x000001ff000100ff, 0x000001ff00010000, 0x000001ff01ffff01, 0x000001ff01ff0100,
- 0x000001ff0100ffff, 0x000001ff0100ff01, 0x000001ff01000000, 0x000001ff010001ff,
- 0x000001ff0101ff00, 0x000001ff01010100, 0x00000100ffffff00, 0x00000100ffffff01,
- 0x00000100ffff0000, 0x00000100ffff0101, 0x00000100ff00ff00, 0x00000100ff0000ff,
- 0x00000100ff000000, 0x00000100ff000001, 0x00000100ff000100, 0x00000100ff010000,
- 0x0000010000ffff00, 0x0000010000ff00ff, 0x0000010000ff0000, 0x0000010000ff0001,
- 0x0000010000ff0100, 0x000001000000ffff, 0x000001000000ff00, 0x000001000000ff01,
- 0x00000100000000ff, 0x0000010000000000, 0x0000010000000001, 0x00000100000001ff,
- 0x0000010000000100, 0x0000010000000101, 0x000001000001ff00, 0x00000100000100ff,
- 0x0000010000010000, 0x0000010000010001, 0x0000010000010100, 0x0000010001ffff00,
- 0x0000010001ff0000, 0x0000010001ff0100, 0x000001000100ff00, 0x00000100010000ff,
- 0x0000010001000000, 0x0000010001000001, 0x00000100010001ff, 0x0000010001000100,
- 0x0000010001010000, 0x00000101ffff00ff, 0x00000101ffff01ff, 0x00000101ff000000,
- 0x00000101ff000101, 0x00000101ff01ffff, 0x00000101ff010000, 0x00000101ff010001,
- 0x00000101ff010100, 0x0000010100ff0000, 0x0000010100ff01ff, 0x0000010100ff0100,
- 0x000001010000ff00, 0x0000010100000000, 0x0000010100000001, 0x00000101000001ff,
- 0x0000010100000100, 0x000001010001ff01, 0x0000010100010000, 0x00000101000101ff,
- 0x0000010100010101, 0x0000010101ffff00, 0x0000010101ff0101, 0x000001010100ff01,
- 0x0000010101000000, 0x0000010101000001, 0x00000101010001ff, 0x0000010101000101,
- 0x000001010101ff00, 0x0001ffffffff0000, 0x0001ffffff0000ff, 0x0001ffffff000001,
- 0x0001ffffff000100, 0x0001ffffff010000, 0x0001ffff00ff00ff, 0x0001ffff0000ffff,
- 0x0001ffff00000000, 0x0001ffff00000001, 0x0001ffff000001ff, 0x0001ffff00000101,
- 0x0001ffff0001ff00, 0x0001ffff000100ff, 0x0001ffff00010001, 0x0001ffff00010100,
- 0x0001ffff01ffff00, 0x0001ffff01000001, 0x0001ffff01010000, 0x0001ff00ffffff00,
- 0x0001ff00ffff00ff, 0x0001ff00ffff0001, 0x0001ff00ffff0100, 0x0001ff00ff00ff01,
- 0x0001ff00ff000000, 0x0001ff00ff01ff00, 0x0001ff00ff01ff01, 0x0001ff00ff010001,
- 0x0001ff00ff010100, 0x0001ff0000ff0000, 0x0001ff0000ff0100, 0x0001ff000000ff00,
- 0x0001ff0000000000, 0x0001ff0000000001, 0x0001ff0000000100, 0x0001ff0000010000,
- 0x0001ff0000010001, 0x0001ff0000010101, 0x0001ff0001ff00ff, 0x0001ff0001ff0101,
- 0x0001ff000100ff01, 0x0001ff0001000000, 0x0001ff000101ff00, 0x0001ff0001010001,
- 0x0001ff0001010100, 0x0001ff01ff00ff00, 0x0001ff01ff000001, 0x0001ff01ff000100,
- 0x0001ff0100ffffff, 0x0001ff0100ffff00, 0x0001ff0100ff0001, 0x0001ff0100000000,
- 0x0001ff0100000001, 0x0001ff01000001ff, 0x0001ff010001ffff, 0x0001ff0101ff0000,
- 0x0001ff010100ff00, 0x0001ff0101000001, 0x0001ff0101010000, 0x000100ffff00ff00,
- 0x000100ffff00ff01, 0x000100ffff000000, 0x000100ffff000001, 0x000100ffff000101,
- 0x000100ffff01ff00, 0x000100ffff010001, 0x000100ffff010100, 0x000100ff00ffffff,
- 0x000100ff00ffff01, 0x000100ff00ff0000, 0x000100ff00ff01ff, 0x000100ff00ff0101,
- 0x000100ff0000ff00, 0x000100ff000000ff, 0x000100ff00000000, 0x000100ff00000001,
- 0x000100ff00000100, 0x000100ff00000101, 0x000100ff0001ffff, 0x000100ff0001ff01,
- 0x000100ff00010000, 0x000100ff01ff00ff, 0x000100ff01ff0000, 0x000100ff01ff0100,
- 0x000100ff0100ffff, 0x000100ff0100ff01, 0x000100ff010000ff, 0x000100ff01000000,
- 0x000100ff01000001, 0x000100ff010001ff, 0x000100ff01000101, 0x000100ff0101ff00,
- 0x000100ff010100ff, 0x000100ff01010100, 0x00010000ffff0000, 0x00010000ffff01ff,
- 0x00010000ffff0101, 0x00010000ff00ff00, 0x00010000ff000000, 0x00010000ff000001,
- 0x00010000ff000100, 0x0001000000ff00ff, 0x0001000000ff0000, 0x0001000000ff0001,
- 0x0001000000ff0100, 0x000100000000ffff, 0x000100000000ff00, 0x00010000000000ff,
- 0x0001000000000000, 0x0001000000000001, 0x0001000000000100, 0x000100000001ff00,
- 0x00010000000100ff, 0x0001000000010000, 0x0001000000010001, 0x0001000000010100,
- 0x0001000001ff0001, 0x0001000001ff0100, 0x0001000001ff0101, 0x000100000100ff00,
- 0x0001000001000000, 0x0001000001000001, 0x0001000001000100, 0x0001000001000101,
- 0x000100000101ff01, 0x0001000001010000, 0x0001000001010001, 0x00010000010101ff,
- 0x00010001ffffff01, 0x00010001ffff0100, 0x00010001ff000000, 0x00010001ff01ffff,
- 0x00010001ff010001, 0x00010001ff0101ff, 0x00010001ff010100, 0x0001000100ffffff,
- 0x0001000100ff0000, 0x0001000100ff01ff, 0x0001000100ff0101, 0x000100010000ff00,
- 0x00010001000000ff, 0x0001000100000000, 0x0001000100000001, 0x00010001000001ff,
- 0x0001000100000101, 0x000100010001ffff, 0x0001000100010000, 0x00010001000101ff,
- 0x0001000101ffffff, 0x0001000101ffff01, 0x0001000101ff0000, 0x0001000101ff0101,
- 0x00010001010000ff, 0x0001000101000001, 0x00010001010001ff, 0x0001000101000100,
- 0x000100010101ffff, 0x00010001010100ff, 0x0001000101010001, 0x0001000101010101,
- 0x000101ffff000001, 0x000101ffff000100, 0x000101ffff010000, 0x000101ff00ffff00,
- 0x000101ff0000ff01, 0x000101ff00000000, 0x000101ff00000101, 0x000101ff0001ff00,
- 0x000101ff00010100, 0x000101ff01ff0000, 0x000101ff0100ff00, 0x000101ff010001ff,
- 0x000101ff01010001, 0x00010100ffffff00, 0x00010100ffff00ff, 0x00010100ff00ffff,
- 0x00010100ff000000, 0x00010100ff01ff00, 0x00010100ff0100ff, 0x00010100ff010001,
- 0x00010100ff010100, 0x0001010000ffffff, 0x0001010000ffff00, 0x0001010000ff0000,
- 0x0001010000ff0001, 0x0001010000ff01ff, 0x000101000000ff00, 0x00010100000000ff,
- 0x0001010000000000, 0x0001010000000001, 0x0001010000000100, 0x000101000001ffff,
- 0x0001010000010000, 0x0001010000010101, 0x0001010001ffff01, 0x0001010001ff00ff,
- 0x0001010001ff0101, 0x0001010001000000, 0x000101000101ff00, 0x00010100010100ff,
- 0x0001010001010000, 0x0001010001010100, 0x00010101ff00ff00, 0x00010101ff000001,
- 0x00010101ff0001ff, 0x0001010100ffff00, 0x0001010100ff00ff, 0x0001010100ff0100,
- 0x000101010000ffff, 0x0001010100000000, 0x00010101000001ff, 0x0001010100000101,
- 0x00010101000100ff, 0x0001010100010000, 0x0001010100010100, 0x0001010101ff0001,
- 0x00010101010000ff, 0x00010101010001ff, 0x0001010101000101, 0x0001010101010001,
- 0x01ffffffffffffff, 0x01ffffffffffff01, 0x01ffffffffff01ff, 0x01ffffffffff0101,
- 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff, 0x01ffffffff010101,
- 0x01ffffff00ff0000, 0x01ffffff0000ffff, 0x01ffffff0000ff00, 0x01ffffff000000ff,
- 0x01ffffff00000001, 0x01ffffff00000100, 0x01ffffff00010000, 0x01ffffff01ffffff,
- 0x01ffffff01ffff01, 0x01ffffff01ff01ff, 0x01ffffff01ff0101, 0x01ffffff01000000,
- 0x01ffffff0101ffff, 0x01ffffff0101ff01, 0x01ffffff010101ff, 0x01ffffff01010101,
- 0x01ffff00ffff0000, 0x01ffff00ff00ff00, 0x01ffff00ff0000ff, 0x01ffff00ff000001,
- 0x01ffff00ff000100, 0x01ffff00ff010000, 0x01ffff0000ffff00, 0x01ffff0000ff00ff,
- 0x01ffff0000ff0100, 0x01ffff000000ffff, 0x01ffff000000ff01, 0x01ffff0000000000,
- 0x01ffff0000000001, 0x01ffff00000001ff, 0x01ffff0000000100, 0x01ffff00000100ff,
- 0x01ffff0000010001, 0x01ffff0000010100, 0x01ffff0001ff0000, 0x01ffff0001ff0100,
- 0x01ffff00010000ff, 0x01ffff0001000001, 0x01ffff0001000100, 0x01ffff0001010000,
- 0x01ffff01ffffffff, 0x01ffff01ffffff01, 0x01ffff01ffff01ff, 0x01ffff01ffff0101,
- 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff01ff01, 0x01ffff01ff0101ff,
- 0x01ffff01ff010101, 0x01ffff010000ff00, 0x01ffff01000000ff, 0x01ffff0100000100,
- 0x01ffff0100010000, 0x01ffff0101ffffff, 0x01ffff0101ffff01, 0x01ffff0101ff01ff,
- 0x01ffff0101ff0101, 0x01ffff0101000000, 0x01ffff010101ffff, 0x01ffff010101ff01,
- 0x01ffff01010101ff, 0x01ffff0101010101, 0x01ff00ffff0000ff, 0x01ff00ffff000100,
- 0x01ff00ff00ffff00, 0x01ff00ff00ff00ff, 0x01ff00ff0000ff00, 0x01ff00ff00000000,
- 0x01ff00ff00000101, 0x01ff00ff0001ff00, 0x01ff00ff000100ff, 0x01ff00ff00010100,
- 0x01ff00ff010000ff, 0x01ff00ff01000100, 0x01ff0000ffffff00, 0x01ff0000ffff0100,
- 0x01ff0000ff00ff01, 0x01ff0000ff000000, 0x01ff0000ff000101, 0x01ff0000ff010001,
- 0x01ff0000ff010100, 0x01ff000000ffffff, 0x01ff000000ffff00, 0x01ff000000ff0000,
- 0x01ff000000ff01ff, 0x01ff00000000ff00, 0x01ff0000000000ff, 0x01ff000000000000,
- 0x01ff000000000001, 0x01ff000000000100, 0x01ff000000000101, 0x01ff000000010000,
- 0x01ff000000010001, 0x01ff0000000101ff, 0x01ff000000010101, 0x01ff000001ffff00,
- 0x01ff000001ff00ff, 0x01ff000001ff0001, 0x01ff000001ff0100, 0x01ff00000100ffff,
- 0x01ff00000100ff01, 0x01ff000001000000, 0x01ff0000010001ff, 0x01ff000001010001,
- 0x01ff0001ff00ff00, 0x01ff0001ff000001, 0x01ff0001ff000100, 0x01ff0001ff010000,
- 0x01ff000100ffff00, 0x01ff000100ff00ff, 0x01ff000100ff0100, 0x01ff000100ff0101,
- 0x01ff00010000ffff, 0x01ff000100000000, 0x01ff000100000100, 0x01ff000100000101,
- 0x01ff00010001ff00, 0x01ff000100010001, 0x01ff000100010101, 0x01ff000101ff0000,
- 0x01ff00010100ff00, 0x01ff000101000101, 0x01ff0001010100ff, 0x01ff01ffffffffff,
- 0x01ff01ffffffff01, 0x01ff01ffffff01ff, 0x01ff01ffffff0101, 0x01ff01ffff000000,
- 0x01ff01ffff01ffff, 0x01ff01ffff01ff01, 0x01ff01ffff0101ff, 0x01ff01ffff010101,
- 0x01ff01ff00ffff00, 0x01ff01ff00ff0000, 0x01ff01ff0000ff00, 0x01ff01ff000000ff,
- 0x01ff01ff00000100, 0x01ff01ff00010000, 0x01ff01ff00010100, 0x01ff01ff01ffffff,
- 0x01ff01ff01ffff01, 0x01ff01ff01ff01ff, 0x01ff01ff01ff0101, 0x01ff01ff01000000,
- 0x01ff01ff0101ffff, 0x01ff01ff0101ff01, 0x01ff01ff010101ff, 0x01ff01ff01010101,
- 0x01ff0100ffff0000, 0x01ff0100ffff0001, 0x01ff0100ff00ff00, 0x01ff0100ff0000ff,
- 0x01ff0100ff000001, 0x01ff0100ff010000, 0x01ff010000ffff00, 0x01ff010000ff00ff,
- 0x01ff010000ff0001, 0x01ff010000ff0100, 0x01ff01000000ffff, 0x01ff01000000ff01,
- 0x01ff010000000000, 0x01ff010000000101, 0x01ff01000001ff00, 0x01ff0100000100ff,
- 0x01ff010001ff0000, 0x01ff010001000001, 0x01ff010001000100, 0x01ff010001010000,
- 0x01ff0101ffffffff, 0x01ff0101ffffff01, 0x01ff0101ffff01ff, 0x01ff0101ffff0101,
- 0x01ff0101ff000000, 0x01ff0101ff01ffff, 0x01ff0101ff01ff01, 0x01ff0101ff0101ff,
- 0x01ff0101ff010101, 0x01ff010100ff0000, 0x01ff01010000ff00, 0x01ff0101000000ff,
- 0x01ff010100000001, 0x01ff010101ffffff, 0x01ff010101ffff01, 0x01ff010101ff01ff,
- 0x01ff010101ff0101, 0x01ff010101000000, 0x01ff01010101ffff, 0x01ff01010101ff01,
- 0x01ff0101010101ff, 0x01ff010101010101, 0x0100ffffffff0000, 0x0100ffffff00ff00,
- 0x0100ffffff000001, 0x0100ffffff0001ff, 0x0100ffffff000100, 0x0100ffffff010000,
- 0x0100ffff00ffff00, 0x0100ffff00ff0001, 0x0100ffff00ff0100, 0x0100ffff00000000,
- 0x0100ffff000001ff, 0x0100ffff00000101, 0x0100ffff00010100, 0x0100ffff00010101,
- 0x0100ffff01ff0000, 0x0100ffff0100ff00, 0x0100ffff010000ff, 0x0100ffff01000001,
- 0x0100ffff01000100, 0x0100ffff01010000, 0x0100ff00ffffff00, 0x0100ff00ffff00ff,
- 0x0100ff00ffff0001, 0x0100ff00ffff0100, 0x0100ff00ff00ffff, 0x0100ff00ff000000,
- 0x0100ff00ff0001ff, 0x0100ff00ff000101, 0x0100ff00ff01ff00, 0x0100ff00ff0100ff,
- 0x0100ff00ff010001, 0x0100ff00ff010100, 0x0100ff0000ffffff, 0x0100ff0000ff0000,
- 0x0100ff000000ffff, 0x0100ff000000ff00, 0x0100ff00000000ff, 0x0100ff0000000000,
- 0x0100ff0000000001, 0x0100ff0000000100, 0x0100ff000001ff01, 0x0100ff0000010000,
- 0x0100ff0001ff00ff, 0x0100ff0001ff0001, 0x0100ff000100ff01, 0x0100ff0001000000,
- 0x0100ff00010001ff, 0x0100ff000101ff00, 0x0100ff00010100ff, 0x0100ff0001010001,
- 0x0100ff0001010100, 0x0100ff01ffff0000, 0x0100ff01ff00ff00, 0x0100ff01ff0000ff,
- 0x0100ff01ff000100, 0x0100ff01ff010000, 0x0100ff0100ff00ff, 0x0100ff0100ff0001,
- 0x0100ff0100ff0100, 0x0100ff010000ffff, 0x0100ff010000ff01, 0x0100ff0100000000,
- 0x0100ff01000001ff, 0x0100ff0100010001, 0x0100ff0100010100, 0x0100ff0101ff0000,
- 0x0100ff01010000ff, 0x0100ff0101000001, 0x0100ff0101010100, 0x010000ffffffff00,
- 0x010000ffffff00ff, 0x010000ffffff0001, 0x010000ffff00ffff, 0x010000ffff000000,
- 0x010000ffff0001ff, 0x010000ffff010001, 0x010000ff00ffffff, 0x010000ff00ff0101,
- 0x010000ff0000ff00, 0x010000ff000000ff, 0x010000ff00000000, 0x010000ff00000001,
- 0x010000ff000001ff, 0x010000ff00000100, 0x010000ff0001ffff, 0x010000ff0001ff00,
- 0x010000ff0001ff01, 0x010000ff00010000, 0x010000ff01ff00ff, 0x010000ff01ff0001,
- 0x010000ff0100ff01, 0x010000ff010000ff, 0x010000ff01000000, 0x010000ff010001ff,
- 0x010000ff0101ff00, 0x010000ff01010100, 0x01000000ffffffff, 0x01000000ffff0000,
- 0x01000000ffff01ff, 0x01000000ffff0101, 0x01000000ff00ffff, 0x01000000ff00ff00,
- 0x01000000ff0000ff, 0x01000000ff000000, 0x01000000ff000001, 0x01000000ff000100,
- 0x01000000ff01ff00, 0x01000000ff010000, 0x01000000ff010100, 0x01000000ff010101,
- 0x0100000000ffff00, 0x0100000000ff00ff, 0x0100000000ff0000, 0x0100000000ff0001,
- 0x0100000000ff0100, 0x010000000000ffff, 0x010000000000ff00, 0x010000000000ff01,
- 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001, 0x01000000000001ff,
- 0x0100000000000100, 0x0100000000000101, 0x010000000001ff00, 0x01000000000100ff,
- 0x0100000000010000, 0x0100000000010001, 0x0100000000010100, 0x0100000001ffff00,
- 0x0100000001ff0000, 0x0100000001ff01ff, 0x010000000100ff00, 0x010000000100ff01,
- 0x01000000010000ff, 0x0100000001000000, 0x0100000001000001, 0x0100000001000100,
- 0x0100000001000101, 0x010000000101ffff, 0x010000000101ff01, 0x0100000001010000,
- 0x01000000010101ff, 0x0100000001010101, 0x01000001ffffff00, 0x01000001ffff00ff,
- 0x01000001ff00ffff, 0x01000001ff000000, 0x01000001ff000100, 0x01000001ff01ffff,
- 0x01000001ff010001, 0x01000001ff010100, 0x0100000100ff0000, 0x0100000100ff01ff,
- 0x0100000100ff0100, 0x010000010000ff00, 0x010000010000ff01, 0x0100000100000000,
- 0x0100000100000001, 0x0100000100000100, 0x0100000100010000, 0x01000001000101ff,
- 0x0100000101ffff01, 0x0100000101ff00ff, 0x0100000101ff0100, 0x0100000101ff0101,
- 0x010000010100ff01, 0x01000001010000ff, 0x0100000101000000, 0x01000001010100ff,
- 0x0100000101010001, 0x0100000101010100, 0x010001ffffff0000, 0x010001ffff000001,
- 0x010001ffff000100, 0x010001ffff010000, 0x010001ff00ffff00, 0x010001ff00ff0001,
- 0x010001ff0000ffff, 0x010001ff0000ff01, 0x010001ff00000000, 0x010001ff00000001,
- 0x010001ff00000101, 0x010001ff000100ff, 0x010001ff00010000, 0x010001ff01ff0000,
- 0x010001ff0100ff00, 0x010001ff01000001, 0x010001ff01000100, 0x010001ff01010000,
- 0x01000100ffff00ff, 0x01000100ffff0001, 0x01000100ffff0100, 0x01000100ff00ffff,
- 0x01000100ff00ff01, 0x01000100ff000000, 0x01000100ff0001ff, 0x01000100ff000101,
- 0x01000100ff01ffff, 0x01000100ff01ff00, 0x01000100ff0100ff, 0x01000100ff010001,
- 0x0100010000ffffff, 0x0100010000ffff01, 0x0100010000ff0000, 0x0100010000ff01ff,
- 0x0100010000ff0101, 0x010001000000ff00, 0x01000100000000ff, 0x0100010000000000,
- 0x0100010000000001, 0x0100010000000100, 0x010001000001ff01, 0x0100010000010000,
- 0x0100010000010001, 0x0100010000010101, 0x0100010001ffff00, 0x0100010001ff00ff,
- 0x010001000100ffff, 0x010001000100ff01, 0x0100010001000000, 0x0100010001000101,
- 0x010001000101ff00, 0x0100010001010001, 0x01000101ffff0000, 0x01000101ff000000,
- 0x01000101ff010000, 0x0100010100ff00ff, 0x0100010100ff0001, 0x0100010100ff0100,
- 0x010001010000ffff, 0x0100010100000000, 0x01000101000001ff, 0x010001010001ff00,
- 0x0100010101ff0000, 0x010001010100ff00, 0x01000101010000ff, 0x0100010101000000,
- 0x0100010101000001, 0x0101ffffffffffff, 0x0101ffffffffff01, 0x0101ffffffff01ff,
- 0x0101ffffffff0101, 0x0101ffffff000000, 0x0101ffffff01ffff, 0x0101ffffff01ff01,
- 0x0101ffffff0101ff, 0x0101ffffff010101, 0x0101ffff00ff0000, 0x0101ffff0000ff00,
- 0x0101ffff000000ff, 0x0101ffff00000001, 0x0101ffff00000100, 0x0101ffff01ffffff,
- 0x0101ffff01ffff01, 0x0101ffff01ff01ff, 0x0101ffff01ff0101, 0x0101ffff01000000,
- 0x0101ffff0101ffff, 0x0101ffff0101ff01, 0x0101ffff010101ff, 0x0101ffff01010101,
- 0x0101ff00ffff0000, 0x0101ff00ffff0100, 0x0101ff00ff00ff00, 0x0101ff00ff0000ff,
- 0x0101ff00ff000001, 0x0101ff00ff000100, 0x0101ff00ff000101, 0x0101ff0000ff0001,
- 0x0101ff0000ff0100, 0x0101ff000000ff00, 0x0101ff0000000000, 0x0101ff00000001ff,
- 0x0101ff0000000101, 0x0101ff000001ff00, 0x0101ff00000100ff, 0x0101ff0001ff0000,
- 0x0101ff000100ffff, 0x0101ff000100ff01, 0x0101ff0001000001, 0x0101ff0001000100,
- 0x0101ff01ffffff01, 0x0101ff01ffff01ff, 0x0101ff01ffff0101, 0x0101ff01ff00ffff,
- 0x0101ff01ff000100, 0x0101ff01ff01ff01, 0x0101ff01ff0101ff, 0x0101ff01ff010101,
- 0x0101ff0100ff0000, 0x0101ff010000ff00, 0x0101ff0100000001, 0x0101ff0100000100,
- 0x0101ff0100010000, 0x0101ff0101ffffff, 0x0101ff0101ffff01, 0x0101ff0101ff01ff,
- 0x0101ff0101ff0101, 0x0101ff0101000000, 0x0101ff010101ffff, 0x0101ff010101ff01,
- 0x0101ff01010101ff, 0x0101ff0101010101, 0x010100ffff000100, 0x010100ffff010000,
- 0x010100ff00ffff00, 0x010100ff00ff00ff, 0x010100ff0000ffff, 0x010100ff000000ff,
- 0x010100ff00000000, 0x010100ff000001ff, 0x010100ff00000101, 0x010100ff0001ff00,
- 0x010100ff00010000, 0x010100ff00010001, 0x010100ff000101ff, 0x010100ff00010100,
- 0x010100ff01ff0000, 0x01010000ffff0001, 0x01010000ffff0100, 0x01010000ff00ffff,
- 0x01010000ff00ff01, 0x01010000ff000000, 0x01010000ff0001ff, 0x01010000ff010001,
- 0x01010000ff010100, 0x0101000000ffff01, 0x0101000000ff0000, 0x010100000000ff00,
- 0x01010000000000ff, 0x0101000000000000, 0x0101000000000001, 0x0101000000000100,
- 0x0101000000010000, 0x0101000000010101, 0x0101000001ffff00, 0x0101000001ff00ff,
- 0x0101000001ff0000, 0x0101000001ff0001, 0x0101000001ff0100, 0x010100000100ff01,
- 0x0101000001000000, 0x01010000010001ff, 0x01010001ffff0000, 0x01010001ff00ff00,
- 0x01010001ff000001, 0x01010001ff000101, 0x01010001ff01ff00, 0x01010001ff010000,
- 0x0101000100ff00ff, 0x0101000100ff0001, 0x0101000100ff0101, 0x010100010000ff01,
- 0x0101000100000000, 0x0101000100000001, 0x01010001000001ff, 0x010100010001ffff,
- 0x010100010001ff01, 0x0101000101ff0001, 0x010100010100ffff, 0x0101000101000000,
- 0x0101000101000001, 0x0101000101000100, 0x010100010101ff00, 0x01010001010100ff,
- 0x0101000101010001, 0x010101ffffffffff, 0x010101ffffffff01, 0x010101ffffff01ff,
- 0x010101ffffff0101, 0x010101ffff01ffff, 0x010101ffff01ff01, 0x010101ffff0101ff,
- 0x010101ffff010101, 0x010101ff0000ff00, 0x010101ff000000ff, 0x010101ff00000001,
- 0x010101ff00000100, 0x010101ff01ffffff, 0x010101ff01ffff01, 0x010101ff01ff01ff,
- 0x010101ff01ff0101, 0x010101ff01000000, 0x010101ff0101ffff, 0x010101ff0101ff01,
- 0x010101ff010101ff, 0x010101ff01010101, 0x01010100ffff0000, 0x01010100ff0000ff,
- 0x01010100ff000100, 0x01010100ff01ff00, 0x01010100ff010000, 0x0101010000ffff00,
- 0x010101000000ffff, 0x0101010000000000, 0x0101010000000101, 0x010101000001ff00,
- 0x0101010000010001, 0x0101010000010100, 0x010101000100ffff, 0x0101010001000001,
- 0x01010101ffffffff, 0x01010101ffffff01, 0x01010101ffff01ff, 0x01010101ffff0101,
- 0x01010101ff01ffff, 0x01010101ff01ff01, 0x01010101ff0101ff, 0x01010101ff010101,
- 0x010101010000ff00, 0x01010101000000ff, 0x0101010100000001, 0x0101010101ffffff,
- 0x0101010101ffff01, 0x0101010101ff01ff, 0x0101010101ff0101, 0x0101010101000000,
- 0x010101010101ffff, 0x010101010101ff01, 0x01010101010101ff, 0x0101010101010101,
-GGML_TABLE_END()
-#else
-GGML_TABLE_BEGIN(uint32_t, iq1s_grid_gpu, NGRID_IQ1S)
- 0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000,
- 0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101,
- 0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02010101, 0x02020000, 0x02020002, 0x02020200,
- 0x02020202, 0x00000110, 0x00000111, 0x00010011, 0x00010110, 0x00010112, 0x00010211, 0x00010212,
- 0x00020111, 0x01000011, 0x01000112, 0x01000211, 0x01010012, 0x01010111, 0x01010212, 0x01020011,
- 0x01020110, 0x01020112, 0x01020210, 0x02000111, 0x02010011, 0x02010110, 0x02010112, 0x02020111,
- 0x00000020, 0x00000022, 0x00000220, 0x00000222, 0x00010121, 0x00020020, 0x00020022, 0x00020220,
- 0x00020222, 0x01000121, 0x01010021, 0x01010221, 0x01020120, 0x01020221, 0x02000020, 0x02000022,
- 0x02000220, 0x02000222, 0x02010021, 0x02010121, 0x02010221, 0x02020020, 0x02020022, 0x02020220,
- 0x02020222, 0x00011001, 0x00011100, 0x00011102, 0x00021101, 0x01001001, 0x01001201, 0x01011101,
- 0x01011202, 0x01021100, 0x01021101, 0x02011001, 0x02011201, 0x02021101, 0x00001011, 0x00001110,
- 0x00001111, 0x00001112, 0x00011111, 0x00011210, 0x00011212, 0x00021211, 0x01001010, 0x01001111,
- 0x01001212, 0x01011010, 0x01011011, 0x01011110, 0x01011111, 0x01011112, 0x01011211, 0x01021010,
- 0x01021012, 0x01021111, 0x01021210, 0x01021212, 0x02001011, 0x02011011, 0x02011111, 0x02011210,
- 0x02011212, 0x02021011, 0x02021110, 0x02021111, 0x02021112, 0x02021211, 0x00011120, 0x00011221,
- 0x01001021, 0x01001120, 0x01011020, 0x01011022, 0x01011121, 0x01011220, 0x01021020, 0x01021021,
- 0x01021122, 0x01021221, 0x02001121, 0x02011021, 0x02011120, 0x02011221, 0x00002000, 0x00002002,
- 0x00002200, 0x00002202, 0x00012101, 0x00022000, 0x00022002, 0x00022200, 0x00022202, 0x01002101,
- 0x01012001, 0x01012102, 0x01022101, 0x02002000, 0x02002002, 0x02002200, 0x02002202, 0x02012101,
- 0x02022000, 0x02022002, 0x02022200, 0x02022202, 0x00002111, 0x00012011, 0x00012110, 0x00012211,
- 0x00022110, 0x00022111, 0x01002011, 0x01012010, 0x01012011, 0x01012111, 0x01022011, 0x01022110,
- 0x01022211, 0x02012011, 0x02012110, 0x02012112, 0x02012211, 0x02022111, 0x00002020, 0x00002022,
- 0x00002220, 0x00002222, 0x00012121, 0x00022020, 0x00022022, 0x00022220, 0x00022222, 0x01002121,
- 0x01012021, 0x01012221, 0x01022021, 0x01022121, 0x02002020, 0x02002022, 0x02002121, 0x02002220,
- 0x02002222, 0x02012121, 0x02022020, 0x02022022, 0x02022220, 0x02022222, 0x00110000, 0x00110001,
- 0x00110100, 0x00110201, 0x00120100, 0x00120101, 0x01100001, 0x01100100, 0x01110000, 0x01110101,
- 0x01110200, 0x01120001, 0x01120100, 0x01120101, 0x01120201, 0x02110001, 0x02110100, 0x02110102,
- 0x02120001, 0x02120101, 0x00100011, 0x00100110, 0x00100112, 0x00100211, 0x00110010, 0x00110012,
- 0x00110111, 0x00110210, 0x00120011, 0x00120110, 0x00120211, 0x01100111, 0x01100212, 0x01110010,
- 0x01110011, 0x01110012, 0x01110110, 0x01110111, 0x01110112, 0x01110211, 0x01120010, 0x01120111,
- 0x02100110, 0x02110012, 0x02110111, 0x02120011, 0x02120110, 0x00110021, 0x00110120, 0x00110122,
- 0x00120121, 0x01100020, 0x01100122, 0x01100221, 0x01110022, 0x01110121, 0x01110220, 0x01110222,
- 0x01120120, 0x01120122, 0x02100121, 0x02110021, 0x02110120, 0x02110122, 0x02120121, 0x00101001,
- 0x00101102, 0x00101201, 0x00111100, 0x00111101, 0x00111200, 0x00111201, 0x00121001, 0x00121102,
- 0x01101001, 0x01101101, 0x01101102, 0x01101200, 0x01101202, 0x01111001, 0x01111100, 0x01111101,
- 0x01111102, 0x01111201, 0x01121002, 0x01121101, 0x01121200, 0x02101100, 0x02101201, 0x02111000,
- 0x02111100, 0x02111101, 0x02111200, 0x02111201, 0x02111202, 0x02121001, 0x02121100, 0x02121101,
- 0x02121201, 0x00101012, 0x00101111, 0x00101212, 0x00111011, 0x00111110, 0x00111111, 0x00111112,
- 0x00111211, 0x00121010, 0x00121012, 0x00121111, 0x00121210, 0x00121212, 0x01101011, 0x01101110,
- 0x01101111, 0x01101112, 0x01111011, 0x01111012, 0x01111110, 0x01111111, 0x01111112, 0x01111211,
- 0x01111212, 0x01121011, 0x01121110, 0x01121111, 0x01121112, 0x01121211, 0x02101010, 0x02101012,
- 0x02101110, 0x02101111, 0x02101210, 0x02101212, 0x02111010, 0x02111011, 0x02111110, 0x02111111,
- 0x02111112, 0x02111211, 0x02111212, 0x02121010, 0x02121012, 0x02121111, 0x00101021, 0x00101120,
- 0x00101121, 0x00101122, 0x00111121, 0x00111122, 0x00111220, 0x00111222, 0x00121021, 0x00121122,
- 0x01101020, 0x01101022, 0x01101120, 0x01101121, 0x01101220, 0x01101222, 0x01111021, 0x01111121,
- 0x01111122, 0x01111220, 0x01111221, 0x01121021, 0x01121120, 0x01121121, 0x01121220, 0x01121221,
- 0x01121222, 0x02101122, 0x02101222, 0x02111022, 0x02111121, 0x02121120, 0x02121221, 0x00112001,
- 0x00112102, 0x00122101, 0x01102001, 0x01102100, 0x01102102, 0x01102201, 0x01112000, 0x01112101,
- 0x01112200, 0x01112202, 0x01122000, 0x01122001, 0x01122100, 0x01122102, 0x01122201, 0x02102101,
- 0x02112001, 0x02112100, 0x02122101, 0x00112010, 0x00112012, 0x00112111, 0x00112212, 0x00122011,
- 0x00122111, 0x01102012, 0x01102110, 0x01102111, 0x01102210, 0x01112011, 0x01112110, 0x01112111,
- 0x01112112, 0x01112211, 0x01112212, 0x01122010, 0x01122111, 0x01122212, 0x02102211, 0x02112011,
- 0x02112012, 0x02112111, 0x02112210, 0x02122011, 0x02122112, 0x02122211, 0x00102221, 0x00112122,
- 0x00122120, 0x00122122, 0x01102120, 0x01102122, 0x01102221, 0x01112020, 0x01112022, 0x01112121,
- 0x01112220, 0x01122021, 0x01122122, 0x01122221, 0x02102121, 0x02112021, 0x02112122, 0x02112222,
- 0x00200000, 0x00200002, 0x00200200, 0x00200202, 0x00210101, 0x00220000, 0x00220002, 0x00220101,
- 0x00220200, 0x00220202, 0x01200101, 0x01210001, 0x01210201, 0x01220001, 0x01220101, 0x02200000,
- 0x02200002, 0x02200200, 0x02200202, 0x02210101, 0x02220000, 0x02220002, 0x02220101, 0x02220200,
- 0x02220202, 0x00200111, 0x00210011, 0x00210110, 0x00210211, 0x00220111, 0x01200012, 0x01200110,
- 0x01200211, 0x01210111, 0x01210210, 0x01210212, 0x01220011, 0x01220110, 0x01220111, 0x01220112,
- 0x02200111, 0x02210010, 0x02210112, 0x02210211, 0x02220111, 0x00200021, 0x00200220, 0x00200222,
- 0x00210021, 0x00210121, 0x00220020, 0x00220022, 0x00220220, 0x00220222, 0x01200121, 0x01210021,
- 0x01210122, 0x01210221, 0x01220121, 0x02200021, 0x02200220, 0x02200222, 0x02210021, 0x02210121,
- 0x02220020, 0x02220022, 0x02220220, 0x02220222, 0x00201101, 0x00211100, 0x00211102, 0x00211201,
- 0x00221101, 0x01201100, 0x01201101, 0x01201102, 0x01201201, 0x01211002, 0x01211101, 0x01211200,
- 0x01211202, 0x01221102, 0x02201101, 0x02211001, 0x02211100, 0x02211201, 0x02221001, 0x02221101,
- 0x00201211, 0x00211111, 0x00221011, 0x00221211, 0x01201010, 0x01201111, 0x01201210, 0x01211011,
- 0x01211110, 0x01211111, 0x01211211, 0x01221012, 0x01221111, 0x01221210, 0x02201211, 0x02211010,
- 0x02211110, 0x02211111, 0x02211210, 0x02211212, 0x02221011, 0x02221110, 0x02221112, 0x02221211,
- 0x00201121, 0x00211020, 0x00211022, 0x00211221, 0x00221121, 0x01201021, 0x01201221, 0x01211121,
- 0x01221020, 0x01221021, 0x01221221, 0x02201120, 0x02201122, 0x02211020, 0x02211222, 0x00202000,
- 0x00202002, 0x00202200, 0x00202202, 0x00212101, 0x00222000, 0x00222002, 0x00222200, 0x00222202,
- 0x01202101, 0x01212001, 0x01212100, 0x01222101, 0x02202000, 0x02202002, 0x02202200, 0x02202202,
- 0x02222000, 0x02222002, 0x02222200, 0x02222202, 0x00202211, 0x00212011, 0x00212110, 0x00212211,
- 0x00222111, 0x01202112, 0x01202211, 0x01212012, 0x01212111, 0x01222011, 0x01222110, 0x01222112,
- 0x01222211, 0x02202111, 0x02212010, 0x02212112, 0x02212211, 0x02222110, 0x02222111, 0x00202020,
- 0x00202022, 0x00202220, 0x00202222, 0x00222020, 0x00222022, 0x00222220, 0x00222222, 0x01202121,
- 0x01212021, 0x01212122, 0x01212221, 0x01222121, 0x02202020, 0x02202022, 0x02202220, 0x02202222,
- 0x02212121, 0x02222020, 0x02222022, 0x02222220, 0x02222222, 0x10000101, 0x10010001, 0x10010102,
- 0x10020101, 0x11000201, 0x11010002, 0x11010101, 0x11010200, 0x11010202, 0x11020001, 0x11020100,
- 0x11020102, 0x12010100, 0x12010201, 0x12020001, 0x12020102, 0x10000010, 0x10000011, 0x10000110,
- 0x10000112, 0x10000211, 0x10010012, 0x10010111, 0x10010112, 0x10010210, 0x10010212, 0x10020011,
- 0x10020112, 0x10020211, 0x11000111, 0x11000210, 0x11000212, 0x11010011, 0x11010110, 0x11010111,
- 0x11010112, 0x11010211, 0x11010212, 0x11020111, 0x11020210, 0x11020212, 0x12000011, 0x12000110,
- 0x12000112, 0x12010010, 0x12010012, 0x12010111, 0x12020010, 0x12020011, 0x12020012, 0x10000121,
- 0x10010021, 0x10010120, 0x10010122, 0x10020121, 0x11000021, 0x11010022, 0x11010121, 0x11010222,
- 0x11020120, 0x11020221, 0x12000221, 0x12010120, 0x12020121, 0x10001001, 0x10011101, 0x10011201,
- 0x10021201, 0x11001101, 0x11001200, 0x11001202, 0x11011001, 0x11011100, 0x11011101, 0x11011102,
- 0x11021001, 0x11021002, 0x11021101, 0x11021200, 0x11021202, 0x12001001, 0x12001102, 0x12001201,
- 0x12011000, 0x12011002, 0x12011101, 0x12021000, 0x12021001, 0x12021201, 0x10001011, 0x10001012,
- 0x10001111, 0x10001212, 0x10011011, 0x10011110, 0x10011111, 0x10011112, 0x10011211, 0x10021010,
- 0x10021111, 0x10021212, 0x11001011, 0x11001110, 0x11001111, 0x11001112, 0x11001211, 0x11011010,
- 0x11011011, 0x11011110, 0x11011111, 0x11011112, 0x11011210, 0x11011211, 0x11021011, 0x11021110,
- 0x11021111, 0x11021112, 0x11021211, 0x12001012, 0x12001110, 0x12001111, 0x12001210, 0x12011011,
- 0x12011110, 0x12011111, 0x12011112, 0x12011211, 0x12011212, 0x12021111, 0x12021210, 0x12021212,
- 0x10001021, 0x10001121, 0x10001221, 0x10011120, 0x10011121, 0x10011220, 0x10011222, 0x10021021,
- 0x10021120, 0x10021221, 0x11001020, 0x11001022, 0x11001121, 0x11001220, 0x11011020, 0x11011021,
- 0x11011022, 0x11011121, 0x11011122, 0x11011221, 0x11021022, 0x11021121, 0x11021220, 0x12001021,
- 0x12001121, 0x12001222, 0x12011120, 0x12011121, 0x12021021, 0x12021120, 0x12021122, 0x10002101,
- 0x10012001, 0x10012101, 0x10012202, 0x10022101, 0x11002002, 0x11002201, 0x11012000, 0x11012101,
- 0x11012200, 0x11022001, 0x11022100, 0x11022102, 0x11022201, 0x12002101, 0x12012001, 0x12012100,
- 0x12012102, 0x12012201, 0x12022101, 0x10002011, 0x10002111, 0x10002112, 0x10002212, 0x10012010,
- 0x10012110, 0x10012111, 0x10012210, 0x10022011, 0x10022110, 0x10022112, 0x11002010, 0x11002111,
- 0x11002212, 0x11012011, 0x11012012, 0x11012110, 0x11012111, 0x11012112, 0x11012211, 0x11022010,
- 0x11022012, 0x11022111, 0x11022112, 0x11022212, 0x12002112, 0x12002211, 0x12012012, 0x12012111,
- 0x12012112, 0x12012210, 0x12022011, 0x12022110, 0x12022112, 0x12022211, 0x10012122, 0x11002120,
- 0x11002122, 0x11002221, 0x11012121, 0x11012220, 0x11012222, 0x11022120, 0x11022221, 0x12012120,
- 0x12022121, 0x10100001, 0x10100100, 0x10100101, 0x10100102, 0x10100201, 0x10110002, 0x10110101,
- 0x10110202, 0x10120001, 0x10120100, 0x10120201, 0x11100000, 0x11100101, 0x11100200, 0x11110001,
- 0x11110100, 0x11110101, 0x11110102, 0x11110201, 0x11120101, 0x11120200, 0x12100102, 0x12100201,
- 0x12110101, 0x12110200, 0x12120000, 0x12120001, 0x12120102, 0x12120201, 0x10100111, 0x10100210,
- 0x10100211, 0x10100212, 0x10110011, 0x10110110, 0x10110111, 0x10110112, 0x10110210, 0x10110211,
- 0x10120010, 0x10120111, 0x10120112, 0x10120210, 0x10120212, 0x11100011, 0x11100110, 0x11100111,
- 0x11100112, 0x11100211, 0x11110010, 0x11110011, 0x11110012, 0x11110110, 0x11110111, 0x11110112,
- 0x11110210, 0x11110211, 0x11110212, 0x11120011, 0x11120110, 0x11120111, 0x11120112, 0x11120211,
- 0x12100012, 0x12100111, 0x12110011, 0x12110110, 0x12110111, 0x12110112, 0x12110211, 0x12120010,
- 0x12120111, 0x12120212, 0x10100021, 0x10100122, 0x10110022, 0x10110121, 0x10110222, 0x10120021,
- 0x10120120, 0x11100022, 0x11100121, 0x11100222, 0x11110021, 0x11110120, 0x11110121, 0x11110122,
- 0x11110221, 0x11120022, 0x11120121, 0x12100121, 0x12110020, 0x12110022, 0x12110121, 0x12110221,
- 0x12110222, 0x12120120, 0x10101100, 0x10101101, 0x10111001, 0x10111100, 0x10111101, 0x10111102,
- 0x10111200, 0x10111201, 0x10121001, 0x10121101, 0x10121200, 0x10121202, 0x11101001, 0x11101100,
- 0x11101101, 0x11101102, 0x11101201, 0x11101202, 0x11111000, 0x11111001, 0x11111100, 0x11111101,
- 0x11111102, 0x11111200, 0x11111201, 0x11111202, 0x11121001, 0x11121002, 0x11121100, 0x11121101,
- 0x11121102, 0x11121201, 0x12101000, 0x12101200, 0x12101202, 0x12111001, 0x12111100, 0x12111101,
- 0x12111102, 0x12111201, 0x12121001, 0x12121100, 0x12121101, 0x12121202, 0x10101011, 0x10101012,
- 0x10101110, 0x10101111, 0x10101112, 0x10101211, 0x10111010, 0x10111011, 0x10111012, 0x10111110,
- 0x10111111, 0x10111112, 0x10111211, 0x10111212, 0x10121011, 0x10121110, 0x10121111, 0x10121112,
- 0x10121211, 0x11101010, 0x11101011, 0x11101012, 0x11101110, 0x11101111, 0x11101112, 0x11101210,
- 0x11101211, 0x11111010, 0x11111011, 0x11111012, 0x11111110, 0x11111111, 0x11111112, 0x11111210,
- 0x11111211, 0x11111212, 0x11121010, 0x11121011, 0x11121110, 0x11121111, 0x11121112, 0x11121210,
- 0x11121211, 0x11121212, 0x12101011, 0x12101110, 0x12101111, 0x12101211, 0x12101212, 0x12111010,
- 0x12111011, 0x12111110, 0x12111111, 0x12111112, 0x12111210, 0x12111211, 0x12121011, 0x12121110,
- 0x12121111, 0x12121112, 0x12121211, 0x10101020, 0x10101021, 0x10101022, 0x10101120, 0x10101122,
- 0x10101220, 0x10101221, 0x10111021, 0x10111120, 0x10111121, 0x10111220, 0x10111221, 0x10121020,
- 0x10121021, 0x10121022, 0x10121120, 0x10121121, 0x10121122, 0x10121220, 0x10121221, 0x11101021,
- 0x11101121, 0x11101122, 0x11101220, 0x11101221, 0x11101222, 0x11111020, 0x11111021, 0x11111022,
- 0x11111120, 0x11111121, 0x11111122, 0x11111220, 0x11111221, 0x11111222, 0x11121021, 0x11121120,
- 0x11121121, 0x11121221, 0x12101022, 0x12101121, 0x12101122, 0x12101220, 0x12101221, 0x12101222,
- 0x12111021, 0x12111121, 0x12111222, 0x12121022, 0x12121121, 0x12121122, 0x12121220, 0x12121221,
- 0x10102100, 0x10102101, 0x10102102, 0x10102201, 0x10112000, 0x10112101, 0x10112200, 0x10122001,
- 0x10122202, 0x11102101, 0x11102200, 0x11102202, 0x11112001, 0x11112100, 0x11112101, 0x11112102,
- 0x11112200, 0x11112201, 0x11122000, 0x11122002, 0x11122100, 0x11122101, 0x12102002, 0x12102201,
- 0x12112000, 0x12112002, 0x12112101, 0x12112200, 0x12122001, 0x12122201, 0x10102011, 0x10102012,
- 0x10102111, 0x10102212, 0x10112011, 0x10112110, 0x10112111, 0x10112112, 0x10112211, 0x10122111,
- 0x11102011, 0x11102110, 0x11102111, 0x11102112, 0x11102211, 0x11112010, 0x11112011, 0x11112012,
- 0x11112110, 0x11112111, 0x11112112, 0x11112210, 0x11112211, 0x11112212, 0x11122011, 0x11122110,
- 0x11122111, 0x11122112, 0x11122211, 0x12102011, 0x12102111, 0x12102211, 0x12112011, 0x12112110,
- 0x12112111, 0x12112112, 0x12112210, 0x12112211, 0x12122111, 0x10102120, 0x10102220, 0x10112121,
- 0x10112222, 0x10122020, 0x10122121, 0x10122122, 0x10122221, 0x11102121, 0x11102220, 0x11102221,
- 0x11112021, 0x11112121, 0x11112122, 0x11112220, 0x11112221, 0x11122022, 0x11122121, 0x11122220,
- 0x11122222, 0x12102021, 0x12102222, 0x12112022, 0x12112121, 0x12112122, 0x12112220, 0x12112222,
- 0x12122021, 0x10200101, 0x10210100, 0x10210102, 0x10210201, 0x10220101, 0x11200100, 0x11210000,
- 0x11210101, 0x11210102, 0x11210200, 0x11210202, 0x11220001, 0x11220100, 0x11220102, 0x11220201,
- 0x12200001, 0x12210102, 0x12220101, 0x10200011, 0x10200110, 0x10200112, 0x10200211, 0x10210012,
- 0x10210111, 0x10220011, 0x10220012, 0x10220112, 0x10220211, 0x11200111, 0x11200211, 0x11210011,
- 0x11210111, 0x11210112, 0x11210211, 0x11220111, 0x11220112, 0x11220212, 0x12200110, 0x12200212,
- 0x12210012, 0x12210111, 0x12220011, 0x12220112, 0x12220211, 0x10210021, 0x10210122, 0x10210221,
- 0x11200020, 0x11200021, 0x11200122, 0x11210121, 0x11210122, 0x11210220, 0x11220020, 0x12200121,
- 0x12210021, 0x12210122, 0x12220121, 0x10211001, 0x10211002, 0x10211101, 0x10211102, 0x10211202,
- 0x10221001, 0x10221102, 0x10221201, 0x11201000, 0x11201002, 0x11201101, 0x11201200, 0x11201202,
- 0x11211001, 0x11211100, 0x11211101, 0x11211102, 0x11211201, 0x11211202, 0x11221000, 0x11221002,
- 0x11221101, 0x12201100, 0x12201101, 0x12201201, 0x12211000, 0x12211002, 0x12211100, 0x12211101,
- 0x12211102, 0x12211200, 0x12211202, 0x12221001, 0x12221100, 0x12221201, 0x10201111, 0x10201210,
- 0x10201212, 0x10211011, 0x10211111, 0x10211112, 0x10211211, 0x11201110, 0x11201111, 0x11201112,
- 0x11201211, 0x11211010, 0x11211011, 0x11211110, 0x11211111, 0x11211112, 0x11211211, 0x11221011,
- 0x11221110, 0x11221111, 0x11221112, 0x11221211, 0x12201112, 0x12201211, 0x12201212, 0x12211011,
- 0x12211111, 0x12211112, 0x12211211, 0x12211212, 0x12221012, 0x12221111, 0x12221112, 0x12221210,
- 0x10201022, 0x10201221, 0x10211121, 0x10221020, 0x10221122, 0x10221220, 0x10221221, 0x11201020,
- 0x11201121, 0x11201220, 0x11201222, 0x11211021, 0x11211120, 0x11211121, 0x11211122, 0x11211220,
- 0x11211222, 0x11221020, 0x11221121, 0x11221220, 0x12201020, 0x12201022, 0x12201121, 0x12201222,
- 0x12211120, 0x12211122, 0x12211220, 0x12211221, 0x12221020, 0x12221120, 0x12221122, 0x12221222,
- 0x10212102, 0x10212201, 0x10222101, 0x11202001, 0x11212002, 0x11212101, 0x11212202, 0x11222001,
- 0x11222201, 0x12202101, 0x12212001, 0x12212200, 0x12222102, 0x10202011, 0x10202110, 0x10212010,
- 0x10212111, 0x10222011, 0x10222110, 0x10222112, 0x10222211, 0x11202010, 0x11202011, 0x11202111,
- 0x11202112, 0x11202210, 0x11212011, 0x11212110, 0x11212111, 0x11212112, 0x11212211, 0x11222010,
- 0x11222111, 0x11222212, 0x12202012, 0x12202110, 0x12202212, 0x12212111, 0x12222011, 0x12222110,
- 0x12222111, 0x12222211, 0x10212021, 0x10212122, 0x10212220, 0x11202021, 0x11202120, 0x11202221,
- 0x11212020, 0x11212121, 0x11212220, 0x11212222, 0x11222120, 0x11222121, 0x11222221, 0x12202122,
- 0x12212120, 0x12212220, 0x12212222, 0x12222122, 0x20000000, 0x20000002, 0x20000200, 0x20000202,
- 0x20020000, 0x20020002, 0x20020200, 0x20020202, 0x21000101, 0x21010000, 0x21010001, 0x21010100,
- 0x21010102, 0x21010201, 0x21020101, 0x22000000, 0x22000002, 0x22000200, 0x22000202, 0x22010101,
- 0x22020000, 0x22020002, 0x22020200, 0x22020202, 0x20000111, 0x20010011, 0x20010110, 0x20010112,
- 0x20010211, 0x20020111, 0x21000011, 0x21000110, 0x21000211, 0x21010010, 0x21010012, 0x21010111,
- 0x21010112, 0x21010210, 0x21010211, 0x21020110, 0x21020112, 0x21020211, 0x22000111, 0x22000211,
- 0x22010110, 0x22010112, 0x22010211, 0x22020111, 0x20000020, 0x20000022, 0x20000220, 0x20000222,
- 0x20010121, 0x20020020, 0x20020022, 0x20020220, 0x20020222, 0x21010021, 0x21010120, 0x21010221,
- 0x21020121, 0x22000020, 0x22000022, 0x22000220, 0x22000222, 0x22010121, 0x22020020, 0x22020022,
- 0x22020220, 0x22020222, 0x20011100, 0x20011201, 0x21001001, 0x21001100, 0x21011001, 0x21011101,
- 0x21011202, 0x21021001, 0x21021100, 0x21021201, 0x22011100, 0x22011201, 0x20001011, 0x20001211,
- 0x20011012, 0x20011111, 0x20011212, 0x20021112, 0x20021211, 0x21001010, 0x21001011, 0x21001111,
- 0x21001210, 0x21011011, 0x21011110, 0x21011111, 0x21011112, 0x21011211, 0x21011212, 0x21021111,
- 0x21021112, 0x21021210, 0x21021212, 0x22001011, 0x22001110, 0x22001112, 0x22001211, 0x22011010,
- 0x22011012, 0x22011111, 0x22011210, 0x22021112, 0x20011021, 0x20011122, 0x20011221, 0x20021121,
- 0x21001021, 0x21001120, 0x21001221, 0x21001222, 0x21011020, 0x21011121, 0x21011221, 0x21011222,
- 0x21021021, 0x21021122, 0x21021222, 0x22001121, 0x22011021, 0x22011222, 0x22021120, 0x20002000,
- 0x20002002, 0x20002200, 0x20002202, 0x20012101, 0x20022000, 0x20022002, 0x20022200, 0x20022202,
- 0x21002001, 0x21002101, 0x21012001, 0x21012100, 0x21012201, 0x21022101, 0x21022201, 0x22002000,
- 0x22002002, 0x22002200, 0x22002202, 0x22012101, 0x22022000, 0x22022002, 0x22022200, 0x22022202,
- 0x20002111, 0x20002112, 0x20012011, 0x20012110, 0x20012112, 0x20022111, 0x21002011, 0x21002110,
- 0x21002112, 0x21002211, 0x21012010, 0x21012012, 0x21012111, 0x21012212, 0x21022011, 0x21022110,
- 0x22002111, 0x22012112, 0x22012211, 0x22022111, 0x20002020, 0x20002022, 0x20002220, 0x20002222,
- 0x20012121, 0x20022020, 0x20022022, 0x20022220, 0x20022222, 0x21002121, 0x21012021, 0x21012120,
- 0x21012122, 0x22002020, 0x22002022, 0x22002220, 0x22002222, 0x22012121, 0x22022020, 0x22022022,
- 0x22022220, 0x22022222, 0x20100101, 0x20110001, 0x20110102, 0x20110200, 0x20110201, 0x20120101,
- 0x21100001, 0x21100102, 0x21100201, 0x21110101, 0x21110200, 0x21110202, 0x21120201, 0x21120202,
- 0x22100101, 0x22110001, 0x22110100, 0x22110102, 0x22110201, 0x22120101, 0x20100011, 0x20100110,
- 0x20100112, 0x20100211, 0x20110010, 0x20110111, 0x20110210, 0x20110212, 0x20120011, 0x20120110,
- 0x20120112, 0x20120211, 0x21100010, 0x21100111, 0x21110010, 0x21110011, 0x21110110, 0x21110111,
- 0x21110112, 0x21110211, 0x21120012, 0x21120111, 0x22100110, 0x22100112, 0x22110012, 0x22110111,
- 0x22110210, 0x22120011, 0x22120110, 0x22120112, 0x22120211, 0x20100121, 0x20110021, 0x20110120,
- 0x20110221, 0x20120121, 0x21100120, 0x21100122, 0x21100221, 0x21110020, 0x21110022, 0x21110121,
- 0x21110220, 0x21120122, 0x21120221, 0x22100121, 0x22110120, 0x22110122, 0x22120221, 0x20101001,
- 0x20101100, 0x20101102, 0x20111000, 0x20111101, 0x20111200, 0x20121102, 0x21101000, 0x21101202,
- 0x21111001, 0x21111100, 0x21111101, 0x21111102, 0x21111200, 0x21111201, 0x21121000, 0x21121001,
- 0x21121002, 0x21121101, 0x22101100, 0x22101102, 0x22111002, 0x22111100, 0x22111101, 0x22111200,
- 0x22121001, 0x22121201, 0x20101010, 0x20101111, 0x20101210, 0x20101212, 0x20111010, 0x20111011,
- 0x20111110, 0x20111111, 0x20111112, 0x20111211, 0x20121011, 0x20121111, 0x20121211, 0x20121212,
- 0x21101011, 0x21101110, 0x21101111, 0x21101112, 0x21101211, 0x21111010, 0x21111011, 0x21111012,
- 0x21111110, 0x21111111, 0x21111112, 0x21111210, 0x21111211, 0x21111212, 0x21121011, 0x21121110,
- 0x21121111, 0x21121112, 0x21121211, 0x22101011, 0x22101111, 0x22101210, 0x22111011, 0x22111012,
- 0x22111110, 0x22111111, 0x22111112, 0x22111211, 0x22111212, 0x22121010, 0x22121012, 0x22121111,
- 0x22121210, 0x22121212, 0x20101021, 0x20101120, 0x20111020, 0x20111121, 0x20111221, 0x20121020,
- 0x20121122, 0x20121221, 0x21101121, 0x21101220, 0x21101221, 0x21111021, 0x21111022, 0x21111121,
- 0x21111122, 0x21111221, 0x21121121, 0x21121220, 0x22101022, 0x22101120, 0x22101221, 0x22101222,
- 0x22111022, 0x22111120, 0x22111121, 0x22121120, 0x22121122, 0x22121221, 0x20102101, 0x20112102,
- 0x20112201, 0x20122101, 0x21102001, 0x21102102, 0x21112000, 0x21112002, 0x21112101, 0x21112102,
- 0x21112202, 0x21122100, 0x21122101, 0x22102101, 0x22112001, 0x22112102, 0x22112201, 0x22122101,
- 0x20102110, 0x20102112, 0x20102211, 0x20112010, 0x20112012, 0x20112111, 0x20112210, 0x20112212,
- 0x20122010, 0x20122011, 0x20122110, 0x20122112, 0x21102010, 0x21102012, 0x21102111, 0x21102210,
- 0x21102212, 0x21112011, 0x21112110, 0x21112111, 0x21112112, 0x21112211, 0x21122012, 0x21122111,
- 0x21122112, 0x21122212, 0x22102011, 0x22102110, 0x22112010, 0x22112012, 0x22112111, 0x22112212,
- 0x22122011, 0x22122112, 0x20102121, 0x20112121, 0x20122121, 0x21102120, 0x21102122, 0x21102221,
- 0x21112020, 0x21112121, 0x21112220, 0x21122021, 0x22102121, 0x22112021, 0x22112120, 0x22112121,
- 0x22112122, 0x20200000, 0x20200002, 0x20200200, 0x20200202, 0x20210101, 0x20220000, 0x20220002,
- 0x20220200, 0x20220202, 0x21200101, 0x21210001, 0x21210100, 0x21210102, 0x21210201, 0x22200000,
- 0x22200002, 0x22200200, 0x22200202, 0x22210101, 0x22220000, 0x22220002, 0x22220200, 0x22220202,
- 0x20200111, 0x20200211, 0x20210011, 0x20210110, 0x20210112, 0x20210211, 0x20210212, 0x21200112,
- 0x21200211, 0x21210011, 0x21210111, 0x21210210, 0x21210212, 0x21220011, 0x21220110, 0x22200111,
- 0x22210010, 0x22210012, 0x22210112, 0x22210211, 0x20200022, 0x20200220, 0x20200222, 0x20210020,
- 0x20210221, 0x20220022, 0x20220220, 0x20220222, 0x21200121, 0x21210021, 0x21210122, 0x21210221,
- 0x21220121, 0x22200020, 0x22200022, 0x22200220, 0x22200222, 0x22210121, 0x22220020, 0x22220022,
- 0x22220220, 0x22220222, 0x20211201, 0x20221101, 0x21201001, 0x21201100, 0x21211000, 0x21211100,
- 0x21211101, 0x21211200, 0x21211202, 0x21221001, 0x21221101, 0x21221102, 0x21221200, 0x21221201,
- 0x22201101, 0x20201112, 0x20201211, 0x20211010, 0x20211012, 0x20211111, 0x20211210, 0x20221112,
- 0x20221211, 0x21201012, 0x21201111, 0x21211011, 0x21211110, 0x21211111, 0x21211112, 0x21211211,
- 0x21221111, 0x21221212, 0x22201011, 0x22201110, 0x22201111, 0x22201112, 0x22201211, 0x22211012,
- 0x22211111, 0x22211210, 0x20201121, 0x20211021, 0x20211122, 0x20211222, 0x20221021, 0x20221121,
- 0x21201120, 0x21201122, 0x21201222, 0x21211022, 0x21211121, 0x21211122, 0x21211220, 0x21221020,
- 0x21221022, 0x22201122, 0x22211020, 0x22211121, 0x22211122, 0x22211221, 0x22221021, 0x22221120,
- 0x22221122, 0x20202000, 0x20202002, 0x20202200, 0x20202202, 0x20222000, 0x20222002, 0x20222200,
- 0x20222202, 0x21212001, 0x21212100, 0x21212102, 0x21212201, 0x22202000, 0x22202002, 0x22202200,
- 0x22202202, 0x22212101, 0x22222000, 0x22222002, 0x22222200, 0x22222202, 0x20202111, 0x20212110,
- 0x20212211, 0x20222011, 0x20222111, 0x21202011, 0x21212010, 0x21212111, 0x21212212, 0x21222011,
- 0x21222112, 0x21222211, 0x22212010, 0x22212112, 0x20202020, 0x20202022, 0x20202220, 0x20202222,
- 0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020,
- 0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222,
-GGML_TABLE_END()
-#endif
-
-#endif // GGML_COMMON_IMPL
-#endif // GGML_COMMON_IMPL
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
deleted file mode 100644
index d277104d121..00000000000
--- a/ggml-cuda.cu
+++ /dev/null
@@ -1,2756 +0,0 @@
-#include "ggml-cuda.h"
-#include "ggml.h"
-#include "ggml-backend-impl.h"
-
-#include "ggml-cuda/common.cuh"
-#include "ggml-cuda/acc.cuh"
-#include "ggml-cuda/alibi.cuh"
-#include "ggml-cuda/arange.cuh"
-#include "ggml-cuda/argsort.cuh"
-#include "ggml-cuda/binbcast.cuh"
-#include "ggml-cuda/clamp.cuh"
-#include "ggml-cuda/concat.cuh"
-#include "ggml-cuda/convert.cuh"
-#include "ggml-cuda/cpy.cuh"
-#include "ggml-cuda/diagmask.cuh"
-#include "ggml-cuda/dmmv.cuh"
-#include "ggml-cuda/getrows.cuh"
-#include "ggml-cuda/im2col.cuh"
-#include "ggml-cuda/mmq.cuh"
-#include "ggml-cuda/mmvq.cuh"
-#include "ggml-cuda/norm.cuh"
-#include "ggml-cuda/pad.cuh"
-#include "ggml-cuda/pool2d.cuh"
-#include "ggml-cuda/quantize.cuh"
-#include "ggml-cuda/rope.cuh"
-#include "ggml-cuda/scale.cuh"
-#include "ggml-cuda/softmax.cuh"
-#include "ggml-cuda/sumrows.cuh"
-#include "ggml-cuda/tsembd.cuh"
-#include "ggml-cuda/unary.cuh"
-#include "ggml-cuda/upscale.cuh"
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include