diff --git a/010P00002405F02D94-1.jpg b/010P00002405F02D94-1.jpg new file mode 100644 index 00000000..8097d8f4 Binary files /dev/null and b/010P00002405F02D94-1.jpg differ diff --git a/include/infinicore_infer.h b/include/infinicore_infer.h index 0bed7bc7..5b2ceb99 100644 --- a/include/infinicore_infer.h +++ b/include/infinicore_infer.h @@ -4,7 +4,11 @@ #include "infinicore_infer/cache.h" #include "infinicore_infer/weights_loader.h" + #include "infinicore_infer/models/deepseek.h" #include "infinicore_infer/models/jiuge.h" +#include "infinicore_infer/models/jiuge_awq.h" +#include "infinicore_infer/models/qwen3vl.h" + #endif /* INFINICORE_INFER_H */ diff --git a/include/infinicore_infer/models/qwen3vl.h b/include/infinicore_infer/models/qwen3vl.h new file mode 100644 index 00000000..ee3d59a2 --- /dev/null +++ b/include/infinicore_infer/models/qwen3vl.h @@ -0,0 +1,203 @@ +#ifndef QWEN3VL_WEIGHTS_H +#define QWEN3VL_WEIGHTS_H + +#include +#include +#include + +#include +#include + +struct Qwen3vlWeights; + +// Function pointer signatures +typedef void (*qwen3vl_load_global_fn)(Qwen3vlWeights *, void *cpu_ptr); +typedef void (*qwen3vl_load_layer_fn)(Qwen3vlWeights *, void *cpu_ptr, size_t layer_id); +// Struct containing all weight loading functions +typedef struct { + // Global + qwen3vl_load_global_fn load_input_embd; + qwen3vl_load_global_fn load_output_norm; + qwen3vl_load_global_fn load_output_embd; + + // Attention + qwen3vl_load_layer_fn load_attn_norm; + qwen3vl_load_layer_fn load_attn_q_norm; + qwen3vl_load_layer_fn load_attn_k_norm; + qwen3vl_load_layer_fn load_attn_qkv_proj; + qwen3vl_load_layer_fn load_attn_o_proj; + + // MLP + qwen3vl_load_layer_fn load_mlp_norm; + qwen3vl_load_layer_fn load_mlp_gate_up; + qwen3vl_load_layer_fn load_mlp_down; + +} Qwen3vlLangWeightLoader; + +typedef struct { + // Patch_embed + qwen3vl_load_global_fn load_patch_embed_weight; + qwen3vl_load_global_fn load_patch_embed_bias; + qwen3vl_load_global_fn load_pos_embed_weight; + + // blocks attn + qwen3vl_load_layer_fn load_attn_proj_weight; + qwen3vl_load_layer_fn load_attn_proj_bias; + qwen3vl_load_layer_fn load_attn_qkv_weight; + qwen3vl_load_layer_fn load_attn_qkv_bias; + + //block mlp + qwen3vl_load_layer_fn load_mlp_linear_fc1_weight; + qwen3vl_load_layer_fn load_mlp_linear_fc1_bias; + qwen3vl_load_layer_fn load_mlp_linear_fc2_weight; + qwen3vl_load_layer_fn load_mlp_linear_fc2_bias; + + //block norm + qwen3vl_load_layer_fn load_norm1_weight; + qwen3vl_load_layer_fn load_norm1_bias; + qwen3vl_load_layer_fn load_norm2_weight; + qwen3vl_load_layer_fn load_norm2_bias; + + //deepstack_merger + qwen3vl_load_layer_fn load_deepstack_merger_linear_fc1_weight; + qwen3vl_load_layer_fn load_deepstack_merger_linear_fc1_bias; + qwen3vl_load_layer_fn load_deepstack_merger_linear_fc2_weight; + qwen3vl_load_layer_fn load_deepstack_merger_linear_fc2_bias; + qwen3vl_load_layer_fn load_deepstack_merger_norm_weight; + qwen3vl_load_layer_fn load_deepstack_merger_norm_bias; + + //merger + qwen3vl_load_global_fn load_merger_linear_fc1_weight; + qwen3vl_load_global_fn load_merger_linear_fc1_bias; + qwen3vl_load_global_fn load_merger_linear_fc2_weight; + qwen3vl_load_global_fn load_merger_linear_fc2_bias; + qwen3vl_load_global_fn load_merger_norm_weight; + qwen3vl_load_global_fn load_merger_norm_bias; + +} Qwen3vlVisWeightLoader; + +typedef struct { + Qwen3vlLangWeightLoader lang_loader; + Qwen3vlVisWeightLoader vis_loader; +} Qwen3vlWeightLoader; + +struct Qwen3vlModel; + +typedef struct { + size_t bos_token_id; + size_t 
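Editor's note: the loader tables above are plain structs of C function pointers that the engine fills in, which the Python side then drives through ctypes. A minimal sketch of that calling pattern, with names mirroring the typedefs above and placeholder pointer values (not the real binding, which appears later in scripts/libinfinicore_infer/qwen3vl.py):

```python
import ctypes

# Mirrors: typedef void (*qwen3vl_load_layer_fn)(Qwen3vlWeights *, void *, size_t)
LoadLayerFn = ctypes.CFUNCTYPE(None, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t)

class LangLoader(ctypes.Structure):
    # one slot shown for brevity; the real struct lists every load_* entry
    _fields_ = [("load_attn_norm", LoadLayerFn)]

def load_layer(loader: LangLoader, weights_ptr, host_bytes: bytes, layer_id: int):
    # stage the tensor bytes on the host, then call through the C function pointer
    buf = ctypes.create_string_buffer(host_bytes, len(host_bytes))
    loader.load_attn_norm(weights_ptr, ctypes.cast(buf, ctypes.c_void_p), layer_id)
```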
eos_token_id;
+    size_t head_dim;
+    size_t hidden_size;
+    float initializer_range;
+    size_t intermediate_size;
+    size_t max_tokens;
+    size_t num_attention_heads;
+    size_t num_hidden_layers;
+    size_t num_key_value_heads;
+    float rms_norm_eps;
+    size_t mrope_section[3];
+    size_t rope_theta;
+    size_t vocab_size;
+} Qwen3vlTextMeta;
+
+typedef struct {
+    size_t depth;
+    size_t deepstack_visual_indexes[3];
+    size_t hidden_size;
+    size_t in_channels;
+    float initializer_range;
+    size_t intermediate_size;
+    size_t num_heads;
+    size_t num_position_embeddings;
+    size_t out_hidden_size;
+    size_t patch_size;
+    size_t spatial_merge_size;
+    size_t temporal_patch_size;
+} Qwen3vlVisMeta;
+
+typedef struct {
+    infiniDtype_t dtype; // INFINI_DTYPE_BF16
+
+    Qwen3vlTextMeta text_meta;
+    Qwen3vlVisMeta vis_meta;
+
+    size_t image_token_id;
+    size_t video_token_id;
+    size_t vision_end_token_id;
+    size_t vision_start_token_id;
+} Qwen3vlMeta;
+
+//////////////////// APIs ///////////////////////
+/// @brief Create the model
+/// @param device device (coprocessor) type
+/// @param ndev number of devices
+/// @param dev_ids device ids, length ndev
+__C __export struct Qwen3vlModel *
+createQwen3vlModel(const Qwen3vlMeta *,
+                   const Qwen3vlWeights *);
+
+__C Qwen3vlWeights *
+createQwen3vlWeights(const Qwen3vlMeta *meta,
+                     infiniDevice_t device,
+                     int ndev,
+                     const int *dev_ids,
+                     bool transpose_weight);
+
+__C __export Qwen3vlWeightLoader *
+createQwen3vlWeightLoader();
+
+/// @brief Destroy the model
+__C __export void destroyQwen3vlModel(struct Qwen3vlModel *);
+
+__C __export struct Qwen3vlCache *
+createQwen3vlCache(const struct Qwen3vlModel *);
+
+__C __export void
+dropQwen3vlCache(const struct Qwen3vlModel *,
+                 struct Qwen3vlCache *);
+
+/// @brief Run one round of batched inference and sample a new token for each request
+/// @param tokens pointer to the input tokens
+/// @param ntok number of input tokens
+/// @param nreq number of requests
+/// @param req_lens number of tokens in each request
+/// @param req_pos starting position of each request
+/// @param kv_caches KV cache of each request
+/// @param temperature sampling temperature (0 means greedy sampling)
+/// @param topk sampling top-k (1 means greedy sampling)
+/// @param topp sampling top-p
+/// @param output output token array, one token per request, length at least nreq
+__C __export void
+inferBatchQwen3vl(struct Qwen3vlModel *,
+                  const uint32_t *tokens, uint32_t ntok,
+                  void *pixel_values, uint32_t total_patches,
+                  uint32_t *image_grid_thw, uint32_t num_images,
+                  void *pixel_values_videos, uint32_t total_patches_videos,
+                  uint32_t *video_grid_thw, uint32_t num_videos,
+                  uint32_t patch_features,
+                  const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
+                  struct Qwen3vlCache **caches,
+                  const float *temperature, const uint32_t *topk, const float *topp,
+                  uint32_t *output);
+
+/// @brief Run one round of batched inference and output the logits after the output embedding
+/// @param tokens pointer to the input tokens
+/// @param ntok number of input tokens
+/// @param nreq number of requests
+/// @param req_lens number of tokens in each request
+/// @param req_pos starting position of each request
+/// @param kv_caches KV cache of each request
+/// @param logits output logits buffer
+__C __export void
+forwardBatchQwen3vl(struct Qwen3vlModel *,
+                    const uint32_t *tokens, uint32_t ntok,
+                    void *pixel_values, uint32_t total_patches,
+                    uint32_t *image_grid_thw, uint32_t num_images,
+                    void *pixel_values_videos, uint32_t total_patches_videos,
+                    uint32_t *video_grid_thw, uint32_t num_videos,
+                    uint32_t patch_features,
+                    const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
+                    struct Qwen3vlCache **caches,
+                    void *logits);
+
+#endif // QWEN3VL_WEIGHTS_H
diff --git a/qwen3vl_test.sh b/qwen3vl_test.sh
new file mode 100755
index 00000000..c83e47aa
--- /dev/null
+++ b/qwen3vl_test.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+#SBATCH --job-name=test_job          # job name
+#SBATCH --output=output_%j.log       # stdout file (%j expands to the job ID)
+#SBATCH --error=error_%j.log         # stderr file
+#SBATCH --partition=nvidia           # partition (the cluster default is nvidia)
+#SBATCH --nodes=1                    # number of nodes
+#SBATCH --ntasks=1                   # total tasks (usually nodes x tasks per node)
+#SBATCH --cpus-per-task=8            # CPU cores per task
+#SBATCH --gres=gpu:nvidia:4          # request 4 GPUs (gres type: nvidia)
+#SBATCH --mem=32G                    # requested memory
+
+# Commands that need compute resources go below.
+# Prefer launching the main program with srun so resources are bound automatically.
+source /data/apps/env.sh
+source /data/apps/miniforge3/etc/profile.d/conda.sh
+conda activate py313
+export INFINI_ROOT=$HOME/.infini
+export LD_LIBRARY_PATH=$INFINI_ROOT/lib:$LD_LIBRARY_PATH
+export PATH="/data/apps/xmake/bin:/usr/local/cuda/bin:$PATH"
+
+export PYTHONPATH=$HOME/InfiniLM/scripts:$PYTHONPATH
+
+cd $HOME/InfiniLM
+
+#srun python scripts/qwen3vl_test.py
+#srun python scripts/qwen3vl.py --nvidia /data/shared/models/Qwen3-VL-2B-Instruct
+srun python scripts/launch_server.py --model-path /data/shared/models/Qwen3-VL-2B-Instruct --dev nvidia --ndev 4
\ No newline at end of file
diff --git a/scripts/infer_task.py b/scripts/infer_task.py
index 0d1231b7..aca61285 100644
--- a/scripts/infer_task.py
+++ b/scripts/infer_task.py
@@ -1,8 +1,9 @@
 class InferTask:
-    def __init__(self, id, tokens, max_tokens, temperature, topk, topp, end_tokens):
+    def __init__(self, id, inputs, max_tokens, temperature, topk, topp, end_tokens):
         self.id = id
         self.finish_reason = None
-        self.tokens = tokens
+        self.inputs = inputs
+        self.tokens = inputs['input_ids'][0].tolist()
         self.max_tokens = max_tokens
         self.temperature = temperature
         self.topk = topk
diff --git a/scripts/launch_server.py b/scripts/launch_server.py
index 2d231b49..d5f8a18a 100644
--- a/scripts/launch_server.py
+++ b/scripts/launch_server.py
@@ -1,5 +1,4 @@
-from jiuge import JiugeForCauslLM
-from jiuge_awq import JiugeAWQForCausalLM
+from qwen3vl import Qwen3vlForCauslLM
 from libinfinicore_infer import DeviceType
 from infer_task import InferTask
 from kvcache_pool import
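For readers new to this calling convention: `inferBatchQwen3vl` takes every request's new tokens concatenated into a single array, with per-request lengths and start positions passed alongside. A toy illustration of that layout (values are made up):

```python
# Two requests batched into one flattened call, following the header's
# tokens / ntok / req_lens / req_pos convention.
reqs = [
    {"pos": 0, "tokens": [101, 5, 7]},   # request 0: prefill, 3 new tokens
    {"pos": 12, "tokens": [42]},         # request 1: decode step, 12 tokens of history
]
tokens = [t for r in reqs for t in r["tokens"]]   # [101, 5, 7, 42]
ntok = len(tokens)                                # 4
req_lens = [len(r["tokens"]) for r in reqs]       # [3, 1]
req_pos = [r["pos"] for r in reqs]                # [0, 12]
```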
KVCachePool @@ -60,14 +59,9 @@ def parse_args(): "--max-tokens", type=int, required=False, - default=None, + default=200, help="Max token sequence length that model will handle (follows model config if not provided)", ) - parser.add_argument( - "--awq", - action="store_true", - help="Whether to use AWQ quantized model (default: False)", - ) return parser.parse_args() @@ -76,7 +70,6 @@ def parse_args(): model_path = args.model_path ndev = args.ndev max_tokens = args.max_tokens -USE_AWQ = args.awq MAX_BATCH = args.max_batch print( f"Using MAX_BATCH={MAX_BATCH}. Try reduce this value if out of memory error occurs." @@ -93,7 +86,7 @@ def chunk_json(id_, content=None, role=None, finish_reason=None): "id": id_, "object": "chat.completion.chunk", "created": int(time.time()), - "model": "jiuge", + "model": "qwen3vl", "system_fingerprint": None, "choices": [ { @@ -109,8 +102,8 @@ def chunk_json(id_, content=None, role=None, finish_reason=None): # A wrapper for InferTask that supports async output queue class AsyncInferTask(InferTask): - def __init__(self, id, tokens, max_tokens, temperature, topk, topp, end_tokens): - super().__init__(id, tokens, max_tokens, temperature, topk, topp, end_tokens) + def __init__(self, id, inputs, max_tokens, temperature, topk, topp, end_tokens): + super().__init__(id, inputs, max_tokens, temperature, topk, topp, end_tokens) self.output_queue = janus.Queue() print(f"[INFO] Create InferTask {self.id}") @@ -122,14 +115,9 @@ def output(self, out_token): @contextlib.asynccontextmanager async def lifespan(app: FastAPI): # Startup - if USE_AWQ: - app.state.model = JiugeAWQForCausalLM( - model_path, device_type, ndev, max_tokens=max_tokens - ) - else: - app.state.model = JiugeForCauslLM( - model_path, device_type, ndev, max_tokens=max_tokens - ) + app.state.model = Qwen3vlForCauslLM( + model_path, device_type, ndev, max_tokens=max_tokens + ) app.state.kv_cache_pool = KVCachePool(app.state.model, MAX_BATCH) app.state.request_queue = janus.Queue() worker_thread = threading.Thread(target=worker_loop, args=(app,), daemon=True) @@ -169,6 +157,8 @@ def worker_loop(app): batch.append(req) except queue.Empty: break + + print(f"infering {len(batch)} tasks") output_tokens = app.state.model.batch_infer_one_round(batch) for task, token in zip(batch, output_tokens): task.output(token) @@ -181,15 +171,18 @@ def worker_loop(app): def build_task(id_, request_data, request: Request): messages = request_data.get("messages", []) - input_content = request.app.state.model.tokenizer.apply_chat_template( - conversation=messages, + inputs = request.app.state.model.processor.apply_chat_template( + messages, + tokenize=True, add_generation_prompt=True, - tokenize=False, + return_dict=True, + return_tensors="pt", ) - tokens = request.app.state.model.tokenizer.encode(input_content) + inputs.pop("token_type_ids", None) + return AsyncInferTask( id_, - tokens, + inputs, request_data.get("max_tokens", request.app.state.model.max_context_len()), request_data.get("temperature", 1.0), request_data.get("top_k", 1), @@ -298,7 +291,7 @@ async def chat_completions(request: Request): if __name__ == "__main__": - uvicorn.run(App, host="0.0.0.0", port=8000) + uvicorn.run(App, host="0.0.0.0", port=8008) """ curl -N -H "Content-Type: application/json" \ diff --git a/scripts/libinfinicore_infer/__init__.py b/scripts/libinfinicore_infer/__init__.py index 8fc5f4db..0661d865 100644 --- a/scripts/libinfinicore_infer/__init__.py +++ b/scripts/libinfinicore_infer/__init__.py @@ -6,7 +6,17 @@ DeepSeekV3MetaCStruct, 
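The server now hands the entire processor output to InferTask rather than a bare token list. A standalone sketch of what the HF processor returns in this mode (the model path is the one used elsewhere in this patch; the multimodal key shapes follow the comments in scripts/qwen3vl.py):

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("/data/shared/models/Qwen3-VL-2B-Instruct")
messages = [{"role": "user", "content": [{"type": "text", "text": "hi"}]}]
inputs = processor.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True,
    return_dict=True, return_tensors="pt",
)
inputs.pop("token_type_ids", None)  # dropped, as in build_task
# Text-only chat yields input_ids / attention_mask; when an image part is
# present the dict also carries pixel_values (total_patches, patch_features)
# and image_grid_thw (num_images, 3) with per-image (t, h, w) patch grids,
# which is exactly what the C API above consumes.
```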
DeepSeekV3WeightsCStruct, DeepSeekV3WeightLoaderCStruct, - DeepSeekV3CacheCStruct, +) +from .qwen3vl import ( + Qwen3vlModel, + Qwen3vlMetaCStruct, + TextMetaCStruct, + VisMetaCStruct, + Qwen3vlWeightsCStruct, + Qwen3vlWeightLoaderCStruct, + Qwen3vlVisWeightLoaderCStruct, + Qwen3vlLangWeightLoaderCStruct, + Qwen3vlCacheCStruct, ) __all__ = [ @@ -23,5 +33,14 @@ "DeepSeekV3MetaCStruct", "DeepSeekV3WeightsCStruct", "DeepSeekV3WeightLoaderCStruct", + "Qwen3vlModel", + "Qwen3vlMetaCStruct", + "TextMetaCStruct", + "VisMetaCStruct", + "Qwen3vlWeightsCStruct", + "Qwen3vlWeightLoaderCStruct", + "Qwen3vlVisWeightLoaderCStruct", + "Qwen3vlLangWeightLoaderCStruct", + "Qwen3vlCacheCStruct", "ModelRegister", ] diff --git a/scripts/libinfinicore_infer/base.py b/scripts/libinfinicore_infer/base.py index bed65b2e..93ddf0f9 100644 --- a/scripts/libinfinicore_infer/base.py +++ b/scripts/libinfinicore_infer/base.py @@ -67,4 +67,5 @@ def _load_library(self): lib_path = os.path.join( os.environ.get("INFINI_ROOT"), "lib", "libinfinicore_infer.so" ) + print("loaded infini lib!") return ctypes.CDLL(lib_path) diff --git a/scripts/libinfinicore_infer/qwen3vl.py b/scripts/libinfinicore_infer/qwen3vl.py new file mode 100644 index 00000000..bc405770 --- /dev/null +++ b/scripts/libinfinicore_infer/qwen3vl.py @@ -0,0 +1,327 @@ +from .base import BaseModel, DataType, DeviceType, KVCacheCStruct, register_model +from ctypes import ( + c_size_t, + c_uint, + c_uint16, + c_int, + c_float, + c_void_p, + c_bool, + POINTER, + Structure, + CFUNCTYPE, +) + + +class TextMetaCStruct(Structure): + _fields_ = [ + ("bos_token_id", c_size_t), + ("eos_token_id", c_size_t), + ("head_dim", c_size_t), + ("hidden_size", c_size_t), + ("initializer_range", c_float), + ("_pad1", c_float), + ("intermediate_size", c_size_t), + ("max_tokens", c_size_t), + ("num_attention_heads", c_size_t), + ("num_hidden_layers", c_size_t), + ("num_key_value_heads", c_size_t), + ("rms_norm_eps", c_float), + ("_pad2", c_float), + ("mrope_section", c_size_t * 3), + ("rope_theta", c_size_t), + ("vocab_size", c_size_t), + ] + + +class VisMetaCStruct(Structure): + _fields_ = [ + ("depth", c_size_t), + ("deepstack_visual_indexes", c_size_t * 3), + ("hidden_size", c_size_t), + ("in_channels", c_size_t), + ("initializer_range", c_float), + ("_pad1", c_float), + ("intermediate_size", c_size_t), + ("num_heads", c_size_t), + ("num_position_embeddings", c_size_t), + ("out_hidden_size", c_size_t), + ("patch_size", c_size_t), + ("spatial_merge_size", c_size_t), + ("temporal_patch_size", c_size_t), + ] + + +class Qwen3vlMetaCStruct(Structure): + _fields_ = [ + ("dtype", DataType), + ("_pad_dtype", c_uint), + ("text_meta", TextMetaCStruct), + ("vis_meta", VisMetaCStruct), + # Token ids + ("image_token_id", c_size_t), + ("video_token_id", c_size_t), + ("vision_end_token_id", c_size_t), + ("vision_start_token_id", c_size_t), + ] + + +class Qwen3vlWeightsCStruct(Structure): + pass + + +class Qwen3vlModelCStruct(Structure): + pass + + +class Qwen3vlCacheCStruct(Structure): + pass + + +load_global_fn = CFUNCTYPE(None, POINTER(Qwen3vlWeightsCStruct), c_void_p) +load_layer_fn = CFUNCTYPE(None, POINTER(Qwen3vlWeightsCStruct), c_void_p, c_size_t) + + +class Qwen3vlLangWeightLoaderCStruct(Structure): + _fields_ = [ + # Global + ("load_input_embd", load_global_fn), + ("load_output_norm", load_global_fn), + ("load_output_embd", load_global_fn), + # Attention + ("load_attn_norm", load_layer_fn), + ("load_attn_q_norm", load_layer_fn), + ("load_attn_k_norm", load_layer_fn), + 
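A note on the explicit `_pad` fields above: a 4-byte c_float followed by an 8-byte c_size_t leaves a 4-byte alignment hole on LP64 targets, and spelling the hole out keeps the Python layout visibly in lockstep with the C struct. A self-contained check (assumes a 64-bit ABI):

```python
import ctypes

class WithPad(ctypes.Structure):
    _fields_ = [("eps", ctypes.c_float), ("_pad", ctypes.c_float),
                ("n", ctypes.c_size_t)]

class WithoutPad(ctypes.Structure):
    _fields_ = [("eps", ctypes.c_float), ("n", ctypes.c_size_t)]

# ctypes inserts the hole itself either way, but the explicit field documents
# it and keeps offsets stable if packing rules are ever changed.
assert ctypes.sizeof(WithPad) == ctypes.sizeof(WithoutPad) == 16
assert WithPad.n.offset == WithoutPad.n.offset == 8
```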
("load_attn_qkv_proj", load_layer_fn), + ("load_attn_o_proj", load_layer_fn), + # MLP + ("load_mlp_norm", load_layer_fn), + ("load_mlp_gate_up", load_layer_fn), + ("load_mlp_down", load_layer_fn), + ] + + +class Qwen3vlVisWeightLoaderCStruct(Structure): + _fields_ = [ + # Patch embed + ("load_patch_embed_weight", load_global_fn), + ("load_patch_embed_bias", load_global_fn), + ("load_pos_embed_weight", load_global_fn), + # Blocks attention + ("load_attn_proj_weight", load_layer_fn), + ("load_attn_proj_bias", load_layer_fn), + ("load_attn_qkv_weight", load_layer_fn), + ("load_attn_qkv_bias", load_layer_fn), + # Blocks MLP + ("load_mlp_linear_fc1_weight", load_layer_fn), + ("load_mlp_linear_fc1_bias", load_layer_fn), + ("load_mlp_linear_fc2_weight", load_layer_fn), + ("load_mlp_linear_fc2_bias", load_layer_fn), + # Blocks norm + ("load_norm1_weight", load_layer_fn), + ("load_norm1_bias", load_layer_fn), + ("load_norm2_weight", load_layer_fn), + ("load_norm2_bias", load_layer_fn), + # Deepstack merger + ("load_deepstack_merger_linear_fc1_weight", load_layer_fn), + ("load_deepstack_merger_linear_fc1_bias", load_layer_fn), + ("load_deepstack_merger_linear_fc2_weight", load_layer_fn), + ("load_deepstack_merger_linear_fc2_bias", load_layer_fn), + ("load_deepstack_merger_norm_weight", load_layer_fn), + ("load_deepstack_merger_norm_bias", load_layer_fn), + # Merger + ("load_merger_linear_fc1_weight", load_global_fn), + ("load_merger_linear_fc1_bias", load_global_fn), + ("load_merger_linear_fc2_weight", load_global_fn), + ("load_merger_linear_fc2_bias", load_global_fn), + ("load_merger_norm_weight", load_global_fn), + ("load_merger_norm_bias", load_global_fn), + ] + + +class Qwen3vlWeightLoaderCStruct(Structure): + _fields_ = [ + ("lang_loader", Qwen3vlLangWeightLoaderCStruct), + ("vis_loader", Qwen3vlVisWeightLoaderCStruct), + ] + + +@register_model +class Qwen3vlModel(BaseModel): + @classmethod + def register_lib(cls, lib): + """Register Qwen3vl model functions with the library""" + lib.createQwen3vlWeightLoader.argtypes = [] + lib.createQwen3vlWeightLoader.restype = POINTER(Qwen3vlWeightLoaderCStruct) + + lib.createQwen3vlWeights.argtypes = [ + POINTER(Qwen3vlMetaCStruct), + DeviceType, + c_int, + POINTER(c_int), + c_bool, + ] + lib.createQwen3vlWeights.restype = POINTER(Qwen3vlWeightsCStruct) + + lib.createQwen3vlModel.argtypes = [ + POINTER(Qwen3vlMetaCStruct), + POINTER(Qwen3vlWeightsCStruct), + ] + lib.createQwen3vlModel.restype = POINTER(Qwen3vlModelCStruct) + + lib.destroyQwen3vlModel.argtypes = [POINTER(Qwen3vlModelCStruct)] + + lib.createQwen3vlCache.argtypes = [POINTER(Qwen3vlModelCStruct)] + lib.createQwen3vlCache.restype = POINTER(Qwen3vlCacheCStruct) + + lib.dropQwen3vlCache.argtypes = [ + POINTER(Qwen3vlModelCStruct), + POINTER(Qwen3vlCacheCStruct), + ] + + lib.inferBatchQwen3vl.argtypes = [ + POINTER(Qwen3vlModelCStruct), + POINTER(c_uint), + c_uint, + c_void_p, # pixel_values, + c_uint, # total_patches, + POINTER(c_uint), # image_grid_thw, + c_uint, # num_images, + c_void_p, # pixel_values_videos, + c_uint, # total_patches_videos, + POINTER(c_uint), # video_grid_thw, + c_uint, # num_videos, + c_uint, # patch_features, + POINTER(c_uint), + c_uint, + POINTER(c_uint), + POINTER(POINTER(Qwen3vlCacheCStruct)), + POINTER(c_float), + POINTER(c_uint), + POINTER(c_float), + POINTER(c_uint), + ] + + lib.forwardBatchQwen3vl.argtypes = [ + POINTER(Qwen3vlModelCStruct), + POINTER(c_uint), + c_uint, + c_void_p, # pixel_values, + c_uint, # total_patches, + POINTER(c_uint), # image_grid_thw, + 
c_uint, # num_images, + c_void_p, # pixel_values_videos, + c_uint, # total_patches_videos, + POINTER(c_uint), # video_grid_thw, + c_uint, # num_videos, + c_uint, # patch_features, + POINTER(c_uint), + c_uint, + POINTER(c_uint), + POINTER(POINTER(Qwen3vlCacheCStruct)), + c_void_p, + ] + + def create_weight_loader(self): + return self.lib.createQwen3vlWeightLoader() + + def create_weights(self, meta, device_type, ndev, dev_ids, transpose_weight): + return self.lib.createQwen3vlWeights(meta, device_type, ndev, dev_ids, transpose_weight) + + def create_model(self, meta, weights): + return self.lib.createQwen3vlModel(meta, weights) + + def destroy_model(self, model): + self.lib.destroyQwen3vlModel(model) + + def create_cache(self, model): + return self.lib.createQwen3vlCache(model) + + def drop_cache(self, model, cache): + self.lib.dropQwen3vlCache(model, cache) + + def infer_batch( + self, + model, + tokens, + ntok, + pixel_values, + total_patches, + image_grid_thw, + num_images, + pixel_values_videos, + total_patches_videos, + video_grid_thw, + num_videos, + patch_features, + req_lens, + nreq, + req_pos, + caches, + temperature, + topk, + topp, + output, + ): + self.lib.inferBatchQwen3vl( + model, + tokens, + ntok, + pixel_values, + total_patches, + image_grid_thw, + num_images, + pixel_values_videos, + total_patches_videos, + video_grid_thw, + num_videos, + patch_features, + req_lens, + nreq, + req_pos, + caches, + temperature, + topk, + topp, + output, + ) + + def forward_batch( + self, + model, + tokens, + ntok, + pixel_values, + total_patches, + image_grid_thw, + num_images, + pixel_values_videos, + total_patches_videos, + video_grid_thw, + num_videos, + patch_features, + req_lens, + nreq, + req_pos, + caches, + logits, + ): + self.lib.forwardBatchQwen3vl( + model, + tokens, + ntok, + pixel_values, + total_patches, + image_grid_thw, + num_images, + pixel_values_videos, + total_patches_videos, + video_grid_thw, + num_videos, + patch_features, + req_lens, + nreq, + req_pos, + caches, + logits, + ) \ No newline at end of file diff --git a/scripts/qwen3vl.py b/scripts/qwen3vl.py new file mode 100644 index 00000000..51433ebc --- /dev/null +++ b/scripts/qwen3vl.py @@ -0,0 +1,676 @@ +import ctypes +from typing import List, Sequence + +from tqdm import tqdm + +from libinfinicore_infer import ( + Qwen3vlModel, + Qwen3vlMetaCStruct, + TextMetaCStruct, + VisMetaCStruct, + Qwen3vlWeightsCStruct, + Qwen3vlCacheCStruct, + DataType, + DeviceType, +) +from infer_task import InferTask, KVCache + +from ctypes import POINTER, c_float, c_int, c_uint, c_uint16, c_void_p, byref, c_bool +import os +from pathlib import Path +import safetensors +import sys +import time +import json +import math +import torch +import transformers +torch.set_default_device("cpu") + + +class Qwen3vlLangWeightsNaming: + def input_embd(self): + return "model.language_model.embed_tokens.weight" + + def output_embd(self): + return "model.language_model.embed_tokens.weight" + + def output_norm(self): + return "model.language_model.norm.weight" + + def attn_norm(self, i): + return f"model.language_model.layers.{i}.input_layernorm.weight" + + def attn_q_proj(self, i): + return f"model.language_model.layers.{i}.self_attn.q_proj.weight" + + def attn_q_norm(self, i): + return f"model.language_model.layers.{i}.self_attn.q_norm.weight" + + def attn_k_proj(self, i): + return f"model.language_model.layers.{i}.self_attn.k_proj.weight" + + def attn_k_norm(self, i): + return f"model.language_model.layers.{i}.self_attn.k_norm.weight" + + def 
attn_o_proj(self, i): + return f"model.language_model.layers.{i}.self_attn.o_proj.weight" + + def attn_v_proj(self, i): + return f"model.language_model.layers.{i}.self_attn.v_proj.weight" + + def mlp_norm(self, i): + return f"model.language_model.layers.{i}.post_attention_layernorm.weight" + + def mlp_gate(self, i): + return f"model.language_model.layers.{i}.mlp.gate_proj.weight" + + def mlp_down(self, i): + return f"model.language_model.layers.{i}.mlp.down_proj.weight" + + def mlp_up(self, i): + return f"model.language_model.layers.{i}.mlp.up_proj.weight" + +class Qwen3vlVisWeightsNaming: + def patch_embed_weight(self): + return "model.visual.patch_embed.proj.weight" + def patch_embed_bias(self): + return "model.visual.patch_embed.proj.bias" + def pos_embed_weight(self): + return "model.visual.pos_embed.weight" + def attn_proj_weight(self,i): + return f"model.visual.blocks.{i}.attn.proj.weight" + def attn_proj_bias(self,i): + return f"model.visual.blocks.{i}.attn.proj.bias" + def attn_qkv_weight(self,i): + return f"model.visual.blocks.{i}.attn.qkv.weight" + def attn_qkv_bias(self,i): + return f"model.visual.blocks.{i}.attn.qkv.bias" + def mlp_linear_fc1_weight(self,i): + return f"model.visual.blocks.{i}.mlp.linear_fc1.weight" + def mlp_linear_fc1_bias(self,i): + return f"model.visual.blocks.{i}.mlp.linear_fc1.bias" + def mlp_linear_fc2_weight(self,i): + return f"model.visual.blocks.{i}.mlp.linear_fc2.weight" + def mlp_linear_fc2_bias(self,i): + return f"model.visual.blocks.{i}.mlp.linear_fc2.bias" + def norm1_weight(self,i): + return f"model.visual.blocks.{i}.norm1.weight" + def norm1_bias(self,i): + return f"model.visual.blocks.{i}.norm1.bias" + def norm2_weight(self,i): + return f"model.visual.blocks.{i}.norm2.weight" + def norm2_bias(self,i): + return f"model.visual.blocks.{i}.norm2.bias" + def deepstack_merger_linear_fc1_weight(self,i): + return f"model.visual.deepstack_merger_list.{i}.linear_fc1.weight" + def deepstack_merger_linear_fc1_bias(self,i): + return f"model.visual.deepstack_merger_list.{i}.linear_fc1.bias" + def deepstack_merger_linear_fc2_weight(self,i): + return f"model.visual.deepstack_merger_list.{i}.linear_fc2.weight" + def deepstack_merger_linear_fc2_bias(self,i): + return f"model.visual.deepstack_merger_list.{i}.linear_fc2.bias" + def deepstack_merger_norm_weight(self,i): + return f"model.visual.deepstack_merger_list.{i}.norm.weight" + def deepstack_merger_norm_bias(self,i): + return f"model.visual.deepstack_merger_list.{i}.norm.bias" + + def merger_linear_fc1_weight(self): + return "model.visual.merger.linear_fc1.weight" + def merger_linear_fc1_bias(self): + return "model.visual.merger.linear_fc1.bias" + def merger_linear_fc2_weight(self): + return "model.visual.merger.linear_fc2.weight" + def merger_linear_fc2_bias(self): + return "model.visual.merger.linear_fc2.bias" + def merger_norm_weight(self): + return "model.visual.merger.norm.weight" + def merger_norm_bias(self): + return "model.visual.merger.norm.bias" + +class Qwen3vlMeta(Qwen3vlMetaCStruct): + def __init__(self, config, max_tokens=None): + + if config['text_config']['dtype'] == 'float16': + dt_ = DataType.INFINI_DTYPE_F16 + self.torch_dtype = torch.float16 + elif config['text_config']['dtype'] == 'float32': + dt_ = DataType.INFINI_DTYPE_F32 + self.torch_dtype = torch.float32 + elif config['text_config']['dtype'] == 'bfloat16': + dt_ = DataType.INFINI_DTYPE_BF16 + self.torch_dtype = torch.bfloat16 + else: + raise ValueError(f"Unsupported text dtype: {config['text_config']['dtype']}") + + super().__init__( 
+ dtype = dt_, + image_token_id = config['image_token_id'], + video_token_id = config['video_token_id'], + vision_end_token_id = config['vision_end_token_id'], + vision_start_token_id = config['vision_start_token_id'], + text_meta = TextMetaCStruct( + bos_token_id = config['text_config']['bos_token_id'], + eos_token_id = config['text_config']['eos_token_id'], + head_dim = config['text_config']['head_dim'], + hidden_size = config['text_config']['hidden_size'], + initializer_range = config['text_config']['initializer_range'], + intermediate_size = config['text_config']['intermediate_size'], + max_tokens = (config['text_config']['max_position_embeddings'] if max_tokens is None else max_tokens), + num_attention_heads = config['text_config']['num_attention_heads'], + num_hidden_layers = config['text_config']['num_hidden_layers'], + num_key_value_heads = config['text_config']['num_key_value_heads'], + rms_norm_eps = config['text_config']['rms_norm_eps'], + mrope_section = (ctypes.c_ulong * 3)(*config['text_config']['rope_scaling']['mrope_section']), + rope_theta = config['text_config']['rope_theta'], + vocab_size = config['text_config']['vocab_size'], + ), + vis_meta = VisMetaCStruct( + depth = config['vision_config']['depth'], + deepstack_visual_indexes = (ctypes.c_ulong * 3)(*config['vision_config']['deepstack_visual_indexes']), + hidden_size = config['vision_config']['hidden_size'], + in_channels = config['vision_config']['in_channels'], + initializer_range = config['vision_config']['initializer_range'], + intermediate_size = config['vision_config']['intermediate_size'], + num_heads = config['vision_config']['num_heads'], + num_position_embeddings = config['vision_config']['num_position_embeddings'], + out_hidden_size = config['vision_config']['out_hidden_size'], + patch_size = config['vision_config']['patch_size'], + spatial_merge_size = config['vision_config']['spatial_merge_size'], + temporal_patch_size = config['vision_config']['temporal_patch_size'] + ) + ) + +def load_specific_tensor(model_dir, tensor_name): + """ + Load a specific tensor from a safetensors model. + Supports both sharded models (with index.json) and single file models. 
+ """ + + # Try to load from individual .safetensors files + safetensors_files = [f for f in os.listdir(model_dir) if f.endswith(".safetensors")] + if not safetensors_files: + raise FileNotFoundError(f"No .safetensors files found in {model_dir}") + + # Try to find the tensor in each file + for filename in safetensors_files: + tensor_file = os.path.join(model_dir, filename) + try: + with safetensors.safe_open(tensor_file, framework="pt", device="cpu") as f: + if tensor_name in f.keys(): + tensor = f.get_tensor(tensor_name) + return tensor + except Exception: + continue + + # If we reach here, tensor was not found in any file + raise KeyError(f"{tensor_name} not found in any .safetensors files") + +def load_Qwen3vl_weights( + meta: Qwen3vlMeta, + weights, + model_path: str, + ndev: int, +): + # torch load weights, and reshape for qkv_proj / mlp_gate_up stack, attn / mlp parallel + # weight loader function load from specific offset according to idev, and transpose + model_instance = Qwen3vlModel() + weight_loader = model_instance.create_weight_loader() + vis_names = Qwen3vlVisWeightsNaming() + lang_names = Qwen3vlLangWeightsNaming() + + nkvh = meta.text_meta.num_key_value_heads + nh = meta.text_meta.num_attention_heads + dh = meta.text_meta.head_dim + d = meta.text_meta.hidden_size + di = meta.text_meta.intermediate_size + + assert nh % nkvh == 0 + assert nh % ndev == 0 + assert nkvh % ndev == 0 + assert di % ndev == 0 + + # ------------------------------- + # Language_model weights + # ------------------------------- + input_embd = load_specific_tensor(model_path, lang_names.input_embd()).to(meta.torch_dtype) + weight_loader.contents.lang_loader.load_input_embd(weights, input_embd.data_ptr()) + del input_embd + + output_norm = load_specific_tensor(model_path, lang_names.output_norm()).to(meta.torch_dtype) + weight_loader.contents.lang_loader.load_output_norm(weights, output_norm.data_ptr()) + del output_norm + + output_embd = load_specific_tensor(model_path, lang_names.output_embd()).to(meta.torch_dtype) + weight_loader.contents.lang_loader.load_output_embd(weights, output_embd.data_ptr()) + del output_embd + + for i in range(meta.text_meta.num_hidden_layers): + attn_norm = load_specific_tensor(model_path, lang_names.attn_norm(i)).to(meta.torch_dtype) + weight_loader.contents.lang_loader.load_attn_norm(weights, attn_norm.data_ptr(), i) + del attn_norm + + attn_q_proj = load_specific_tensor(model_path, lang_names.attn_q_proj(i)) + attn_k_proj = load_specific_tensor(model_path, lang_names.attn_k_proj(i)) + attn_v_proj = load_specific_tensor(model_path, lang_names.attn_v_proj(i)) + + _Q = attn_q_proj.reshape(nh,dh,d) + _K = attn_k_proj.reshape(nkvh,dh,d) + _V = attn_v_proj.reshape(nkvh,dh,d) + + qkv_proj = [] + _nh = nh // ndev + _nkvh = nkvh // ndev + for _idev in range(ndev): + qkv_proj.append(_Q[_idev * _nh : (_idev + 1) * _nh, :, :]) + qkv_proj.append(_K[_idev * _nkvh : (_idev + 1) * _nkvh, :, :]) + qkv_proj.append(_V[_idev * _nkvh : (_idev + 1) * _nkvh, :, :]) + attn_qkv_proj = torch.cat(qkv_proj, dim=0).to(meta.torch_dtype).contiguous() + + weight_loader.contents.lang_loader.load_attn_qkv_proj(weights, attn_qkv_proj.data_ptr(), i) + del attn_qkv_proj + + attn_q_norm = load_specific_tensor(model_path, lang_names.attn_q_norm(i)).to(meta.torch_dtype) + weight_loader.contents.lang_loader.load_attn_q_norm(weights, attn_q_norm.data_ptr(), i) + del attn_q_norm + + attn_k_norm = load_specific_tensor(model_path, lang_names.attn_k_norm(i)).to(meta.torch_dtype) + 
weight_loader.contents.lang_loader.load_attn_k_norm(weights, attn_k_norm.data_ptr(), i) + del attn_k_norm + + attn_o_proj = load_specific_tensor(model_path, lang_names.attn_o_proj(i)) + attn_o_proj = attn_o_proj.to(meta.torch_dtype).reshape([d, ndev, nh // ndev * dh]).transpose(0, 1).contiguous() + weight_loader.contents.lang_loader.load_attn_o_proj(weights, attn_o_proj.data_ptr(), i) + del attn_o_proj + + mlp_norm = load_specific_tensor(model_path, lang_names.mlp_norm(i)).to(meta.torch_dtype) + weight_loader.contents.lang_loader.load_mlp_norm(weights, mlp_norm.data_ptr(), i) + del mlp_norm + + mlp_gate = load_specific_tensor(model_path, lang_names.mlp_gate(i)) + mlp_up = load_specific_tensor(model_path, lang_names.mlp_up(i)) + + gate_up = [] + _di = di // ndev + for _idev in range(ndev): + _start = _idev * _di + _end = (_idev + 1) * _di + gate_up.append(mlp_gate[_start:_end, :]) + gate_up.append(mlp_up[_start:_end, :]) + mlp_gate_up = torch.cat(gate_up, dim=0).to(meta.torch_dtype).contiguous() + + weight_loader.contents.lang_loader.load_mlp_gate_up(weights, mlp_gate_up.data_ptr(), i) + del mlp_gate_up + + mlp_down = load_specific_tensor(model_path, lang_names.mlp_down(i)) + mlp_down = mlp_down.to(meta.torch_dtype).reshape([d, ndev, di // ndev]).transpose(0, 1).contiguous() + weight_loader.contents.lang_loader.load_mlp_down(weights, mlp_down.data_ptr(), i) + del mlp_down + + # ------------------------------- + # Vision head weights + # ------------------------------- + patch_embed_weight = load_specific_tensor(model_path, vis_names.patch_embed_weight()).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_patch_embed_weight(weights, patch_embed_weight.data_ptr()) + del patch_embed_weight + + patch_embed_bias = load_specific_tensor(model_path, vis_names.patch_embed_bias()).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_patch_embed_bias(weights, patch_embed_bias.data_ptr()) + del patch_embed_bias + + pos_embed_weight = load_specific_tensor(model_path, vis_names.pos_embed_weight()).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_pos_embed_weight(weights, pos_embed_weight.data_ptr()) + del pos_embed_weight + + for i in range(meta.vis_meta.depth): + attn_proj_weight = load_specific_tensor(model_path, vis_names.attn_proj_weight(i)).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_attn_proj_weight(weights, attn_proj_weight.data_ptr(), i) + del attn_proj_weight + + attn_proj_bias = load_specific_tensor(model_path, vis_names.attn_proj_bias(i)).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_attn_proj_bias(weights, attn_proj_bias.data_ptr(), i) + del attn_proj_bias + + attn_qkv_weight = load_specific_tensor(model_path, vis_names.attn_qkv_weight(i)).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_attn_qkv_weight(weights, attn_qkv_weight.data_ptr(), i) + del attn_qkv_weight + + attn_qkv_bias = load_specific_tensor(model_path, vis_names.attn_qkv_bias(i)).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_attn_qkv_bias(weights, attn_qkv_bias.data_ptr(), i) + del attn_qkv_bias + + mlp_linear_fc1_weight = load_specific_tensor(model_path, vis_names.mlp_linear_fc1_weight(i)).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_mlp_linear_fc1_weight(weights, mlp_linear_fc1_weight.data_ptr(), i) + del mlp_linear_fc1_weight + + mlp_linear_fc1_bias = load_specific_tensor(model_path, vis_names.mlp_linear_fc1_bias(i)).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_mlp_linear_fc1_bias(weights, 
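To make the interleaved QKV packing above concrete, a tiny worked example with nh=4, nkvh=2, ndev=2 and toy dimensions: after the concat, each device's contiguous slab holds its own Q heads, then its K heads, then its V heads.

```python
import torch

nh, nkvh, dh, d, ndev = 4, 2, 2, 8, 2
_nh, _nkvh = nh // ndev, nkvh // ndev
Q = torch.arange(nh * dh * d, dtype=torch.float32).reshape(nh, dh, d)
K = torch.zeros(nkvh, dh, d)
V = torch.ones(nkvh, dh, d)

packed = []
for idev in range(ndev):
    packed.append(Q[idev * _nh:(idev + 1) * _nh])        # this device's Q heads
    packed.append(K[idev * _nkvh:(idev + 1) * _nkvh])    # ... its K heads
    packed.append(V[idev * _nkvh:(idev + 1) * _nkvh])    # ... its V heads
qkv = torch.cat(packed, dim=0)
# Row blocks: [Q0 Q1 | K0 | V0 | Q2 Q3 | K1 | V1], so an even split across
# ndev devices hands each one a self-contained (Q, K, V) slab.
assert qkv.shape == (nh + 2 * nkvh, dh, d)
```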
mlp_linear_fc1_bias.data_ptr(), i) + del mlp_linear_fc1_bias + + mlp_linear_fc2_weight = load_specific_tensor(model_path, vis_names.mlp_linear_fc2_weight(i)).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_mlp_linear_fc2_weight(weights, mlp_linear_fc2_weight.data_ptr(), i) + del mlp_linear_fc2_weight + + mlp_linear_fc2_bias = load_specific_tensor(model_path, vis_names.mlp_linear_fc2_bias(i)).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_mlp_linear_fc2_bias(weights, mlp_linear_fc2_bias.data_ptr(), i) + del mlp_linear_fc2_bias + + norm1_weight = load_specific_tensor(model_path, vis_names.norm1_weight(i)).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_norm1_weight(weights, norm1_weight.data_ptr(), i) + del norm1_weight + + norm1_bias = load_specific_tensor(model_path, vis_names.norm1_bias(i)).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_norm1_bias(weights, norm1_bias.data_ptr(), i) + del norm1_bias + + norm2_weight = load_specific_tensor(model_path, vis_names.norm2_weight(i)).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_norm2_weight(weights, norm2_weight.data_ptr(), i) + del norm2_weight + + norm2_bias = load_specific_tensor(model_path, vis_names.norm2_bias(i)).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_norm2_bias(weights, norm2_bias.data_ptr(), i) + del norm2_bias + + for i in range(len(meta.vis_meta.deepstack_visual_indexes)): + deepstack_merger_linear_fc1_weight = load_specific_tensor(model_path, vis_names.deepstack_merger_linear_fc1_weight(i)).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_deepstack_merger_linear_fc1_weight(weights, deepstack_merger_linear_fc1_weight.data_ptr(), i) + del deepstack_merger_linear_fc1_weight + + deepstack_merger_linear_fc1_bias = load_specific_tensor(model_path, vis_names.deepstack_merger_linear_fc1_bias(i)).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_deepstack_merger_linear_fc1_bias(weights, deepstack_merger_linear_fc1_bias.data_ptr(), i) + del deepstack_merger_linear_fc1_bias + + deepstack_merger_linear_fc2_weight = load_specific_tensor(model_path, vis_names.deepstack_merger_linear_fc2_weight(i)).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_deepstack_merger_linear_fc2_weight(weights, deepstack_merger_linear_fc2_weight.data_ptr(), i) + del deepstack_merger_linear_fc2_weight + + deepstack_merger_linear_fc2_bias = load_specific_tensor(model_path, vis_names.deepstack_merger_linear_fc2_bias(i)).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_deepstack_merger_linear_fc2_bias(weights, deepstack_merger_linear_fc2_bias.data_ptr(), i) + del deepstack_merger_linear_fc2_bias + + deepstack_merger_norm_weight = load_specific_tensor(model_path, vis_names.deepstack_merger_norm_weight(i)).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_deepstack_merger_norm_weight(weights, deepstack_merger_norm_weight.data_ptr(), i) + del deepstack_merger_norm_weight + + deepstack_merger_norm_bias = load_specific_tensor(model_path, vis_names.deepstack_merger_norm_bias(i)).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_deepstack_merger_norm_bias(weights, deepstack_merger_norm_bias.data_ptr(), i) + del deepstack_merger_norm_bias + + merger_linear_fc1_weight = load_specific_tensor(model_path, vis_names.merger_linear_fc1_weight()).to(meta.torch_dtype) + weight_loader.contents.vis_loader.load_merger_linear_fc1_weight(weights, merger_linear_fc1_weight.data_ptr()) + del merger_linear_fc1_weight + + 
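Background for the pixel buffers prepared in the batching code below: the HF processor yields torch tensors, whose raw address comes from `data_ptr()` (numpy arrays use `.ctypes` instead; torch tensors do not have that accessor). Whatever owns the storage must stay referenced while C reads it. A minimal sketch:

```python
import ctypes

import torch

t = torch.arange(6, dtype=torch.bfloat16).contiguous()
ptr = ctypes.c_void_p(t.data_ptr())  # raw address of the tensor's storage
# The pointer is valid only while `t` stays referenced, which is why the
# batch object keeps the flattened tensor as an attribute for the call.
```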
merger_linear_fc1_bias = load_specific_tensor(model_path, vis_names.merger_linear_fc1_bias()).to(meta.torch_dtype)
+    weight_loader.contents.vis_loader.load_merger_linear_fc1_bias(weights, merger_linear_fc1_bias.data_ptr())
+    del merger_linear_fc1_bias
+
+    merger_linear_fc2_weight = load_specific_tensor(model_path, vis_names.merger_linear_fc2_weight()).to(meta.torch_dtype)
+    weight_loader.contents.vis_loader.load_merger_linear_fc2_weight(weights, merger_linear_fc2_weight.data_ptr())
+    del merger_linear_fc2_weight
+
+    merger_linear_fc2_bias = load_specific_tensor(model_path, vis_names.merger_linear_fc2_bias()).to(meta.torch_dtype)
+    weight_loader.contents.vis_loader.load_merger_linear_fc2_bias(weights, merger_linear_fc2_bias.data_ptr())
+    del merger_linear_fc2_bias
+
+    merger_norm_weight = load_specific_tensor(model_path, vis_names.merger_norm_weight()).to(meta.torch_dtype)
+    weight_loader.contents.vis_loader.load_merger_norm_weight(weights, merger_norm_weight.data_ptr())
+    del merger_norm_weight
+
+    merger_norm_bias = load_specific_tensor(model_path, vis_names.merger_norm_bias()).to(meta.torch_dtype)
+    weight_loader.contents.vis_loader.load_merger_norm_bias(weights, merger_norm_bias.data_ptr())
+    del merger_norm_bias
+
+
+class Qwen3vlBatchedTask:
+    def __init__(self, tasks: List[InferTask]):
+        self.tasks = tasks
+        self.nreq = len(tasks)
+
+        # Precompute fields
+        token_lists = [t.tokens for t in tasks]
+        self.req_lens_list = [len(toks) for toks in token_lists]
+        self.req_pos_list = [t.pos for t in tasks]
+        self.kv_cache_ptrs = [t.kvcache().data() for t in tasks]
+        self.temperatures_list = [t.temperature for t in tasks]
+        self.topks_list = [t.topk for t in tasks]
+        self.topps_list = [t.topp for t in tasks]
+
+        # Flatten token lists
+        flat_tokens = [tok for toks in token_lists for tok in toks]
+        self.ntok = len(flat_tokens)
+
+        # Convert to ctypes arrays in one pass
+        self.tokens = (c_uint * self.ntok)(*flat_tokens)
+        self.req_lens = (c_uint * self.nreq)(*self.req_lens_list)
+        self.req_pos = (c_uint * self.nreq)(*self.req_pos_list)
+        self.kv_caches = (POINTER(Qwen3vlCacheCStruct) * self.nreq)(
+            *self.kv_cache_ptrs
+        )
+        self.temperatures = (c_float * self.nreq)(*self.temperatures_list)
+        self.topks = (c_uint * self.nreq)(*self.topks_list)
+        self.topps = (c_float * self.nreq)(*self.topps_list)
+
+        # Initialize visual encoder inputs
+        self.pixel_values = None
+        self.total_patches = 0
+        self.image_grid_thw = None
+        self.num_images = 0
+        self.pixel_values_videos = None
+        self.total_patches_videos = 0
+        self.video_grid_thw = None
+        self.num_videos = 0
+        self.patch_features = 0
+
+        # Prepare visual encoder inputs
+        all_pixel_values = [t.inputs['pixel_values'] for t in tasks if 'pixel_values' in t.inputs]
+        all_image_grid_thw = [t.inputs['image_grid_thw'] for t in tasks if 'image_grid_thw' in t.inputs]
+        all_pixel_values_videos = [t.inputs['pixel_values_videos'] for t in tasks if 'pixel_values_videos' in t.inputs]
+        all_video_grid_thw = [t.inputs['video_grid_thw'] for t in tasks if 'video_grid_thw' in t.inputs]
+
+        if all_pixel_values:
+            concat_pixel_values = torch.cat(all_pixel_values, dim=0)  # (total_patches, features)
+            self.total_patches = concat_pixel_values.shape[0]
+            self.patch_features = concat_pixel_values.shape[1]
+            self.flat_pixels = concat_pixel_values.flatten().to(torch.bfloat16).contiguous()
+            # torch tensors expose their buffer via data_ptr(); keeping
+            # self.flat_pixels referenced keeps the pointer valid for the call
+            self.pixel_values = c_void_p(self.flat_pixels.data_ptr())
+
+        if all_image_grid_thw:
+            concat_grid_thw = torch.cat(all_image_grid_thw, dim=0)  # (total_images, 3)
+            self.num_images = concat_grid_thw.shape[0]
+            flat_grid = concat_grid_thw.flatten().to(torch.int32).contiguous()
+            self.image_grid_thw = (c_uint * len(flat_grid))(*flat_grid.tolist())
+
+        if all_pixel_values_videos:
+            concat_pixel_values_videos = torch.cat(all_pixel_values_videos, dim=0)  # (total_patches_videos, features)
+            self.total_patches_videos = concat_pixel_values_videos.shape[0]
+            self.patch_features_videos = concat_pixel_values_videos.shape[1]
+            print(self.patch_features_videos, flush=True)
+            self.flat_pixels_videos = concat_pixel_values_videos.flatten().to(torch.bfloat16).contiguous()
+            self.pixel_values_videos = c_void_p(self.flat_pixels_videos.data_ptr())
+
+        if all_video_grid_thw:
+            concat_grid_thw_videos = torch.cat(all_video_grid_thw, dim=0)  # (total_videos, 3)
+            self.num_videos = concat_grid_thw_videos.shape[0]
+            flat_grid_videos = concat_grid_thw_videos.flatten().to(torch.int32).contiguous()
+            self.video_grid_thw = (c_uint * len(flat_grid_videos))(*flat_grid_videos.tolist())
+
+    def input_args(self):
+        return (
+            self.tokens,
+            self.ntok,
+            self.pixel_values,
+            self.total_patches,
+            self.image_grid_thw,
+            self.num_images,
+            self.pixel_values_videos,
+            self.total_patches_videos,
+            self.video_grid_thw,
+            self.num_videos,
+            self.patch_features,
+            self.req_lens,
+            self.nreq,
+            self.req_pos,
+            self.kv_caches,
+            self.temperatures,
+            self.topks,
+            self.topps,
+        )
+
+
+# Need to handle the visual encoder cache and image/video inputs
+class Qwen3vlForCauslLM:
+    def __init__(
+        self, model_dir_path, device=DeviceType.DEVICE_TYPE_CPU, ndev=1, max_tokens=None
+    ):
+        with open(os.path.join(model_dir_path, "config.json"), "r") as f:
+            config = json.load(f)
+        self.config = config
+        eos_token_id = self.config["text_config"]["eos_token_id"]
+        self.eos_token_id = (
+            [eos_token_id] if type(eos_token_id) == int else eos_token_id
+        )
+
+        print(model_dir_path)
+
+        if "qwen3_vl" == config["model_type"]:
+            self.meta = Qwen3vlMeta(config, max_tokens=max_tokens)
+            self.processor = transformers.AutoProcessor.from_pretrained(model_dir_path)
+            self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir_path)
+        else:
+            raise ValueError("Unsupported model architecture")
+
+        print(f"Creating model on {ndev} devices...")
+        load_start_time = time.time()
+        dev_ids = (c_int * ndev)(*[i for i in range(ndev)])
+
+        self.model_instance = Qwen3vlModel()
+        weights = self.model_instance.create_weights(
+            byref(self.meta),
+            device,
+            ndev,
+            dev_ids,
+            c_bool(True)
+        )
+        print("Loading weights...")
+        # Load weights from host
+        load_Qwen3vl_weights(self.meta, weights, model_dir_path, ndev)
+        # Create model instance
+        self.model_ptr = self.model_instance.create_model(
+            byref(self.meta),
+            weights,
+        )
+        load_end_time = time.time()
+        print(f"Time used: {load_end_time - load_start_time:.3f}s")
+
+    def max_context_len(self):
+        return self.meta.text_meta.max_tokens
+
+    def create_kv_cache(self):
+        return self.model_instance.create_cache(self.model_ptr)
+
+    def drop_kv_cache(self, kv_cache):
+        self.model_instance.drop_cache(self.model_ptr, kv_cache)
+
+    def batch_infer_one_round(self, tasks: List[InferTask]):
+        output = (c_uint * len(tasks))()
+        batch_inputs = Qwen3vlBatchedTask(tasks)
+        self.model_instance.infer_batch(
+            self.model_ptr,
+            *(batch_inputs.input_args()),
+            output,
+        )
+        return list(output)
+
+    def generate(self, input_content, max_steps, topp_=1.0, topk_=1, temperature_=1.0):
+        inputs = self.processor.apply_chat_template(
+            conversation=[{"role": "user", "content": [{"type": "text", "text": input_content}]}],
tokenize=True, + add_generation_prompt=True, + return_dict=True, + return_tensors="pt", + ) + + infer_task = InferTask( + 0, + inputs, + self.max_context_len(), + temperature_, + topk_, + topp_, + self.eos_token_id, + ) + infer_task.bind_kvcache(KVCache(self)) + print(input_content, end="", flush=True) + steps = 0 + total_time = 0 + output_content = "" + + print(inputs['input_ids'][0].tolist(), flush=True) + + for step_i in range(max_steps): + start_time = time.time() + output_tokens = self.batch_infer_one_round([infer_task]) + print(output_tokens) + end_time = time.time() + steps += 1 + output_str = self.tokenizer.decode(output_tokens[0]) + output_content += output_str + print(output_str, end="", flush=True) + if output_tokens[0] in self.eos_token_id: + break + infer_task.next(output_tokens[0]) + + if step_i > 0: + total_time += end_time - start_time + + print("\n") + avg_time = total_time * 1000 / steps if steps > 0 else -1 + print(output_content, flush=True) + print(f"Time per step: {avg_time:.3f}ms") + + infer_task._kv_cache.drop(self) + return output_content, avg_time + + def destroy_model_instance(self): + self.model_instance.destroy_model(self.model_ptr) + print("Model destroyed") + + +def test(): + if len(sys.argv) < 3: + print( + "Usage: python qwen3vl.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore] [n_device]" + ) + sys.exit(1) + model_path = sys.argv[2] + device_type = DeviceType.DEVICE_TYPE_CPU + if sys.argv[1] == "--cpu": + device_type = DeviceType.DEVICE_TYPE_CPU + elif sys.argv[1] == "--nvidia": + device_type = DeviceType.DEVICE_TYPE_NVIDIA + elif sys.argv[1] == "--cambricon": + device_type = DeviceType.DEVICE_TYPE_CAMBRICON + elif sys.argv[1] == "--ascend": + device_type = DeviceType.DEVICE_TYPE_ASCEND + elif sys.argv[1] == "--metax": + device_type = DeviceType.DEVICE_TYPE_METAX + elif sys.argv[1] == "--moore": + device_type = DeviceType.DEVICE_TYPE_MOORE + elif sys.argv[1] == "--iluvatar": + device_type = DeviceType.DEVICE_TYPE_ILUVATAR + else: + print( + "Usage: python qwen3vl.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore] [n_device]" + ) + sys.exit(1) + + ndev = int(sys.argv[3]) if len(sys.argv) > 3 else 1 + model = Qwen3vlForCauslLM(model_path, device_type, ndev, max_tokens=1024) + model.generate("山东最高的山是?", 200) + model.destroy_model_instance() + + +if __name__ == "__main__": + test() \ No newline at end of file diff --git a/scripts/qwen3vl_test.py b/scripts/qwen3vl_test.py new file mode 100644 index 00000000..354008c1 --- /dev/null +++ b/scripts/qwen3vl_test.py @@ -0,0 +1,83 @@ +import torch +from transformers import Qwen3VLForConditionalGeneration, AutoProcessor, GenerationConfig +import os +import time + +# 加载模型和processor +# 修改为使用Qwen3VLForConditionalGeneration和AutoProcessor +model = Qwen3VLForConditionalGeneration.from_pretrained( + "/data/shared/models/Qwen3-VL-2B-Instruct/", + torch_dtype=torch.float16, + device_map="auto", + attn_implementation="sdpa", + trust_remote_code=True +) +processor = AutoProcessor.from_pretrained("/data/shared/models/Qwen3-VL-2B-Instruct/", trust_remote_code=True) + +# 设置生成配置以确保确定性生成 +model.generation_config = GenerationConfig.from_pretrained("/data/shared/models/Qwen3-VL-2B-Instruct/", trust_remote_code=True) +model.generation_config.do_sample = False # 关闭采样以确保确定性 +model.generation_config.max_new_tokens = 200 + +messages = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "山东最高的山是?" 
+ } + ] + } +] +# messages = [ +# { +# "role":"user", +# "content":[ +# { +# "type":"image", +# "url": "/data/users/monitor1379/InfiniLM/010P00002405F02D94-1.jpg" +# }, +# { +# "type":"text", +# "text":"Describe this image." +# } +# ] +# } +# ] + +# 处理输入 +inputs = processor.apply_chat_template( + messages, + tokenize=True, + add_generation_prompt=True, + return_dict=True, + return_tensors="pt", +) + +inputs = {k: v.to(model.device) for k, v in inputs.items()} +inputs.pop("token_type_ids", None) + +# for k,v in inputs.items(): +# print(k) +# print(v.shape) +# print(v.dtype) +# print(v) + +# 添加时间统计逻辑 +start_time = time.time() +generated_ids = model.generate(**inputs, max_new_tokens=200, output_attentions=False, return_dict_in_generate=True) +end_time = time.time() + +total_time = end_time - start_time +num_steps = len(generated_ids.sequences[0]) - len(inputs['input_ids'][0]) # 减去输入长度得到生成步骤数 +avg_time = (total_time / num_steps) * 1000 # 转换为毫秒 + +generated_ids_trimmed = [ + out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs['input_ids'], generated_ids.sequences) +] +output_text = processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False +) +print(output_text[0]) +print(f"Time per step: {avg_time:.3f}ms") \ No newline at end of file diff --git a/scripts/test.py b/scripts/test.py new file mode 100644 index 00000000..96385c38 --- /dev/null +++ b/scripts/test.py @@ -0,0 +1,2 @@ +if __name__ == "__main__": + print("testing") \ No newline at end of file diff --git a/scripts/test_perf.py b/scripts/test_perf.py index a6b26f3b..474d7736 100644 --- a/scripts/test_perf.py +++ b/scripts/test_perf.py @@ -30,8 +30,8 @@ NUM_REQUESTS = 10 CONCURRENCY = 5 -API_URL = "http://127.0.0.1:8000" -MODEL = "FM9G-7B" +API_URL = "http://127.0.0.1:8008" +MODEL = "qwen3vl" async def benchmark_user(client, semaphore, queue, results, user_id, verbose): @@ -49,7 +49,7 @@ async def benchmark_user(client, semaphore, queue, results, user_id, verbose): start_time = time.time() stream = await client.chat.completions.create( model=MODEL, - messages=[{"role": "user", "content": question}], + messages=[{"role": "user","content": [{"type": "text", "text": question}]}], stream=True ) diff --git a/src/cache_manager/opcache_manager.hpp b/src/cache_manager/opcache_manager.hpp index 4c49e961..dab9f68e 100644 --- a/src/cache_manager/opcache_manager.hpp +++ b/src/cache_manager/opcache_manager.hpp @@ -153,6 +153,8 @@ class LRUDescriptorCache { class CacheManager { public: DECLARE_OP_CACHE(Add) + DECLARE_OP_CACHE(Conv) + DECLARE_OP_CACHE(Mul) DECLARE_OP_CACHE(RMSNorm) DECLARE_OP_CACHE(Gemm) DECLARE_OP_CACHE(RoPE) @@ -160,11 +162,14 @@ class CacheManager { DECLARE_OP_CACHE(CausalSoftmax) DECLARE_OP_CACHE(Topkrouter) DECLARE_OP_CACHE(SwiGLU) + DECLARE_OP_CACHE(Silu) DECLARE_OP_CACHE(RandomSample) DECLARE_OP_CACHE(DequantizeAWQ) CacheManager(size_t capacity = 100) : Add_cache(capacity, DESTROY_FUNC(Add)), + Conv_cache(capacity, DESTROY_FUNC(Conv)), + Mul_cache(capacity, DESTROY_FUNC(Mul)), RMSNorm_cache(capacity, DESTROY_FUNC(RMSNorm)), Gemm_cache(capacity, DESTROY_FUNC(Gemm)), RoPE_cache(capacity, DESTROY_FUNC(RoPE)), @@ -172,6 +177,7 @@ class CacheManager { CausalSoftmax_cache(capacity, DESTROY_FUNC(CausalSoftmax)), Topkrouter_cache(capacity, DESTROY_FUNC(Topkrouter)), SwiGLU_cache(capacity, DESTROY_FUNC(SwiGLU)), + Silu_cache(capacity, DESTROY_FUNC(Silu)), RandomSample_cache(capacity, DESTROY_FUNC(RandomSample)), DequantizeAWQ_cache(capacity, DESTROY_FUNC(DequantizeAWQ)) {} diff --git 
a/src/models/deepseek_v3/deepseek_v3.cpp b/src/models/deepseek_v3/deepseek_v3.cpp index 2c463035..c60ef9d7 100644 --- a/src/models/deepseek_v3/deepseek_v3.cpp +++ b/src/models/deepseek_v3/deepseek_v3.cpp @@ -103,8 +103,8 @@ void inferDeviceBatch(const DeepSeekV3Meta &meta, DeepSeekV3DeviceResource &rsrc auto batch_pos_ids = std::vector(ntok); size_t req_start = 0; for (uint32_t req = 0; req < nreq; req++) { - for (uint32_t i = 0; i < req_lens[req]; i++) { - batch_pos_ids[req_start + i] = req_pos[req] + i; + for (uint32_t i = 0; i < req_lens[req]; i++) { // req_len 本次query长度,req_pos 历史长度 + batch_pos_ids[req_start + i] = req_pos[req] + i; //batch_pos_ids 展平后每个token的pos } req_start += req_lens[req]; } diff --git a/src/models/inference_context.cpp b/src/models/inference_context.cpp index db5fda11..5ac5ee1f 100644 --- a/src/models/inference_context.cpp +++ b/src/models/inference_context.cpp @@ -33,6 +33,61 @@ void InferenceContext::add(std::shared_ptr c, c->data(), a->data(), b->data(), stream)); } +void InferenceContext::conv(std::shared_ptr y, + std::shared_ptr x, + std::shared_ptr w, + std::shared_ptr bias, + void *pads, + void *strides, + void *dilations, + size_t n) { + size_t key = CacheManager::createDescriptorKey(y, x, w, bias); + // Combine additional parameters into the key for unique identification + hash_combine(key, std::hash()(pads)); + hash_combine(key, std::hash()(strides)); + hash_combine(key, std::hash()(dilations)); + hash_combine(key, std::hash()(n)); + + infiniopConvDescriptor_t desc; + if (!cache_manager->getConvDescriptor(key, desc)) { + RUN_INFINI(infiniopCreateConvDescriptor( + op_handle, &desc, y->desc(), x->desc(), w->desc(), + bias ? bias->desc() : nullptr, pads, strides, dilations, n)); + cache_manager->putConvDescriptor(key, desc); + } + + size_t workspace_size = 0; + RUN_INFINI(infiniopGetConvWorkspaceSize(desc, &workspace_size)); + ensure_workspace(workspace_size); + void *workspace = workspace_storage->memory(); + + RUN_INFINI(infiniopConv( + desc, workspace, workspace_size, + y->data(), x->data(), w->data(), + bias ? 
bias->data() : nullptr, stream)); +} + +void InferenceContext::mul(std::shared_ptr c, + std::shared_ptr a, + std::shared_ptr b) { + size_t key = CacheManager::createDescriptorKey(c, a, b); + + infiniopMulDescriptor_t desc; + if (!cache_manager->getMulDescriptor(key, desc)) { + RUN_INFINI(infiniopCreateMulDescriptor(op_handle, &desc, c->desc(), a->desc(), b->desc())); + cache_manager->putMulDescriptor(key, desc); + } + + size_t workspace_size = 0; + RUN_INFINI(infiniopGetMulWorkspaceSize(desc, &workspace_size)); + ensure_workspace(workspace_size); + void *workspace = workspace_storage->memory(); + + RUN_INFINI(infiniopMul( + desc, workspace, workspace_size, + c->data(), a->data(), b->data(), stream)); +} + void InferenceContext::rmsnorm(std::shared_ptr y, std::shared_ptr x, std::shared_ptr w, @@ -189,6 +244,26 @@ void InferenceContext::swiglu(std::shared_ptr out, out->data(), up->data(), gate->data(), stream)); } +void InferenceContext::silu(std::shared_ptr out, + std::shared_ptr input) { + size_t key = CacheManager::createDescriptorKey(out, input); + + infiniopSiluDescriptor_t desc; + if (!cache_manager->getSiluDescriptor(key, desc)) { + RUN_INFINI(infiniopCreateSiluDescriptor( + op_handle, &desc, out->desc(), input->desc())); + cache_manager->putSiluDescriptor(key, desc); + } + + size_t workspace_size = 0; + RUN_INFINI(infiniopGetSiluWorkspaceSize(desc, &workspace_size)); + ensure_workspace(workspace_size); + void *workspace = workspace_storage->memory(); + + RUN_INFINI(infiniopSilu(desc, workspace, workspace_size, + out->data(), input->data(), stream)); +} + void InferenceContext::randomSample(std::shared_ptr out, std::shared_ptr prob, float random_val, float top_p, uint32_t top_k, float temperature) { diff --git a/src/models/inference_context.hpp b/src/models/inference_context.hpp index 0cf93f6f..c19f40a3 100644 --- a/src/models/inference_context.hpp +++ b/src/models/inference_context.hpp @@ -19,6 +19,14 @@ struct InferenceContext { void add(std::shared_ptr c, std::shared_ptr a, std::shared_ptr b); + void conv(std::shared_ptr y, + std::shared_ptr x, + std::shared_ptr w, + std::shared_ptr bias, + void *pads, void *strides, void *dilations, size_t n); + void mul(std::shared_ptr c, + std::shared_ptr a, + std::shared_ptr b); void rmsnorm(std::shared_ptr y, std::shared_ptr x, std::shared_ptr w, @@ -48,6 +56,8 @@ struct InferenceContext { void swiglu(std::shared_ptr out, std::shared_ptr up, std::shared_ptr gate); + void silu(std::shared_ptr out, + std::shared_ptr input); void randomSample(std::shared_ptr out, std::shared_ptr prob, float random_val, float top_p, uint32_t top_k, float temperature); @@ -81,6 +91,15 @@ inline void add(std::shared_ptr c, std::shared_ptr a, std::share getInferenceContext().add(c, a, b); } +inline void conv(std::shared_ptr y, std::shared_ptr x, std::shared_ptr w, std::shared_ptr bias, + void *pads, void *strides, void *dilations, size_t n) { + getInferenceContext().conv(y, x, w, bias, pads, strides, dilations, n); +} + +inline void mul(std::shared_ptr c, std::shared_ptr a, std::shared_ptr b) { + getInferenceContext().mul(c, a, b); +} + inline void rmsnorm(std::shared_ptr y, std::shared_ptr x, std::shared_ptr w, float epsilon) { getInferenceContext().rmsnorm(y, x, w, epsilon); @@ -131,6 +150,10 @@ inline void swiglu(std::shared_ptr out, std::shared_ptr up, getInferenceContext().swiglu(out, up, gate); } +inline void silu(std::shared_ptr out, std::shared_ptr input) { + getInferenceContext().silu(out, input); +} + inline void randomSample(std::shared_ptr out, 
std::shared_ptr prob, float random_val, float top_p, uint32_t top_k, float temperature) {
    getInferenceContext().randomSample(out, prob, random_val, top_p, top_k, temperature);
}
diff --git a/src/models/jiuge/jiuge.cpp b/src/models/jiuge/jiuge.cpp
index 41f8e5ea..b599d0d8 100644
--- a/src/models/jiuge/jiuge.cpp
+++ b/src/models/jiuge/jiuge.cpp
@@ -238,11 +238,13 @@ void inferDeviceBatch(const JiugeMeta &meta, JiugeDeviceResource &rsrc,
     rearrange(q_rearrange->slice(2, 0, seq_len), q);
     auto qk_gemm = qk_buf->slice(0, 0, nh * seq_len * total_len)->view({nkvh, ngroup * seq_len, total_len});
     auto k_gemm = kv_caches[req]->k[idev][layer]->slice(0, 0, total_len)->permute({1, 2, 0});
+    // [nkvh, ngroup * seq_len, dh] @ [nkvh, dh, total_len] = [nkvh, ngroup * seq_len, total_len]
     linear(qk_gemm, rearrange_q_buf->slice(1, 0, ngroup * seq_len), k_gemm, 1.f / float(sqrt(dh)), 0.f, nullptr, nullptr);
     // softmax
     auto qk_softmax = qk_gemm->view({nh, seq_len, total_len});
     causalSoftmax(qk_softmax, qk_softmax);
     auto v_gemm = kv_caches[req]->v[idev][layer]->slice(0, 0, total_len)->permute({1, 0, 2});
+    // [nkvh, ngroup * seq_len, total_len] @ [nkvh, total_len, dh] = [nkvh, ngroup * seq_len, dh]
     linear(attn_val_buf->slice(1, 0, ngroup * seq_len), qk_gemm, v_gemm, 1.f, 0.f, nullptr, nullptr);
     // rearrange attn val
     rearrange(o, attn_val_gemm->slice(2, 0, seq_len));
diff --git a/src/models/qwen3vl/qwen3vl.cpp b/src/models/qwen3vl/qwen3vl.cpp
new file mode 100644
index 00000000..8a710108
--- /dev/null
+++ b/src/models/qwen3vl/qwen3vl.cpp
@@ -0,0 +1,720 @@
+#include "qwen3vl_impl.hpp"
+
+#include "../../tensor.hpp"
+#include "../../utils.hpp"
+#include "../inference_context.hpp"
+#include "infinicore_infer.h"
+
+#include <cassert>
+#include <cmath>
+#include <random>
+
+void createDeviceResource(Qwen3vlDeviceResource *rsrc, const Qwen3vlMeta *meta,
+                          std::shared_ptr<Qwen3vlDeviceWeights> weights,
+                          infiniDevice_t device, int idev,
+                          int ndev, int dev_id,
+                          infinicclComm_t comm) {
+    RUN_INFINI(infinirtSetDevice(device, dev_id));
+    RUN_INFINI(infinirtStreamSynchronize(weights->load_stream));
+    infiniopHandle_t handle;
+    infiniopCreateHandle(&handle);
+    infinirtStream_t stream;
+    infinirtStreamCreate(&stream);
+
+    auto memory_pool = std::make_shared<MemoryPool>();
+
+    *rsrc = Qwen3vlDeviceResource{
+        device,
+        dev_id,
+        handle,
+        weights,
+        stream,
+        comm,
+        memory_pool,
+    };
+    RUN_INFINI(infinirtDeviceSynchronize());
+}
+
+void releaseDeviceResource(Qwen3vlDeviceResource &res) {
+    infinirtDeviceSynchronize();
+
+    res.weights.reset();
+
+    infiniopDestroyHandle(res.handle);
+    res.handle = nullptr;
+    infinirtStreamDestroy(res.stream);
+    res.stream = nullptr;
+    infinicclCommDestroy(res.comm);
+    res.comm = nullptr;
+}
+
+inline std::shared_ptr<Tensor> get_custom_SinTable(const Qwen3vlMeta &meta, std::vector<std::vector<uint32_t>> &pos_ids, uint32_t dim, size_t theta) {
+    // pos_ids shape: [seq, dim/2], the position id acting on each dim
+    auto unit = dsize(meta.dtype);
+    auto half_dim = dim / 2;
+    size_t len = pos_ids.size();
+    void *table = std::malloc(len * half_dim * unit);
+
+    for (size_t i = 0; i < len; i++) {
+        for (size_t j = 0; j < half_dim; j++) {
+            float _sin = std::sin(static_cast<float>(pos_ids[i][j]) / std::pow(theta, static_cast<float>(j) / half_dim));
+            if (meta.dtype == INFINI_DTYPE_F16) {
+                ((uint16_t *)table)[i * half_dim + j] = f32_to_f16(_sin);
+            } else if (meta.dtype == INFINI_DTYPE_BF16) {
+                ((uint16_t *)table)[i * half_dim + j] = f32_to_bf16(_sin);
+            } else if (meta.dtype == INFINI_DTYPE_F32) {
+                ((float *)table)[i * half_dim + j] = _sin;
+            } else {
+                std::cout << "unsupported data type" << std::endl;
+                exit(1);
+            }
+        }
+    }
+    auto shape = std::vector<size_t>({len, half_dim});
+    auto tensor = Tensor::weight(table, meta.dtype, shape);
+    std::free(table);
+    return tensor;
+}
+
+inline std::shared_ptr<Tensor> get_custom_CosTable(const Qwen3vlMeta &meta, std::vector<std::vector<uint32_t>> &pos_ids, uint32_t dim, size_t theta) {
+    // pos_ids shape: [seq, dim/2], the position id acting on each dim
+    auto unit = dsize(meta.dtype);
+    auto half_dim = dim / 2;
+    size_t len = pos_ids.size();
+    void *table = std::malloc(len * half_dim * unit);
+
+    for (size_t i = 0; i < len; i++) {
+        for (size_t j = 0; j < half_dim; j++) {
+            float _cos = std::cos(static_cast<float>(pos_ids[i][j]) / std::pow(theta, static_cast<float>(j) / half_dim));
+            if (meta.dtype == INFINI_DTYPE_F16) {
+                ((uint16_t *)table)[i * half_dim + j] = f32_to_f16(_cos);
+            } else if (meta.dtype == INFINI_DTYPE_BF16) {
+                ((uint16_t *)table)[i * half_dim + j] = f32_to_bf16(_cos);
+            } else if (meta.dtype == INFINI_DTYPE_F32) {
+                ((float *)table)[i * half_dim + j] = _cos;
+            } else {
+                std::cout << "unsupported data type" << std::endl;
+                exit(1);
+            }
+        }
+    }
+    auto shape = std::vector<size_t>({len, half_dim});
+    auto tensor = Tensor::weight(table, meta.dtype, shape);
+    std::free(table);
+    return tensor;
+}
+
+inline std::shared_ptr<Tensor> fast_pos_embed_interpolate(const Qwen3vlMeta &meta, Qwen3vlDeviceResource &rsrc,
+                                                          uint32_t *grid_thw, uint32_t num_batch, uint32_t total_patches) {
+    auto dtype = meta.dtype;
+    auto num_position_embeddings = meta.vis_meta.num_position_embeddings;
+    auto hidden_size = meta.vis_meta.hidden_size;
+    auto merge_size = meta.vis_meta.spatial_merge_size;
+    auto num_grid_per_side = static_cast<uint32_t>(sqrt(num_position_embeddings));
+
+    uint32_t total_pixels_offset = 0;
+    std::shared_ptr<Tensor> patch_pos_embeds = Tensor::buffer(dtype, {total_patches, hidden_size}, rsrc.memory_pool);
+    auto pos_embed_weight = rsrc.weights->w_vis->pos_embed_weight;
+
+    std::vector<std::shared_ptr<Tensor>> pos_embeds(4);
+    for (uint32_t i = 0; i < num_batch; ++i) {
+        uint32_t t = grid_thw[i * 3];
+        uint32_t h = grid_thw[i * 3 + 1];
+        uint32_t w = grid_thw[i * 3 + 2];
+        auto weight_array = std::vector<uint16_t>(h * w * hidden_size);
+        auto weight_tensor = Tensor::buffer(dtype, {h * w, hidden_size}, rsrc.memory_pool);
+
+        // compute the bilinear interpolation indices and weights
+        std::vector<std::vector<uint32_t>> indices(4);
+        std::vector<std::vector<float>> weights(4);
+
+        auto linspace = [](float start, float end, uint32_t num_points) -> std::vector<float> {
+            std::vector<float> res(num_points);
+            for (uint32_t i = 0; i < num_points; ++i) {
+                // guard the single-point case so we never divide by zero
+                res[i] = num_points > 1 ? start + (end - start) * i / (num_points - 1) : start;
+            }
+            return res;
+        };
+
+        auto h_idxs = linspace(0, num_grid_per_side - 1, h);
+        auto w_idxs = linspace(0, num_grid_per_side - 1, w);
+
+        for (uint32_t ih = 0; ih < h; ++ih) {
+            for (uint32_t iw = 0; iw < w; ++iw) {
+                float h_idx_f = h_idxs[ih], w_idx_f = w_idxs[iw];
+                uint32_t h_idx_floor = static_cast<uint32_t>(floor(h_idx_f)),
+                         w_idx_floor = static_cast<uint32_t>(floor(w_idx_f));
+                uint32_t h_idx_ceil = std::min(static_cast<uint32_t>(ceil(h_idx_f)), num_grid_per_side - 1),
+                         w_idx_ceil = std::min(static_cast<uint32_t>(ceil(w_idx_f)), num_grid_per_side - 1);
+
+                float dh = h_idx_f - h_idx_floor, dw = w_idx_f - w_idx_floor;
+
+                indices[0].push_back((h_idx_floor * num_grid_per_side) + w_idx_floor);
+                indices[1].push_back((h_idx_floor * num_grid_per_side) + w_idx_ceil);
+                indices[2].push_back((h_idx_ceil * num_grid_per_side) + w_idx_floor);
+                indices[3].push_back((h_idx_ceil * num_grid_per_side) + w_idx_ceil);
+
+                weights[0].push_back((1 - dh) * (1 - dw));
+                weights[1].push_back((1 - dh) * dw);
+                weights[2].push_back(dh * (1 - dw));
+                weights[3].push_back(dh * dw);
+            }
+        }
+
+        // gather from the table and accumulate the weighted sum
+        for (int j = 0; j < 4; ++j) {
+            pos_embeds[j] = Tensor::buffer(dtype, {h * w, hidden_size}, rsrc.memory_pool);
+            // fetch the position embedding at each gathered index, then scale it by its weight
+            for (size_t i = 0; i < h * w; i++) {
+                rearrange(pos_embeds[j]->slice(0, i, 1), pos_embed_weight->slice(0, indices[j][i], 1));
+            }
+            for (size_t i = 0; i < h * w; i++) {
+                uint16_t w_value = f32_to_bf16(weights[j][i]); // NOTE: assumes bf16 activations, which matches this model's dtype
+                for (size_t k = 0; k < hidden_size; k++) {
+                    weight_array[i * hidden_size + k] = w_value;
+                }
+            }
+            RUN_INFINI(infinirtMemcpyAsync(weight_tensor->data(), weight_array.data(), sizeof(uint16_t) * h * w * hidden_size,
+                                           INFINIRT_MEMCPY_H2D, rsrc.stream));
+            mul(pos_embeds[j], pos_embeds[j], weight_tensor);
+        }
+
+        // sum the contributions of the four corners
+        auto patch_pos_embed = pos_embeds[0]; // [h*w, hidden_size]
+        for (int j = 1; j < 4; ++j) {
+            add(patch_pos_embed, patch_pos_embed, pos_embeds[j]);
+        }
+
+        // for videos with T > 1 frames, repeat patch_pos_embed T times
+        if (t > 1) {
+            auto temp_patch_pos_embed = Tensor::buffer(dtype, {t, h * w, hidden_size}, rsrc.memory_pool);
+            for (size_t i = 0; i < t; i++) {
+                rearrange(temp_patch_pos_embed->slice(0, i, 1), patch_pos_embed);
+            }
+            patch_pos_embed = temp_patch_pos_embed;
+        }
+        printf("merge patch pos embed\n");
+        fflush(stdout);
+        patch_pos_embed = patch_pos_embed
+                              ->view({t, h / merge_size, merge_size, w / merge_size, merge_size, hidden_size})
+                              ->permute({0, 1, 3, 2, 4, 5})
+                              ->view({t * h * w, hidden_size}); // the second view may fail because the permuted memory is no longer contiguous
+
+        rearrange(patch_pos_embeds->slice(0, total_pixels_offset, t * h * w), patch_pos_embed);
+        total_pixels_offset += t * h * w;
+    }
+    return patch_pos_embeds;
+}
+
+inline auto rot_pos_embed(const Qwen3vlMeta &meta, Qwen3vlDeviceResource &rsrc, uint32_t *grid_thw, uint32_t num_batch, uint32_t total_patches) {
+    auto dtype = meta.dtype;
+    auto hidden_size = meta.vis_meta.hidden_size;
+    auto num_heads = meta.vis_meta.num_heads;
+    auto head_dim = hidden_size / num_heads;
+    auto merge_size = meta.vis_meta.spatial_merge_size;
+
+    std::vector<std::vector<uint32_t>> pos_ids_table_y(
+        total_patches,
+        std::vector<uint32_t>(head_dim / 4));
+    std::vector<std::vector<uint32_t>> pos_ids_table_x(
+        total_patches,
+        std::vector<uint32_t>(head_dim / 4));
+    for (uint32_t b = 0; b < num_batch; ++b) {
+        uint32_t offset = b * 3;
+        uint32_t num_frames = grid_thw[offset + 0];
+        uint32_t height = grid_thw[offset + 1];
+        uint32_t width = grid_thw[offset + 2];
+
+        uint32_t merged_h = height / merge_size;
+        uint32_t merged_w = width / merge_size;
+
+        // walk every merged block and every position inside each block
+        size_t patch_offset = 0;
+        for (uint32_t bh = 0; bh < merged_h; ++bh) {
+            for (uint32_t bw = 0; bw < merged_w; ++bw) {
+                for (uint32_t ih = 0; ih < merge_size; ++ih) {
+                    for (uint32_t iw = 0; iw < merge_size; ++iw) {
+                        uint32_t row = bh * merge_size + ih;
+                        uint32_t col = bw * merge_size + iw;
+                        // for multi-frame input, repeat num_frames times
+                        for (uint32_t f = 0; f < num_frames; ++f) {
+                            size_t dim_offset = 0;
+                            for (; dim_offset < head_dim / 4; dim_offset++) {
+                                pos_ids_table_y[patch_offset][dim_offset] = row;
+                                pos_ids_table_x[patch_offset][dim_offset] = col;
+                            }
+                            patch_offset++;
+                        }
+                    }
+                }
+            }
+        }
+    }
+    auto sin = Tensor::buffer(dtype, {total_patches, head_dim / 2}, rsrc.memory_pool);
+    auto sin_y = get_custom_SinTable(meta, pos_ids_table_y, head_dim / 2, 10000);
+    rearrange(sin->slice(1, 0, head_dim / 4), sin_y);
+    auto sin_x = get_custom_SinTable(meta, pos_ids_table_x, head_dim / 2, 10000);
+    rearrange(sin->slice(1, head_dim / 4, head_dim / 4), sin_x);
+    auto cos = Tensor::buffer(dtype, {total_patches, head_dim / 2}, rsrc.memory_pool);
+    auto cos_y = get_custom_CosTable(meta, pos_ids_table_y, head_dim / 2, 10000);
+    rearrange(cos->slice(1, 0, head_dim / 4), cos_y);
+    auto cos_x = get_custom_CosTable(meta, pos_ids_table_x, head_dim / 2, 10000);
+    rearrange(cos->slice(1, head_dim / 4, head_dim / 4), cos_x);
+
+    return std::pair{sin, cos};
+}
+
+void inferDeviceBatchVision(const Qwen3vlMeta &meta, Qwen3vlDeviceResource &rsrc,
+                            uint32_t idev, uint32_t ndev, InferRequest &req) {
+    void *pixel_values = req.pixel_values;
+    uint32_t total_patches = req.total_patches;
+    uint32_t *image_grid_thw = req.image_grid_thw;
+    uint32_t num_images = req.num_images;
+    void *pixel_values_videos = req.pixel_values_videos;
+    uint32_t total_patches_videos = req.total_patches_videos;
+    //uint32_t *video_grid_thw = req.video_grid_thw;
+    //uint32_t num_videos = req.num_videos;
+    //uint32_t patch_features = req.patch_features;
+
+    auto dtype = meta.dtype;
+    auto d = meta.vis_meta.hidden_size;
+    auto channels = meta.vis_meta.in_channels;
+    auto patch_size = meta.vis_meta.patch_size;
+    auto temporal_patch_size = meta.vis_meta.temporal_patch_size;
+    //auto stream = rsrc.stream;
+    auto weights = rsrc.weights;
+
+    auto image_tensor = Tensor::weight(pixel_values, dtype, {total_patches, channels * temporal_patch_size * patch_size * patch_size});
+    auto video_tensor = Tensor::weight(pixel_values_videos, dtype, {total_patches_videos, channels * temporal_patch_size * patch_size * patch_size});
+    auto hidden_states = Tensor::buffer(dtype, {total_patches, d, 1, 1, 1}, rsrc.memory_pool);
+
+    std::vector<int64_t> pads = {0, 0, 0};
+    std::vector<int64_t> strides = {static_cast<int64_t>(temporal_patch_size), static_cast<int64_t>(patch_size), static_cast<int64_t>(patch_size)};
+    std::vector<int64_t> dilations = {1, 1, 1};
+    conv(hidden_states, image_tensor, rsrc.weights->w_vis->patch_embed_weight, rsrc.weights->w_vis->patch_embed_bias,
+         pads.data(), strides.data(), dilations.data(), 3);
+    hidden_states = hidden_states->view({total_patches, d});
+
+    auto pos_embeds = fast_pos_embed_interpolate(meta, rsrc, image_grid_thw, num_images, total_patches);
+    add(hidden_states, hidden_states, pos_embeds);
+
+    auto [sin, cos] = rot_pos_embed(meta, rsrc, image_grid_thw, num_images, total_patches);
+
+    // NOTE: the vision transformer blocks are not wired in yet; sin/cos are prepared for them
+}
+
+void inferDeviceBatchText(const Qwen3vlMeta &meta, Qwen3vlDeviceResource &rsrc,
+                          uint32_t idev, uint32_t ndev, InferRequest &req) {
+    const uint32_t *tokens = req.tokens;
+    uint32_t ntok = req.ntok;
+    const uint32_t *req_lens = req.req_lens;
+    uint32_t nreq = req.nreq;
+    const uint32_t *req_pos = req.req_pos;
+    struct Qwen3vlCache **caches = req.kv_caches;
+    const float *temperature = req.temperature;
+    const uint32_t *topk = req.topk;
+    const float *topp = req.topp;
+    uint32_t *output = req.output;
+    void *last_logits = req.logits;
+
+    assert(meta.text_meta.num_attention_heads % ndev == 0);
+    assert(meta.text_meta.num_key_value_heads % ndev == 0);
+
+    auto dtype = meta.dtype;
+    auto nlayer = meta.text_meta.num_hidden_layers;
+    size_t nh = meta.text_meta.num_attention_heads / size_t(ndev);
+    size_t nkvh = meta.text_meta.num_key_value_heads / size_t(ndev);
+    auto ngroup = nh / nkvh;
+    auto dh = meta.text_meta.head_dim;
+    auto d = meta.text_meta.hidden_size;
+    auto di = meta.text_meta.intermediate_size / size_t(ndev);
+    auto dvoc = meta.text_meta.vocab_size;
+    float epsilon = meta.text_meta.rms_norm_eps;
+    auto stream = rsrc.stream;
+    auto weights = rsrc.weights;
+
+    // Allocate buffers
+    auto logits_in = Tensor::buffer(dtype, {ntok, d}, rsrc.memory_pool);
+    auto logits_out = Tensor::buffer(dtype, {ntok, d}, rsrc.memory_pool);
+
+    // current tokens of every request
+    auto qkv_buf = Tensor::buffer(dtype, {ntok, (nh + nkvh * 2) * dh}, rsrc.memory_pool);
+    auto o_buf = Tensor::buffer(dtype, {ntok, nh * dh}, rsrc.memory_pool);
+    auto gate_up_buf = Tensor::buffer(dtype, {ntok, 2 * di}, rsrc.memory_pool);
+
+    auto prob_buf = Tensor::buffer(dtype, {nreq, dvoc}, rsrc.memory_pool);
+    auto result_buf = Tensor::buffer(INFINI_DTYPE_I64, {nreq}, rsrc.memory_pool);
+    auto result_cpu = std::vector<int64_t>(nreq);
+
+    auto qkv_rope = qkv_buf->view({ntok, nh + nkvh * 2, dh});
+    auto q_buf = qkv_rope->slice(1, 0, nh);
+    auto k_buf = qkv_rope->slice(1, nh, nkvh);
+
+    // Prepare inputs
+    auto batch_pos_ids = std::vector<uint32_t>(ntok);
+    size_t req_start = 0;
+    for (uint32_t req = 0; req < nreq; req++) {
+        for (uint32_t i = 0; i < req_lens[req]; i++) { // req_lens[req] is the length of this query, req_pos[req] the history length
+            batch_pos_ids[req_start + i] = req_pos[req] + i; // batch_pos_ids holds each token's position in the flattened batch
+        }
+        req_start += req_lens[req];
+    }
+    std::shared_ptr<Tensor> pos_ids_buf;
+    if (rsrc.device == INFINI_DEVICE_CPU) {
+        pos_ids_buf = Tensor::weight(batch_pos_ids.data(), INFINI_DTYPE_U32, {ntok});
+    } else {
+        pos_ids_buf = Tensor::buffer(INFINI_DTYPE_U32, {ntok}, rsrc.memory_pool);
+        RUN_INFINI(infinirtMemcpyAsync(pos_ids_buf->data(), batch_pos_ids.data(), sizeof(uint32_t) * ntok,
+                                       INFINIRT_MEMCPY_H2D, stream));
+    }
+
+    // convert tokens to embeddings
+    for (uint32_t i = 0; i < ntok; i++) {
+        RUN_INFINI(infinirtMemcpyAsync(logits_in->data(i * d),
+                                       weights->w_lang->in_embd->data(tokens[i] * d),
+                                       dsize(dtype) * d, INFINIRT_MEMCPY_D2D, stream));
+    }
+
+    // attention inner
+    size_t max_qk_size = 0;
+    size_t max_seq_len = 0;
+
+    for (uint32_t req = 0; req < nreq; req++) {
+        auto past_len = req_pos[req];
+        auto seq_len = req_lens[req];
+        auto total_len = past_len + seq_len;
+
+        max_qk_size = std::max(max_qk_size, size_t(seq_len * total_len));
+        max_seq_len = std::max(max_seq_len, size_t(seq_len));
+    }
+
+    auto qk_buf = Tensor::buffer(dtype, {nh * max_qk_size}, rsrc.memory_pool);
+    auto rearrange_q_buf = Tensor::buffer(dtype, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool);
+    auto q_rearrange = rearrange_q_buf->view({nkvh, ngroup, max_seq_len, dh});
+    auto attn_val_buf = Tensor::buffer(dtype, {nkvh, ngroup * max_seq_len, dh}, rsrc.memory_pool);
+    auto attn_val_gemm = attn_val_buf->view({nkvh, ngroup, max_seq_len, dh});
+
+    auto gate_buf = gate_up_buf->slice(1, 0, di);
+    auto up_buf = gate_up_buf->slice(1, di, di);
+
+    // Compute
+    for (uint32_t i = 0; i < nlayer; i++) {
+        // attn norm
+        rmsnorm(logits_out, logits_in, weights->w_lang->layers[i].attn_norm, epsilon);
+        // qkv_proj
+        linear(qkv_buf, logits_out, weights->w_lang->layers[i].attn_qkv_proj, 1.0, 0.0, nullptr, nullptr);
+        // qk_norm
+        rmsnorm(q_buf, q_buf, weights->w_lang->layers[i].attn_q_norm, epsilon);
+        rmsnorm(k_buf, k_buf, weights->w_lang->layers[i].attn_k_norm, epsilon);
+        // rope
+        rope_v2(q_buf, q_buf, pos_ids_buf, weights->sin_table, weights->cos_table);
+        rope_v2(k_buf, k_buf, pos_ids_buf, weights->sin_table, weights->cos_table);
+
+        // process each request one by one
+        size_t token_offset = 0;
+        for (uint32_t req = 0; req < nreq; req++) {
+            auto past_len = req_pos[req];
+            auto seq_len = req_lens[req];
+            auto total_len = past_len + seq_len;
+
+            auto o = o_buf->slice(0, token_offset, seq_len)->view({seq_len, nkvh, ngroup, dh})->permute({1, 2, 0, 3});                    // [nkvh, ngroup, seq_len, dh]
+            auto q = qkv_rope->slice({{0, token_offset, seq_len}, {1, 0, nh}})->view({seq_len, nkvh, ngroup, dh})->permute({1, 2, 0, 3}); // [nkvh, ngroup, seq_len, dh]
+            auto k = qkv_rope->slice({{0, token_offset, seq_len}, {1, nh, nkvh}});        // [ntok, nkvh, dh]
+            auto v = qkv_rope->slice({{0, token_offset, seq_len}, {1, nh + nkvh, nkvh}}); // [ntok, nkvh, dh]
+
+            // concat to cache
+            rearrange(caches[req]->k_rot[idev][i]->slice(0, past_len, seq_len), k);
+            rearrange(caches[req]->v[idev][i]->slice(0, past_len, seq_len), v);
+
+            // fill full_k full_v
+            auto full_k_buff = caches[req]->k_rot[idev][i]->slice(0, 0, total_len)->permute({1, 2, 0}); // [nkvh, dh, total_len]
+            auto full_v_buff = caches[req]->v[idev][i]->slice(0, 0, total_len)->permute({1, 0, 2});     // [nkvh, total_len, dh]
+
+            // self-attn
+            rearrange(q_rearrange->slice(2, 0, seq_len), q);
+            auto attn_score_req = qk_buf->slice(0, 0, nh * seq_len * total_len)->view({nkvh, ngroup * seq_len, total_len});
+            // [nkvh, ngroup * seq_len, dh] @ [nkvh, dh, total_len] = [nkvh, ngroup * seq_len, 
total_len] + linear(attn_score_req,rearrange_q_buf->slice(1, 0, ngroup * seq_len),full_k_buff,1.f / float(sqrt(dh)), 0.f, nullptr, nullptr); + // softmax + auto qk_softmax = attn_score_req->view({nh, seq_len, total_len}); + causalSoftmax(qk_softmax,qk_softmax); + // [nkvh, ngroup * seq_len, total_len] @ [nkvh, total_len, dh] = [nkvh, ngroup * seq_len, dh] + linear(attn_val_buf->slice(1, 0, ngroup * seq_len), attn_score_req, full_v_buff, 1.0, 0.0, nullptr, nullptr); + //printf("rearrage o; layer[%d]\n",i); + rearrange(o,attn_val_gemm->slice(2, 0, seq_len)); + token_offset += seq_len; + } + linear(logits_in, o_buf, weights->w_lang->layers[i].attn_o_proj, 1.0, 0.0, idev == 0 ? logits_in : nullptr, nullptr); + // All_reduce if distributed + if (rsrc.comm != nullptr) { + RUN_INFINI(infinicclAllReduce( + logits_in->data(), logits_in->data(), ntok * d, dtype, + INFINICCL_SUM, rsrc.comm, stream)); + RUN_INFINI(infinirtStreamSynchronize(stream)); + } + + // mlp norm + rmsnorm(logits_out,logits_in,weights->w_lang->layers[i].mlp_norm,epsilon); + // mlp gate_up + linear(gate_up_buf,logits_out,weights->w_lang->layers[i].mlp_gate_up,1.0,0.0,nullptr,nullptr); + // silu + silu(gate_buf,gate_buf); + mul(gate_buf,gate_buf,up_buf); + // mlp down + linear(logits_in,gate_buf,weights->w_lang->layers[i].mlp_down,1.0, 0.0, idev == 0 ? logits_in : nullptr, nullptr); + // All_reduce if distributed + if (rsrc.comm != nullptr) { + RUN_INFINI(infinicclAllReduce( + logits_in->data(), logits_in->data(), ntok * d, dtype, + INFINICCL_SUM, rsrc.comm, stream)); + RUN_INFINI(infinirtStreamSynchronize(stream)); + } + } + // sample and output + if (idev == 0) { + if (last_logits != nullptr) { + rmsnorm(logits_out, logits_in, weights->w_lang->out_norm, epsilon); + auto last_logits_buf = Tensor::buffer(dtype, {ntok, dvoc}, rsrc.memory_pool); + linear(last_logits_buf, logits_out, weights->w_lang->out_embd, 1.0, 0.0, nullptr, nullptr); + RUN_INFINI(infinirtStreamSynchronize(stream)); + RUN_INFINI(infinirtMemcpy(last_logits, last_logits_buf->data(), dsize(dtype) * ntok * dvoc, INFINIRT_MEMCPY_D2H)); + } + if (output != nullptr) { + size_t token_offset = 0; + for (uint32_t req = 0; req < nreq; req++) { + auto seq_len = req_lens[req]; + token_offset += seq_len; + rmsnorm(logits_out->slice(0, req, 1), + logits_in->slice(0, token_offset - 1, 1), + weights->w_lang->out_norm, + epsilon); + } + linear(prob_buf, logits_out->slice(0, 0, nreq), weights->w_lang->out_embd, 1.0, 0.0, nullptr, nullptr); + std::random_device _rd; + std::mt19937 gen(_rd()); + token_offset = 0; + for (uint32_t req = 0; req < nreq; req++) { + auto seq_len = req_lens[req]; + float random_val = std::uniform_real_distribution(0, 1)(gen); + randomSample(result_buf->slice(0, req, 1)->view_as({}, {}), + prob_buf->slice(0, req, 1)->view_as({dvoc}, {1}), + random_val, topp[req], topk[req], temperature[req]); + token_offset += seq_len; + } + RUN_INFINI(infinirtStreamSynchronize(stream)); + RUN_INFINI(infinirtMemcpy(result_cpu.data(), result_buf->data(), + sizeof(int64_t) * nreq, INFINIRT_MEMCPY_D2H)); + for (uint32_t req = 0; req < nreq; req++) { + output[req] = uint32_t(result_cpu[req]); + } + } + } +} + +void inferDeviceBatch(const Qwen3vlMeta &meta, Qwen3vlDeviceResource &rsrc, + uint32_t idev, uint32_t ndev, InferState &state, InferRequest &req) { + // infer vision + sync + if (req.num_images > 0 || req.num_videos > 0){ + inferDeviceBatchVision(meta, rsrc, idev, ndev, req); + + std::unique_lock lock(state.mtx_sync); + state.sync_cnt--; + if (state.sync_cnt == 0) { + 
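+            // the last device to finish its vision pass releases the barrier for everyone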
state.cv_sync.notify_all(); + } else { + state.cv_sync.wait(lock, [&] {return state.sync_cnt == 0;}); + } + } + // infer text + inferDeviceBatchText(meta, rsrc, idev, ndev, req); +} + +__C void +inferBatchQwen3vl(struct Qwen3vlModel *model, + const uint32_t *tokens, uint32_t ntok, + void *pixel_values, uint32_t total_patches, + uint32_t *image_grid_thw, uint32_t num_images, + void *pixel_values_videos, uint32_t total_patches_videos, + uint32_t *video_grid_thw, uint32_t num_videos, + uint32_t patch_features, + const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, + struct Qwen3vlCache **kv_caches, + const float *temperature, const uint32_t *topk, const float *topp, + uint32_t *output) { + model->req.tokens = tokens; + model->req.ntok = ntok; + model->req.pixel_values = pixel_values; + model->req.total_patches = total_patches; + model->req.image_grid_thw = image_grid_thw; + model->req.num_images = num_images; + model->req.pixel_values_videos = pixel_values_videos; + model->req.total_patches_videos = total_patches_videos; + model->req.video_grid_thw = video_grid_thw; + model->req.num_videos = num_videos; + model->req.patch_features = patch_features; + model->req.req_lens = req_lens; + model->req.nreq = nreq; + model->req.req_pos = req_pos; + model->req.kv_caches = kv_caches; + model->req.output = output; + model->req.logits = nullptr; + model->req.temperature = temperature; + model->req.topk = topk; + model->req.topp = topp; + model->states[0].sync_cnt = model->dev_ids.size(); + + for (size_t idev = 0; idev < model->dev_ids.size(); idev++) { + std::unique_lock lock(model->states[idev].mtx); + model->states[idev].proceed = true; + lock.unlock(); + model->states[idev].cv_start.notify_one(); + } + for (size_t i = model->dev_ids.size(); i > 0; i--) { + auto idev = i - 1; + std::unique_lock lock(model->states[idev].mtx); + model->states[idev].cv_done.wait(lock, [&] { return !(model->states[idev].proceed); }); + lock.unlock(); + } +} + +__C void +forwardBatchQwen3vl(struct Qwen3vlModel *model, + const uint32_t *tokens, uint32_t ntok, + void *pixel_values, uint32_t total_patches, + uint32_t *image_grid_thw, uint32_t num_images, + void *pixel_values_videos, uint32_t total_patches_videos, + uint32_t *video_grid_thw, uint32_t num_videos, + uint32_t patch_features, + const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos, + struct Qwen3vlCache **kv_caches, + void *logits) { + model->req.tokens = tokens; + model->req.ntok = ntok; + model->req.pixel_values = pixel_values; + model->req.total_patches = total_patches; + model->req.image_grid_thw = image_grid_thw; + model->req.num_images = num_images; + model->req.pixel_values_videos = pixel_values_videos; + model->req.total_patches_videos = total_patches_videos; + model->req.video_grid_thw = video_grid_thw; + model->req.num_videos = num_videos; + model->req.patch_features = patch_features; + model->req.req_lens = req_lens; + model->req.nreq = nreq; + model->req.req_pos = req_pos; + model->req.kv_caches = kv_caches; + model->req.output = nullptr; + model->req.logits = logits; + model->req.temperature = nullptr; + model->req.topk = nullptr; + model->req.topp = nullptr; + model->states[0].sync_cnt = model->dev_ids.size(); + + for (size_t idev = 0; idev < model->dev_ids.size(); idev++) { + std::unique_lock lock(model->states[idev].mtx); + model->states[idev].proceed = true; + lock.unlock(); + model->states[idev].cv_start.notify_one(); + } + for (size_t i = model->dev_ids.size(); i > 0; i--) { + auto idev = i - 1; + std::unique_lock 
lock(model->states[idev].mtx); + model->states[idev].cv_done.wait(lock, [&] { return !(model->states[idev].proceed); }); + lock.unlock(); + } +} + +void launchDevice(const Qwen3vlMeta &meta, std::shared_ptr weights, Qwen3vlDeviceResource *rsrc, InferState &state, InferRequest &req, + infiniDevice_t device, int idev, int ndev, int dev_id, infinicclComm_t comm) { + // Create Device Resource + createDeviceResource(rsrc, &meta, weights, device, idev, ndev, dev_id, comm); + + CacheManager cache_manager(100); + InferenceContext ctx(rsrc->handle, rsrc->memory_pool, &cache_manager, rsrc->stream); + + // Set the inference context for this thread + setInferenceContext(&ctx); + + { + std::unique_lock lock(state.mtx); + state.loaded = true; + lock.unlock(); + state.cv_load.notify_one(); + } + + // Infer Loop + while (true) { + std::unique_lock lock(state.mtx); + state.cv_start.wait(lock, [&] { return state.proceed || state.exit_flag; }); + // quit if exit_flag is set + if (state.exit_flag) { + break; + } + + inferDeviceBatch(meta, *rsrc, idev, ndev, state, req); + + state.proceed = false; + lock.unlock(); + state.cv_done.notify_one(); + } + + // Clean-Up + releaseDeviceResource(*rsrc); + setInferenceContext(nullptr); // Clear the context when done +} + + +Qwen3vlModel::Qwen3vlModel(const Qwen3vlMeta *_meta, const Qwen3vlWeights *weights) : meta(*_meta) { + auto device_weights = weights->device_weights; + int ndev = device_weights.size(); + device = device_weights[0]->device; + dev_ids.resize(ndev); + for (int i = 0; i < ndev; i++) { + dev_ids[i] = device_weights[i]->dev_id; + } + dev_resources = std::vector(ndev); + states = std::vector(ndev); + threads.resize(ndev); + RUN_INFINI(infinirtInit()); + auto comms = std::vector(ndev, nullptr); + if (ndev > 1) { + RUN_INFINI(infinicclCommInitAll(device, comms.data(), ndev, dev_ids.data())); + } + for (int i = 0; i < ndev; i++) { + threads[i] = std::thread(launchDevice, std::cref(meta), device_weights[i], &dev_resources[i], std::ref(states[i]), std::ref(req), device, i, ndev, dev_ids[i], comms[i]); + } + for (int i = 0; i < ndev; i++) { + std::unique_lock lock(states[i].mtx); + states[i].cv_load.wait(lock, [&] { return states[i].loaded; }); + lock.unlock(); + } +} + +__C struct Qwen3vlModel * +createQwen3vlModel(const Qwen3vlMeta *_meta, + const Qwen3vlWeights *weights) { + Qwen3vlModel *model = new Qwen3vlModel(_meta, weights); + return model; +} + +__C void +destroyQwen3vlModel(struct Qwen3vlModel *model) { + auto ndev = model->dev_resources.size(); + + for (size_t idev = 0; idev < ndev; idev++) { + std::unique_lock lock(model->states[idev].mtx); + model->states[idev].exit_flag = true; + lock.unlock(); + model->states[idev].cv_start.notify_one(); + } + + for (size_t idev = 0; idev < ndev; idev++) { + model->threads[idev].join(); + } + + delete model; +} diff --git a/src/models/qwen3vl/qwen3vl_cache.cpp b/src/models/qwen3vl/qwen3vl_cache.cpp new file mode 100644 index 00000000..b10b86c8 --- /dev/null +++ b/src/models/qwen3vl/qwen3vl_cache.cpp @@ -0,0 +1,43 @@ +#include "qwen3vl_impl.hpp" + +__C struct Qwen3vlCache * +createQwen3vlCache(const struct Qwen3vlModel *model) { + Qwen3vlCache *cache = new Qwen3vlCache(); + auto ndev = model->dev_resources.size(); + auto nlayer = model->meta.text_meta.num_hidden_layers; + auto max_len = model->meta.text_meta.max_tokens; + auto dh = model->meta.text_meta.head_dim; + auto nkv = model->meta.text_meta.num_key_value_heads / size_t(ndev); + auto k_rot_shape = std::vector{max_len, nkv, dh}; + auto v_shape = 
std::vector{max_len, nkv, dh};
+    for (size_t idev = 0; idev < ndev; idev++) {
+        RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev]));
+        auto k_rot_cache = std::vector<std::shared_ptr<Tensor>>();
+        auto v_cache = std::vector<std::shared_ptr<Tensor>>();
+        for (size_t layer = 0; layer < nlayer; layer++) {
+            k_rot_cache.push_back(std::move(Tensor::buffer(model->meta.dtype, k_rot_shape)));
+            v_cache.push_back(std::move(Tensor::buffer(model->meta.dtype, v_shape)));
+        }
+        cache->k_rot.push_back(k_rot_cache);
+        cache->v.push_back(v_cache);
+    }
+
+    return cache;
+}
+
+////// TODO: does the visual deepstack need its own cache as well?
+
+__C void
+dropQwen3vlCache(const struct Qwen3vlModel *model,
+                 struct Qwen3vlCache *cache) {
+    auto ndev = model->dev_resources.size();
+    auto nlayer = model->meta.text_meta.num_hidden_layers;
+    for (size_t idev = 0; idev < ndev; idev++) {
+        RUN_INFINI(infinirtSetDevice(model->device, model->dev_ids[idev]));
+        for (size_t layer = 0; layer < nlayer; layer++) {
+            cache->k_rot[idev][layer].reset();
+            cache->v[idev][layer].reset();
+        }
+    }
+    delete cache;
+}
\ No newline at end of file
diff --git a/src/models/qwen3vl/qwen3vl_impl.hpp b/src/models/qwen3vl/qwen3vl_impl.hpp
new file mode 100644
index 00000000..76dd9d0d
--- /dev/null
+++ b/src/models/qwen3vl/qwen3vl_impl.hpp
@@ -0,0 +1,143 @@
+#ifndef QWEN3VL_IMPL_H
+#define QWEN3VL_IMPL_H
+
+#include "infinicore_infer.h"
+
+#include "../../allocator.hpp"
+#include "../../tensor.hpp"
+
+#include <condition_variable>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+struct Qwen3vlLayerWeight {
+    std::shared_ptr<Tensor> attn_norm;
+    std::shared_ptr<Tensor> attn_qkv_proj;
+    std::shared_ptr<Tensor> attn_q_norm;
+    std::shared_ptr<Tensor> attn_k_norm;
+    std::shared_ptr<Tensor> attn_o_proj;
+
+    std::shared_ptr<Tensor> mlp_norm;
+    std::shared_ptr<Tensor> mlp_gate_up;
+    std::shared_ptr<Tensor> mlp_down;
+};
+
+struct Qwen3vlLanguageModelWeight {
+    std::shared_ptr<Tensor> in_embd, out_embd, out_norm;
+    std::vector<Qwen3vlLayerWeight> layers;
+};
+
+struct Qwen3vlVisBlockWeight {
+    std::shared_ptr<Tensor> attn_proj_weight, attn_proj_bias, attn_qkv_weight, attn_qkv_bias;
+    std::shared_ptr<Tensor> mlp_linear_fc1_weight, mlp_linear_fc1_bias, mlp_linear_fc2_weight, mlp_linear_fc2_bias;
+    std::shared_ptr<Tensor> norm1_weight, norm1_bias, norm2_weight, norm2_bias;
+};
+
+struct DeepstackMergerWeight {
+    std::shared_ptr<Tensor> linear_fc1_weight, linear_fc1_bias, linear_fc2_weight, linear_fc2_bias;
+    std::shared_ptr<Tensor> norm_weight, norm_bias;
+};
+
+struct MergerWeight {
+    std::shared_ptr<Tensor> linear_fc1_weight, linear_fc1_bias, linear_fc2_weight, linear_fc2_bias;
+    std::shared_ptr<Tensor> norm_weight, norm_bias;
+};
+
+struct Qwen3vlVisualEncoderWeight {
+    std::shared_ptr<Tensor> patch_embed_weight, patch_embed_bias, pos_embed_weight;
+    std::vector<Qwen3vlVisBlockWeight> blocks;
+    std::vector<DeepstackMergerWeight> deepstack_mergers;
+    std::shared_ptr<MergerWeight> merger;
+};
+
+struct Qwen3vlDeviceWeights {
+    std::shared_ptr<Tensor> sin_table, cos_table;
+    std::shared_ptr<Qwen3vlLanguageModelWeight> w_lang;
+    std::shared_ptr<Qwen3vlVisualEncoderWeight> w_vis;
+    infiniDevice_t device;
+    int dev_id;
+    infinirtStream_t load_stream;
+};
+
+struct Qwen3vlWeights {
+    Qwen3vlMeta const *meta;
+    bool transpose_weight;
+    std::vector<std::shared_ptr<Qwen3vlDeviceWeights>> device_weights;
+
+    Qwen3vlWeights(const Qwen3vlMeta *meta,
+                   infiniDevice_t device,
+                   int ndev,
+                   const int *dev_ids,
+                   bool transpose_weight);
+};
+
+struct Qwen3vlDeviceResource {
+    // Device
+    infiniDevice_t device;
+    int device_id;
+    infiniopHandle_t handle;
+    // Weights
+    std::shared_ptr<Qwen3vlDeviceWeights> weights;
+    // Streams
+    infinirtStream_t stream;
+    // Communicator
+    infinicclComm_t comm;
+
+    std::shared_ptr<MemoryPool> memory_pool;
+};
+
+struct InferState { // qwen3vl namespace
+    inline static std::mutex mtx_sync;
+    inline static int sync_cnt;
+    inline static std::condition_variable cv_sync;
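+    // the inline static members above are shared by all device threads and form the
+    // barrier between the vision and text passes; the members below are per-device
+    // handshake state between the host thread and its worker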
+ std::mutex mtx; + std::condition_variable cv_load, cv_start, cv_done; + bool loaded = false; + bool proceed = false; + bool exit_flag = false; +}; + +struct InferRequest { // qwen3vl namespace + const uint32_t *tokens; + uint32_t ntok; + void *pixel_values; + uint32_t total_patches; + uint32_t *image_grid_thw; + uint32_t num_images; + void *pixel_values_videos; + uint32_t total_patches_videos; + uint32_t *video_grid_thw; + uint32_t num_videos; + uint32_t patch_features; + const uint32_t *req_lens; + uint32_t nreq; + const uint32_t *req_pos; + struct Qwen3vlCache **kv_caches; + const float *temperature; + const uint32_t *topk; + const float *topp; + uint32_t *output; + void *logits; +}; + +struct Qwen3vlModel { + Qwen3vlMeta meta; + infiniDevice_t device; + std::vector dev_ids; + std::vector dev_resources; + std::vector states; + std::vector threads; + InferRequest req; + + Qwen3vlModel(const Qwen3vlMeta *, const Qwen3vlWeights *weights); +}; + +struct Qwen3vlCache { + std::vector>> k_rot, v; +}; + +#endif \ No newline at end of file diff --git a/src/models/qwen3vl/qwen3vl_weight.cpp b/src/models/qwen3vl/qwen3vl_weight.cpp new file mode 100644 index 00000000..ce9bbba5 --- /dev/null +++ b/src/models/qwen3vl/qwen3vl_weight.cpp @@ -0,0 +1,646 @@ +#include "qwen3vl_impl.hpp" + +#include + +inline std::shared_ptr getInEmbd( + const Qwen3vlMeta *meta) { + auto shape = std::vector({meta->text_meta.vocab_size, meta->text_meta.hidden_size}); + return Tensor::weight(nullptr, meta->dtype, shape); +} + +inline std::shared_ptr getOutNorm( + const Qwen3vlMeta *meta) { + auto shape = std::vector({meta->text_meta.hidden_size}); + return Tensor::weight(nullptr, meta->dtype, shape); +} + +inline std::shared_ptr getOutEmbd( + const Qwen3vlMeta *meta) { + + auto shape = std::vector({meta->text_meta.vocab_size, meta->text_meta.hidden_size}); + return Tensor::weight(nullptr, meta->dtype, shape) + ->permute({1, 0}); +} + +inline void getLayerWeight( + const Qwen3vlMeta *meta, Qwen3vlLayerWeight& layer, int ndev) { + auto nkvh = meta->text_meta.num_key_value_heads; + auto nh = meta->text_meta.num_attention_heads; + auto dh = meta->text_meta.head_dim; + auto d = meta->text_meta.hidden_size; + auto di = meta->text_meta.intermediate_size; + + auto dh_shape = std::vector({meta->text_meta.hidden_size}); + layer.attn_norm = Tensor::weight(nullptr, meta->dtype, dh_shape); + auto qk_norm_shape = std::vector({meta->text_meta.head_dim}); + layer.attn_q_norm = Tensor::weight(nullptr, meta->dtype, qk_norm_shape); + layer.attn_k_norm = Tensor::weight(nullptr, meta->dtype, qk_norm_shape); + auto qkv_proj_shape = std::vector({(nh + 2 * nkvh) / ndev * dh, d}); + layer.attn_qkv_proj = Tensor::weight(nullptr, meta->dtype, qkv_proj_shape); + auto o_proj_shape = std::vector({d, nh / ndev * dh}); + layer.attn_o_proj = Tensor::weight(nullptr, meta->dtype, o_proj_shape); + + layer.mlp_norm = Tensor::weight(nullptr, meta->dtype, dh_shape); + auto up_shape = std::vector({2 * di / ndev, d}); + layer.mlp_gate_up = Tensor::weight(nullptr, meta->dtype, up_shape); + auto down_shape = std::vector({d, di / ndev}); + layer.mlp_down = Tensor::weight(nullptr, meta->dtype, down_shape); +} + + +inline void getVisualWeight( + const Qwen3vlMeta *meta, std::shared_ptr w_vis) { + Qwen3vlVisMeta vis_meta = meta->vis_meta; + auto patch_embed_shape = std::vector({vis_meta.hidden_size , vis_meta.in_channels, vis_meta.temporal_patch_size, vis_meta.patch_size, vis_meta.patch_size}); + w_vis->patch_embed_weight = Tensor::weight(nullptr, meta->dtype, 
patch_embed_shape); + w_vis->patch_embed_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size}); + w_vis->pos_embed_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.num_position_embeddings, vis_meta.hidden_size}); + w_vis->merger = std::make_shared(); + w_vis->merger->linear_fc1_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size, vis_meta.intermediate_size}); + w_vis->merger->linear_fc2_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.out_hidden_size, vis_meta.intermediate_size}); + w_vis->merger->linear_fc1_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size}); + w_vis->merger->linear_fc2_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.out_hidden_size}); + w_vis->merger->norm_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size}); + w_vis->merger->norm_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size}); + w_vis->blocks = std::vector(vis_meta.depth); + for (size_t i = 0; i < vis_meta.depth; i++) { + w_vis->blocks[i].attn_proj_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size,vis_meta.hidden_size}); + w_vis->blocks[i].attn_proj_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size}); + w_vis->blocks[i].attn_qkv_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.in_channels*vis_meta.hidden_size,vis_meta.hidden_size}); + w_vis->blocks[i].attn_qkv_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.in_channels*vis_meta.hidden_size}); + w_vis->blocks[i].mlp_linear_fc1_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size, vis_meta.hidden_size}); + w_vis->blocks[i].mlp_linear_fc1_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size}); + w_vis->blocks[i].mlp_linear_fc2_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size, vis_meta.intermediate_size}); + w_vis->blocks[i].mlp_linear_fc2_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size}); + w_vis->blocks[i].norm1_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size}); + w_vis->blocks[i].norm1_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size}); + w_vis->blocks[i].norm2_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size}); + w_vis->blocks[i].norm2_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.hidden_size}); + } + w_vis->deepstack_mergers = std::vector(3); + for (size_t i = 0; i < 3; i++){ + w_vis->deepstack_mergers[i].linear_fc1_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size,vis_meta.intermediate_size}); + w_vis->deepstack_mergers[i].linear_fc2_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.out_hidden_size,vis_meta.intermediate_size}); + w_vis->deepstack_mergers[i].linear_fc1_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size}); + w_vis->deepstack_mergers[i].linear_fc2_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.out_hidden_size}); + w_vis->deepstack_mergers[i].norm_weight = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size}); + w_vis->deepstack_mergers[i].norm_bias = Tensor::weight(nullptr, meta->dtype, {vis_meta.intermediate_size}); + } + +} + + +inline std::shared_ptr getSinTable(const Qwen3vlMeta *meta) { + auto half_dh = meta->text_meta.head_dim / 2; + auto unit = dsize(meta->dtype); + void *table = std::malloc(meta->text_meta.max_tokens * half_dh * unit); + + for (size_t i = 0; i < meta->text_meta.max_tokens; i++) { + for (size_t j = 0; j < half_dh; j++) { + float _sin = std::sin( + static_cast(i) 
/ std::pow(meta->text_meta.rope_theta, static_cast(j) / half_dh)); + if (meta->dtype == INFINI_DTYPE_F16) { + ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(_sin); + } else if (meta->dtype == INFINI_DTYPE_BF16) { + ((uint16_t *)table)[i * half_dh + j] = f32_to_bf16(_sin); + } else if (meta->dtype == INFINI_DTYPE_F32) { + ((float *)table)[i * half_dh + j] = _sin; + } else { + std::cout << "unsupported data type" << std::endl; + exit(1); + } + } + } + auto shape = std::vector({meta->text_meta.max_tokens, half_dh}); + auto tensor = Tensor::weight(table, meta->dtype, shape); + std::free(table); + return tensor; +} + +inline std::shared_ptr getCosTable(const Qwen3vlMeta *meta) { + auto half_dh = meta->text_meta.head_dim / 2; + auto unit = dsize(meta->dtype); + void *table = std::malloc(meta->text_meta.max_tokens * half_dh * unit); + + for (size_t i = 0; i < meta->text_meta.max_tokens; i++) { + for (size_t j = 0; j < half_dh; j++) { + float _cos = std::cos( + static_cast(i) / std::pow(meta->text_meta.rope_theta, static_cast(j) / half_dh)); + if (meta->dtype == INFINI_DTYPE_F16) { + ((uint16_t *)table)[i * half_dh + j] = f32_to_f16(_cos); + } else if (meta->dtype == INFINI_DTYPE_BF16) { + ((uint16_t *)table)[i * half_dh + j] = f32_to_bf16(_cos); + } else if (meta->dtype == INFINI_DTYPE_F32) { + ((float *)table)[i * half_dh + j] = _cos; + } else { + std::cout << "unsupported data type" << std::endl; + exit(1); + } + } + } + auto shape = std::vector({meta->text_meta.max_tokens, half_dh}); + auto tensor = Tensor::weight(table, meta->dtype, shape); + std::free(table); + return tensor; +} + +Qwen3vlWeights::Qwen3vlWeights( + const Qwen3vlMeta *_meta, infiniDevice_t device, int ndev, const int *dev_ids, bool _transpose_weight) { + meta = _meta; + transpose_weight = _transpose_weight; + device_weights = std::vector>(ndev); + for (int dev = 0; dev < ndev; dev++) { + int dev_id = dev_ids[dev]; + RUN_INFINI(infinirtSetDevice(device, dev_id)); + device_weights[dev] = std::make_shared(); + device_weights[dev]->device = device; + device_weights[dev]->dev_id = dev_id; + RUN_INFINI(infinirtStreamCreate(&device_weights[dev]->load_stream)); + device_weights[dev]->w_lang = std::make_shared(); + device_weights[dev]->w_vis = std::make_shared(); + + device_weights[dev]->w_lang->in_embd = getInEmbd(meta); + device_weights[dev]->w_lang->out_norm = getOutNorm(meta); + device_weights[dev]->w_lang->out_embd = getOutEmbd(meta); + device_weights[dev]->sin_table = getSinTable(meta); + device_weights[dev]->cos_table = getCosTable(meta); + + device_weights[dev]->w_lang->layers = std::vector(meta->text_meta.num_hidden_layers); + + for (size_t layer = 0; layer < meta->text_meta.num_hidden_layers; layer++) { + getLayerWeight(meta, device_weights[dev]->w_lang->layers[layer], ndev); + } + + getVisualWeight(meta, device_weights[dev]->w_vis); + + } +} + +//--- Lang Global +void load_input_embd(Qwen3vlWeights *weights, void *cpu_ptr) { + std::cout << "Loading input embedding from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_lang->in_embd->load(cpu_ptr, weight->load_stream); + } +} + +void load_output_norm(Qwen3vlWeights *weights, void *cpu_ptr) { + std::cout << "Loading output norm from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + 
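+        // norm vectors are small, so every device keeps a full (unsharded) replica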
RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_lang->out_norm->load(cpu_ptr, weight->load_stream); + } +} + +void load_output_embd(Qwen3vlWeights *weights, void *cpu_ptr) { + std::cout << "Loading output embedding from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_lang->out_embd->load(cpu_ptr, weight->load_stream); + if(weights->transpose_weight) { + weight->w_lang->out_embd->permute({1,0}); //[d,voc] + } + } +} + +// --- Attention +void load_attn_norm(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading attention norm " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_lang->layers[layer].attn_norm->load(cpu_ptr, weight->load_stream); + } +} + +void load_attn_q_norm(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading attention q_norm " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_lang->layers[layer].attn_q_norm->load(cpu_ptr, weight->load_stream); + } +} + +void load_attn_qkv_proj(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading attention q_proj " << layer << " from " << cpu_ptr << std::endl; + int ndev = int(weights->device_weights.size()); + auto nkvh = weights->meta->text_meta.num_key_value_heads; + auto nh = weights->meta->text_meta.num_attention_heads; + auto dh = weights->meta->text_meta.head_dim; + auto d = weights->meta->text_meta.hidden_size; + //[ndev,nh+2*nkvh,dh,d] + for (int idev = 0; idev < ndev; idev++) { + auto weight = weights->device_weights[idev]; + size_t offset = idev * ((nkvh * 2 + nh) / ndev * dh) * d * dsize(weights->meta->dtype); + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_lang->layers[layer].attn_qkv_proj->load((char *)cpu_ptr + offset, weight->load_stream); + if(weights->transpose_weight) { + weight->w_lang->layers[layer].attn_qkv_proj = + weight->w_lang->layers[layer].attn_qkv_proj->permute({1,0}); //[d, (nh+2*nkvh)*dh] + } + } +} + +void load_attn_k_norm(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading attention k_norm " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_lang->layers[layer].attn_k_norm->load(cpu_ptr, weight->load_stream); + } +} + +void load_attn_o_proj(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading attention o_proj " << layer << " from " << cpu_ptr << std::endl; + int ndev = int(weights->device_weights.size()); + auto nh = weights->meta->text_meta.num_attention_heads; + auto dh = weights->meta->text_meta.head_dim; + auto d = weights->meta->text_meta.hidden_size; + // [ndev, d, nh // ndev * dh] + for (int idev = 0; idev < ndev; idev++) { + auto weight = weights->device_weights[idev]; + size_t offset = idev * d * (nh / ndev * dh) * dsize(weights->meta->dtype); + 
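+        // device idev reads its own contiguous [d, nh/ndev*dh] shard from the host buffer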
RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_lang->layers[layer].attn_o_proj->load((char *)cpu_ptr + offset, weight->load_stream); + if(weights->transpose_weight) { + weight->w_lang->layers[layer].attn_o_proj = + weight->w_lang->layers[layer].attn_o_proj->permute({1,0}); //[nh/ndev*dh, d] + } + } +} + +// --- MLP +void load_mlp_norm(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading mlp norm " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_lang->layers[layer].mlp_norm->load(cpu_ptr, weight->load_stream); + } +} + +void load_mlp_gate_up(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading mlp gate " << layer << " from " << cpu_ptr << std::endl; + int ndev = int(weights->device_weights.size()); + auto di = weights->meta->text_meta.intermediate_size; + auto d = weights->meta->text_meta.hidden_size; + // [ndev, 2*di // ndev, d] + for (int idev = 0; idev < ndev; idev++) { + auto weight = weights->device_weights[idev]; + size_t offset = idev * (2 * di / ndev) * d * dsize(weights->meta->dtype); + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_lang->layers[layer].mlp_gate_up->load((char *)cpu_ptr + offset, weight->load_stream); + if(weights->transpose_weight) { + weight->w_lang->layers[layer].mlp_gate_up = + weight->w_lang->layers[layer].mlp_gate_up->permute({1,0}); //[d, 2*di/ndev] + } + } +} + +void load_mlp_down(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading mlp down " << layer << " from " << cpu_ptr << std::endl; + int ndev = int(weights->device_weights.size()); + auto di = weights->meta->text_meta.intermediate_size; + auto d = weights->meta->text_meta.hidden_size; + //[ndev, d, di // ndev] + for (int idev = 0; idev < ndev; idev++) { + auto weight = weights->device_weights[idev]; + size_t offset = idev * d * (di / ndev) * dsize(weights->meta->dtype); + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_lang->layers[layer].mlp_down->load((char *)cpu_ptr + offset, weight->load_stream); + if(weights->transpose_weight) { + weight->w_lang->layers[layer].mlp_down = + weight->w_lang->layers[layer].mlp_down->permute({1,0}); //[di/ndev, d] + } + } +} + +// --- Vision weights +void load_patch_embed_weight(Qwen3vlWeights *weights, void *cpu_ptr) { + std::cout << "Loading patch embed weight from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->patch_embed_weight->load(cpu_ptr, weight->load_stream); + } +} + +void load_patch_embed_bias(Qwen3vlWeights *weights, void *cpu_ptr) { + std::cout << "Loading patch embed bias from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->patch_embed_bias->load(cpu_ptr, weight->load_stream); + } +} + +void load_pos_embed_weight(Qwen3vlWeights *weights, void *cpu_ptr) { + std::cout << "Loading pos embed weight from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + 
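+        // the vision tower is not tensor-parallel: each device loads the full weight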
RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->pos_embed_weight->load(cpu_ptr, weight->load_stream); + } +} + +// Vision block attention +void load_attn_proj_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading vision attn proj weight " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->blocks[layer].attn_proj_weight->load(cpu_ptr, weight->load_stream); + } +} + +void load_attn_proj_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading vision attn proj bias " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->blocks[layer].attn_proj_bias->load(cpu_ptr, weight->load_stream); + } +} + +void load_attn_qkv_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading vision attn qkv weight " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->blocks[layer].attn_qkv_weight->load(cpu_ptr, weight->load_stream); + } +} + +void load_attn_qkv_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading vision attn qkv bias " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->blocks[layer].attn_qkv_bias->load(cpu_ptr, weight->load_stream); + } +} + +// Vision block mlp +void load_mlp_linear_fc1_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading vision mlp fc1 weight " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->blocks[layer].mlp_linear_fc1_weight->load(cpu_ptr, weight->load_stream); + } +} + +void load_mlp_linear_fc1_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading vision mlp fc1 bias " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->blocks[layer].mlp_linear_fc1_bias->load(cpu_ptr, weight->load_stream); + } +} + +void load_mlp_linear_fc2_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading vision mlp fc2 weight " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->blocks[layer].mlp_linear_fc2_weight->load(cpu_ptr, weight->load_stream); + } +} + +void load_mlp_linear_fc2_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading vision mlp fc2 bias " << layer << " from " << cpu_ptr 
<< std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->blocks[layer].mlp_linear_fc2_bias->load(cpu_ptr, weight->load_stream); + } +} + +// Vision block norm +void load_norm1_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading vision norm1 weight " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->blocks[layer].norm1_weight->load(cpu_ptr, weight->load_stream); + } +} + +void load_norm1_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading vision norm1 bias " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->blocks[layer].norm1_bias->load(cpu_ptr, weight->load_stream); + } +} + +void load_norm2_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading vision norm2 weight " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->blocks[layer].norm2_weight->load(cpu_ptr, weight->load_stream); + } +} + +void load_norm2_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading vision norm2 bias " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->blocks[layer].norm2_bias->load(cpu_ptr, weight->load_stream); + } +} + +// Deepstack merger +void load_deepstack_merger_linear_fc1_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading deepstack merger fc1 weight " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->deepstack_mergers[layer].linear_fc1_weight->load(cpu_ptr, weight->load_stream); + } +} + +void load_deepstack_merger_linear_fc1_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading deepstack merger fc1 bias " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->deepstack_mergers[layer].linear_fc1_bias->load(cpu_ptr, weight->load_stream); + } +} + +void load_deepstack_merger_linear_fc2_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading deepstack merger fc2 weight " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->deepstack_mergers[layer].linear_fc2_weight->load(cpu_ptr, 
weight->load_stream); + } +} + +void load_deepstack_merger_linear_fc2_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading deepstack merger fc2 bias " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->deepstack_mergers[layer].linear_fc2_bias->load(cpu_ptr, weight->load_stream); + } +} + +void load_deepstack_merger_norm_weight(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading deepstack merger norm weight " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->deepstack_mergers[layer].norm_weight->load(cpu_ptr, weight->load_stream); + } +} + +void load_deepstack_merger_norm_bias(Qwen3vlWeights *weights, void *cpu_ptr, size_t layer) { + std::cout << "Loading deepstack merger norm bias " << layer << " from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->deepstack_mergers[layer].norm_bias->load(cpu_ptr, weight->load_stream); + } +} + +// Merger +void load_merger_linear_fc1_weight(Qwen3vlWeights *weights, void *cpu_ptr) { + std::cout << "Loading merger fc1 weight from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->merger->linear_fc1_weight->load(cpu_ptr, weight->load_stream); + } +} + +void load_merger_linear_fc1_bias(Qwen3vlWeights *weights, void *cpu_ptr) { + std::cout << "Loading merger fc1 bias from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->merger->linear_fc1_bias->load(cpu_ptr, weight->load_stream); + } +} + +void load_merger_linear_fc2_weight(Qwen3vlWeights *weights, void *cpu_ptr) { + std::cout << "Loading merger fc2 weight from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->merger->linear_fc2_weight->load(cpu_ptr, weight->load_stream); + } +} + +void load_merger_linear_fc2_bias(Qwen3vlWeights *weights, void *cpu_ptr) { + std::cout << "Loading merger fc2 bias from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->merger->linear_fc2_bias->load(cpu_ptr, weight->load_stream); + } +} + +void load_merger_norm_weight(Qwen3vlWeights *weights, void *cpu_ptr) { + std::cout << "Loading merger norm weight from " << cpu_ptr << std::endl; + for (int dev = 0; dev < int(weights->device_weights.size()); dev++) { + auto weight = weights->device_weights[dev]; + RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id)); + weight->w_vis->merger->norm_weight->load(cpu_ptr, 
+    }
+}
+
+void load_merger_norm_bias(Qwen3vlWeights *weights, void *cpu_ptr) {
+    std::cout << "Loading merger norm bias from " << cpu_ptr << std::endl;
+    for (int dev = 0; dev < int(weights->device_weights.size()); dev++) {
+        auto weight = weights->device_weights[dev];
+        RUN_INFINI(infinirtSetDevice(weight->device, weight->dev_id));
+        weight->w_vis->merger->norm_bias->load(cpu_ptr, weight->load_stream);
+    }
+}
+
+// Function-pointer table wiring every Qwen3VL weight tensor to its loader;
+// handed out to callers via createQwen3vlWeightLoader().
+static Qwen3vlWeightLoader weight_loader = {
+    // Language model loaders
+    .lang_loader = {
+        .load_input_embd = load_input_embd,
+        .load_output_norm = load_output_norm,
+        .load_output_embd = load_output_embd,
+        .load_attn_norm = load_attn_norm,
+        .load_attn_q_norm = load_attn_q_norm,
+        .load_attn_k_norm = load_attn_k_norm,
+        .load_attn_qkv_proj = load_attn_qkv_proj,
+        .load_attn_o_proj = load_attn_o_proj,
+        .load_mlp_norm = load_mlp_norm,
+        .load_mlp_gate_up = load_mlp_gate_up,
+        .load_mlp_down = load_mlp_down,
+    },
+    // Vision model loaders
+    .vis_loader = {
+        .load_patch_embed_weight = load_patch_embed_weight,
+        .load_patch_embed_bias = load_patch_embed_bias,
+        .load_pos_embed_weight = load_pos_embed_weight,
+        .load_attn_proj_weight = load_attn_proj_weight,
+        .load_attn_proj_bias = load_attn_proj_bias,
+        .load_attn_qkv_weight = load_attn_qkv_weight,
+        .load_attn_qkv_bias = load_attn_qkv_bias,
+        .load_mlp_linear_fc1_weight = load_mlp_linear_fc1_weight,
+        .load_mlp_linear_fc1_bias = load_mlp_linear_fc1_bias,
+        .load_mlp_linear_fc2_weight = load_mlp_linear_fc2_weight,
+        .load_mlp_linear_fc2_bias = load_mlp_linear_fc2_bias,
+        .load_norm1_weight = load_norm1_weight,
+        .load_norm1_bias = load_norm1_bias,
+        .load_norm2_weight = load_norm2_weight,
+        .load_norm2_bias = load_norm2_bias,
+        .load_deepstack_merger_linear_fc1_weight = load_deepstack_merger_linear_fc1_weight,
+        .load_deepstack_merger_linear_fc1_bias = load_deepstack_merger_linear_fc1_bias,
+        .load_deepstack_merger_linear_fc2_weight = load_deepstack_merger_linear_fc2_weight,
+        .load_deepstack_merger_linear_fc2_bias = load_deepstack_merger_linear_fc2_bias,
+        .load_deepstack_merger_norm_weight = load_deepstack_merger_norm_weight,
+        .load_deepstack_merger_norm_bias = load_deepstack_merger_norm_bias,
+        .load_merger_linear_fc1_weight = load_merger_linear_fc1_weight,
+        .load_merger_linear_fc1_bias = load_merger_linear_fc1_bias,
+        .load_merger_linear_fc2_weight = load_merger_linear_fc2_weight,
+        .load_merger_linear_fc2_bias = load_merger_linear_fc2_bias,
+        .load_merger_norm_weight = load_merger_norm_weight,
+        .load_merger_norm_bias = load_merger_norm_bias,
+    }
+};
+
+__C Qwen3vlWeights *
+createQwen3vlWeights(const Qwen3vlMeta *meta,
+                     infiniDevice_t device,
+                     int ndev,
+                     const int *dev_ids,
+                     bool transpose_weight) {
+
+    printf("=== C++ createQwen3vlWeights ===\n");
+    printf("sizeof(Qwen3vlTextMeta): %zu\n", sizeof(Qwen3vlTextMeta));
+    printf("sizeof(Qwen3vlVisMeta): %zu\n", sizeof(Qwen3vlVisMeta));
+    printf("sizeof(Qwen3vlMeta): %zu\n", sizeof(Qwen3vlMeta));
+    printf("meta->dtype: %d\n", meta->dtype);
+    printf("meta->text_meta.hidden_size: %zu\n", meta->text_meta.hidden_size);
+    printf("meta->text_meta.num_hidden_layers: %zu\n", meta->text_meta.num_hidden_layers);
+    printf("meta->text_meta.vocab_size: %zu\n", meta->text_meta.vocab_size);
+    printf("meta->vis_meta.depth: %zu\n", meta->vis_meta.depth);
+    printf("device: %d, ndev: %d, dev_ids[0]: %d\n", device, ndev, dev_ids[0]);
+    fflush(stdout);
+
+    auto weights = new Qwen3vlWeights(meta, device, ndev, dev_ids, transpose_weight);
+    return weights;
+}
+
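+/// @brief Returns the library-owned static loader table; the caller must not
+/// free the returned pointer.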
+__C Qwen3vlWeightLoader *
+createQwen3vlWeightLoader() {
+    return &weight_loader;
+}
\ No newline at end of file
diff --git a/src/tensor/tensor.cpp b/src/tensor/tensor.cpp
index edf0faeb..37d8712a 100644
--- a/src/tensor/tensor.cpp
+++ b/src/tensor/tensor.cpp
@@ -267,7 +267,7 @@ void print_data_bf16(uint16_t const *data, const std::vector<size_t> &shape,
         std::cout << std::endl;
     } else if (dim < shape.size() - 1) {
         for (size_t i = 0; i < shape[dim]; i++) {
-            print_data(data + i * strides[dim], shape, strides, dim + 1);
+            print_data_bf16(data + i * strides[dim], shape, strides, dim + 1);
         }
     }
 }
diff --git a/src/utils.hpp b/src/utils.hpp
index b0da9fff..17b35628 100644
--- a/src/utils.hpp
+++ b/src/utils.hpp
@@ -7,9 +7,37 @@
 #include
 #include
 
+#ifdef __linux__
+#include <execinfo.h> // backtrace(), backtrace_symbols()
+#include <cstdlib>    // free()
+
+inline void printStackTrace() {
+    void *buffer[100]; // up to 100 return addresses
+    int nptrs = backtrace(buffer, 100);
+    char **strings = backtrace_symbols(buffer, nptrs);
+
+    if (strings == nullptr) {
+        perror("backtrace_symbols");
+        return;
+    }
+
+    fprintf(stderr, "Stack trace:\n");
+    for (int i = 0; i < nptrs; i++) {
+        fprintf(stderr, "%s\n", strings[i]);
+    }
+    free(strings);
+}
+#else
+// Fallback implementation for non-Linux platforms
+inline void printStackTrace() {
+    fprintf(stderr, "Stack trace not available on this platform\n");
+}
+#endif
+
 inline void assertTrue(int expr, const char *msg, const char *file, int line) {
     if (!expr) {
         fprintf(stderr, "\033[31mAssertion failed:\033[0m %s at file %s, line %d\n", msg, file, line);
+        printStackTrace();
         exit(EXIT_FAILURE);
     }
 }
@@ -20,6 +48,7 @@ inline void assertTrue(int expr, const char *msg, const char *file, int line) {
 
 #define PANIC(EXPR) \
     printf("Error at %s:%d - %s\n", __FILE__, __LINE__, #EXPR); \
+    printStackTrace(); \
     exit(EXIT_FAILURE)
 
@@ -29,6 +58,7 @@ inline void assertTrue(int expr, const char *msg, const char *file, int line) {
             std::cerr << "Error Code " << api_result_ << " in `" << #API << "`" \
                       << " from " << __func__ \
                       << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
+            printStackTrace(); \
             exit(EXIT_FAILURE); \
         } \
     } while (0)
@@ -38,21 +68,26 @@ inline float f16_to_f32(uint16_t h) {
     int32_t exponent = (h >> 10) & 0x1F; // Extract the exponent
     uint32_t mantissa = h & 0x3FF;       // Extract the mantissa (fraction part)
+    union {
+        uint32_t int_value;
+        float float_value;
+    } converter;
+
     if (exponent == 31) { // Special case for Inf and NaN
         if (mantissa != 0) {
             // NaN: Set float32 NaN
-            uint32_t f32 = sign | 0x7F800000 | (mantissa << 13);
-            return *(float *)&f32;
+            converter.int_value = sign | 0x7F800000 | (mantissa << 13);
+            return converter.float_value;
         } else {
             // Infinity
-            uint32_t f32 = sign | 0x7F800000;
-            return *(float *)&f32;
+            converter.int_value = sign | 0x7F800000;
+            return converter.float_value;
         }
     } else if (exponent == 0) {
         // Subnormal float16 or zero
         if (mantissa == 0) {
             // Zero (positive or negative)
-            uint32_t f32 = sign; // Just return signed zero
-            return *(float *)&f32;
+            converter.int_value = sign; // Just return signed zero
+            return converter.float_value;
         } else {
             // Subnormal: Convert to normalized float32
             exponent = -14; // Set exponent for subnormal numbers
@@ -61,13 +96,13 @@ inline float f16_to_f32(uint16_t h) {
             exponent--;
         }
         mantissa &= 0x3FF; // Clear the leading 1 bit
-        uint32_t f32 = sign | ((exponent + 127) << 23) | (mantissa << 13);
-        return *(float *)&f32;
+        converter.int_value = sign | ((exponent + 127) << 23) | (mantissa << 13);
+        return converter.float_value;
     } else {
         // Normalized float16
-        uint32_t f32 = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13);
-        return *(float *)&f32;
+        converter.int_value = sign | ((exponent + 127 - 15) << 23) | (mantissa << 13);
+        return converter.float_value;
     }
 }
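
A portability note on the union introduced above: reading a union member other than the one last written is well-defined in C and supported as an extension by GCC, Clang, and MSVC, but strict ISO C++ leaves it undefined. A fully portable alternative, shown here only as a sketch (it is not part of this patch), is memcpy, or std::bit_cast in C++20:

    #include <cstdint>
    #include <cstring>

    // memcpy-based bit reinterpretation: well-defined in every C++ standard;
    // optimizing compilers lower it to a single register move.
    inline float bits_to_float(uint32_t bits) {
        float f;
        std::memcpy(&f, &bits, sizeof f);
        return f;
    }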
diff --git a/xmake.lua b/xmake.lua
index 598ac534..51ded2fe 100644
--- a/xmake.lua
+++ b/xmake.lua
@@ -1,8 +1,15 @@
 local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")
 
+add_rules("mode.debug")
+
 target("infinicore_infer")
     set_kind("shared")
 
+    if is_mode("debug") then
+        add_ldflags("-rdynamic", "-g") -- export symbols so stack traces show function names
+        add_cxxflags("-g", "-O0", "-fno-omit-frame-pointer") -- best possible debug information
+    end
+
     add_includedirs("include", { public = false })
     add_includedirs(INFINI_ROOT.."/include", { public = true })
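
As a usage note, the debug flags above take effect after configuring with "xmake f -m debug" and rebuilding. For orientation, the sketch below shows one way a host program might drive the loader table this patch exposes. It is illustrative only: host_buffer_for is a hypothetical helper standing in for whatever stages tensors into CPU memory, and meta/weights are assumed to come from the createQwen3vlWeights call declared above.

    #include "infinicore_infer.h"

    // Hypothetical helper: returns a CPU staging buffer holding the named
    // tensor for the given layer (e.g. read from a safetensors file).
    void *host_buffer_for(const char *name, size_t layer);

    // Feed every transformer layer's fused QKV projection to the model.
    // Each loader callback fans the copy out to all devices on that weight
    // shard's load stream (see the loader definitions above).
    void load_qkv_weights(const Qwen3vlMeta &meta, Qwen3vlWeights *weights) {
        Qwen3vlWeightLoader *loader = createQwen3vlWeightLoader();
        for (size_t layer = 0; layer < meta.text_meta.num_hidden_layers; ++layer) {
            void *buf = host_buffer_for("attn_qkv_proj", layer); // hypothetical
            loader->lang_loader.load_attn_qkv_proj(weights, buf, layer);
        }
    }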