Binary file added 010P00002405F02D94-1.jpg
4 changes: 4 additions & 0 deletions include/infinicore_infer.h
@@ -4,7 +4,11 @@
#include "infinicore_infer/cache.h"
#include "infinicore_infer/weights_loader.h"


#include "infinicore_infer/models/deepseek.h"
#include "infinicore_infer/models/jiuge.h"
#include "infinicore_infer/models/jiuge_awq.h"
#include "infinicore_infer/models/qwen3vl.h"


#endif /* INFINICORE_INFER_H */
203 changes: 203 additions & 0 deletions include/infinicore_infer/models/qwen3vl.h
@@ -0,0 +1,203 @@
#ifndef QWEN3VL_WEIGHTS_H
#define QWEN3VL_WEIGHTS_H

#include <infiniccl.h>
#include <infiniop.h>
#include <infinirt.h>

#include <stddef.h>
#include <stdint.h>

struct Qwen3vlWeights;

// Function pointer signatures
typedef void (*qwen3vl_load_global_fn)(Qwen3vlWeights *, void *cpu_ptr);
typedef void (*qwen3vl_load_layer_fn)(Qwen3vlWeights *, void *cpu_ptr, size_t layer_id);
// Struct containing all weight loading functions
typedef struct {
    // Global
    qwen3vl_load_global_fn load_input_embd;
    qwen3vl_load_global_fn load_output_norm;
    qwen3vl_load_global_fn load_output_embd;

    // Attention
    qwen3vl_load_layer_fn load_attn_norm;
    qwen3vl_load_layer_fn load_attn_q_norm;
    qwen3vl_load_layer_fn load_attn_k_norm;
    qwen3vl_load_layer_fn load_attn_qkv_proj;
    qwen3vl_load_layer_fn load_attn_o_proj;

    // MLP
    qwen3vl_load_layer_fn load_mlp_norm;
    qwen3vl_load_layer_fn load_mlp_gate_up;
    qwen3vl_load_layer_fn load_mlp_down;

} Qwen3vlLangWeightLoader;

typedef struct {
    // Patch embed
    qwen3vl_load_global_fn load_patch_embed_weight;
    qwen3vl_load_global_fn load_patch_embed_bias;
    qwen3vl_load_global_fn load_pos_embed_weight;

    // Block attention
    qwen3vl_load_layer_fn load_attn_proj_weight;
    qwen3vl_load_layer_fn load_attn_proj_bias;
    qwen3vl_load_layer_fn load_attn_qkv_weight;
    qwen3vl_load_layer_fn load_attn_qkv_bias;

    // Block MLP
    qwen3vl_load_layer_fn load_mlp_linear_fc1_weight;
    qwen3vl_load_layer_fn load_mlp_linear_fc1_bias;
    qwen3vl_load_layer_fn load_mlp_linear_fc2_weight;
    qwen3vl_load_layer_fn load_mlp_linear_fc2_bias;

    // Block norm
    qwen3vl_load_layer_fn load_norm1_weight;
    qwen3vl_load_layer_fn load_norm1_bias;
    qwen3vl_load_layer_fn load_norm2_weight;
    qwen3vl_load_layer_fn load_norm2_bias;

    // Deepstack merger
    qwen3vl_load_layer_fn load_deepstack_merger_linear_fc1_weight;
    qwen3vl_load_layer_fn load_deepstack_merger_linear_fc1_bias;
    qwen3vl_load_layer_fn load_deepstack_merger_linear_fc2_weight;
    qwen3vl_load_layer_fn load_deepstack_merger_linear_fc2_bias;
    qwen3vl_load_layer_fn load_deepstack_merger_norm_weight;
    qwen3vl_load_layer_fn load_deepstack_merger_norm_bias;

    // Merger
    qwen3vl_load_global_fn load_merger_linear_fc1_weight;
    qwen3vl_load_global_fn load_merger_linear_fc1_bias;
    qwen3vl_load_global_fn load_merger_linear_fc2_weight;
    qwen3vl_load_global_fn load_merger_linear_fc2_bias;
    qwen3vl_load_global_fn load_merger_norm_weight;
    qwen3vl_load_global_fn load_merger_norm_bias;

} Qwen3vlVisWeightLoader;

typedef struct {
    Qwen3vlLangWeightLoader lang_loader;
    Qwen3vlVisWeightLoader vis_loader;
} Qwen3vlWeightLoader;

struct Qwen3vlModel;

typedef struct {
    size_t bos_token_id;
    size_t eos_token_id;
    size_t head_dim;
    size_t hidden_size;
    float initializer_range;
    size_t intermediate_size;
    size_t max_tokens;
    size_t num_attention_heads;
    size_t num_hidden_layers;
    size_t num_key_value_heads;
    float rms_norm_eps;
    size_t mrope_section[3];
    size_t rope_theta;
    size_t vocab_size;
} Qwen3vlTextMeta;

typedef struct {
    size_t depth;
    size_t deepstack_visual_indexes[3];
    size_t hidden_size;
    size_t in_channels;
    float initializer_range;
    size_t intermediate_size;
    size_t num_heads;
    size_t num_position_embeddings;
    size_t out_hidden_size;
    size_t patch_size;
    size_t spatial_merge_size;
    size_t temporal_patch_size;
} Qwen3vlVisMeta;

typedef struct {
    infiniDtype_t dtype; // INFINI_DTYPE_BF16

    Qwen3vlTextMeta text_meta;
    Qwen3vlVisMeta vis_meta;

    size_t image_token_id;
    size_t video_token_id;
    size_t vision_end_token_id;
    size_t vision_start_token_id;
} Qwen3vlMeta;

//////////////////// APIs ///////////////////////
/// @brief Create the model from its metadata and loaded weights
__C __export struct Qwen3vlModel *
createQwen3vlModel(const Qwen3vlMeta *,
                   const Qwen3vlWeights *);

/// @brief Create the weight container
/// @param device accelerator type
/// @param ndev number of accelerators
/// @param dev_ids accelerator IDs, array of length ndev
__C Qwen3vlWeights *
createQwen3vlWeights(const Qwen3vlMeta *meta,
                     infiniDevice_t device,
                     int ndev,
                     const int *dev_ids,
                     bool transpose_weight);

__C __export Qwen3vlWeightLoader *
createQwen3vlWeightLoader();

/// @brief Destroy the model
__C __export void destroyQwen3vlModel(struct Qwen3vlModel *);

__C __export struct Qwen3vlCache *
createQwen3vlCache(const struct Qwen3vlModel *);

__C __export void
dropQwen3vlCache(const struct Qwen3vlModel *,
                 struct Qwen3vlCache *);

/// @brief Run one round of batched inference and sample new tokens
/// @param tokens input token buffer
/// @param ntok total number of input tokens
/// @param nreq number of requests
/// @param req_lens number of tokens in each request
/// @param req_pos starting position of each request
/// @param caches KV cache of each request
/// @param temperature sampling temperature (0. means greedy sampling)
/// @param topk sampling top-k (1 means greedy sampling)
/// @param topp sampling top-p
/// @param output output token array, one token per request, length at least nreq
__C __export void
inferBatchQwen3vl(struct Qwen3vlModel *,
                  const uint32_t *tokens, uint32_t ntok,
                  void *pixel_values, uint32_t total_patches,
                  uint32_t *image_grid_thw, uint32_t num_images,
                  void *pixel_values_videos, uint32_t total_patches_videos,
                  uint32_t *video_grid_thw, uint32_t num_videos,
                  uint32_t patch_features,
                  const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
                  struct Qwen3vlCache **caches,
                  const float *temperature, const uint32_t *topk, const float *topp,
                  uint32_t *output);

/// @brief Run one round of batched forward pass and output the logits after the output embedding
/// @param tokens input token buffer
/// @param ntok total number of input tokens
/// @param nreq number of requests
/// @param req_lens number of tokens in each request
/// @param req_pos starting position of each request
/// @param caches KV cache of each request
/// @param logits output buffer for the logits
__C __export void
forwardBatchQwen3vl(struct Qwen3vlModel *,
                    const uint32_t *tokens, uint32_t ntok,
                    void *pixel_values, uint32_t total_patches,
                    uint32_t *image_grid_thw, uint32_t num_images,
                    void *pixel_values_videos, uint32_t total_patches_videos,
                    uint32_t *video_grid_thw, uint32_t num_videos,
                    uint32_t patch_features,
                    const uint32_t *req_lens, uint32_t nreq, const uint32_t *req_pos,
                    struct Qwen3vlCache **caches,
                    void *logits);

#endif // QWEN3VL_WEIGHTS_H
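For orientation, here is a minimal usage sketch of the C API declared above; it is not part of the PR. It assumes a single NVIDIA device (the INFINI_DEVICE_NVIDIA enum name is an assumption), leaves the meta fields and the actual weight uploads as placeholders, and omits error handling; in InfiniLM these functions are normally driven from Python, with the weights streamed in through the loader callbacks.

#include <infinicore_infer.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    Qwen3vlMeta meta = {0};       // placeholder: fill from the model's config.json
    meta.dtype = INFINI_DTYPE_BF16;

    int dev_ids[1] = {0};
    // INFINI_DEVICE_NVIDIA is an assumed infiniDevice_t value.
    Qwen3vlWeights *weights =
        createQwen3vlWeights(&meta, INFINI_DEVICE_NVIDIA, 1, dev_ids, /*transpose_weight=*/true);

    Qwen3vlWeightLoader *loader = createQwen3vlWeightLoader();
    // Placeholder: for each tensor, call the matching loader callback with a host pointer, e.g.
    //   loader->lang_loader.load_input_embd(weights, cpu_ptr);
    //   loader->lang_loader.load_attn_qkv_proj(weights, cpu_ptr, /*layer_id=*/0);

    struct Qwen3vlModel *model = createQwen3vlModel(&meta, weights);
    struct Qwen3vlCache *cache = createQwen3vlCache(model);

    uint32_t tokens[3] = {1, 2, 3};   // placeholder prompt token ids
    uint32_t req_lens[1] = {3};
    uint32_t req_pos[1] = {0};
    struct Qwen3vlCache *caches[1] = {cache};
    float temperature[1] = {1.0f};
    uint32_t topk[1] = {1};
    float topp[1] = {1.0f};
    uint32_t output[1] = {0};

    // Text-only round: no image or video patches are passed.
    inferBatchQwen3vl(model,
                      tokens, 3,
                      /*pixel_values=*/NULL, /*total_patches=*/0,
                      /*image_grid_thw=*/NULL, /*num_images=*/0,
                      /*pixel_values_videos=*/NULL, /*total_patches_videos=*/0,
                      /*video_grid_thw=*/NULL, /*num_videos=*/0,
                      /*patch_features=*/0,
                      req_lens, 1, req_pos,
                      caches,
                      temperature, topk, topp,
                      output);
    printf("sampled token: %u\n", output[0]);

    dropQwen3vlCache(model, cache);
    destroyQwen3vlModel(model);
    return 0;
}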
27 changes: 27 additions & 0 deletions qwen3vl_test.sh
@@ -0,0 +1,27 @@
#!/bin/bash
#SBATCH --job-name=test_job          # Job name
#SBATCH --output=output_%j.log       # Standard output file (%j is replaced with the job ID)
#SBATCH --error=error_%j.log         # Standard error file
#SBATCH --partition=nvidia           # Partition name (the system's default partition is nvidia)
#SBATCH --nodes=1                    # Number of nodes required
#SBATCH --ntasks=1                   # Total number of tasks (usually nodes x tasks per node)
#SBATCH --cpus-per-task=8            # CPU cores per task
#SBATCH --gres=gpu:nvidia:4          # Request 4 GPUs (nvidia is the Gres type)
#SBATCH --mem=32G                    # Requested memory

# Commands that need compute resources go below.
# Launching the main program with srun is recommended so resources are bound automatically.
source /data/apps/env.sh
source /data/apps/miniforge3/etc/profile.d/conda.sh
conda activate py313
export INFINI_ROOT=$HOME/.infini
export LD_LIBRARY_PATH=$INFINI_ROOT/lib:$LD_LIBRARY_PATH
export PATH="/data/apps/xmake/bin:/usr/local/cuda/bin:$PATH"

export PYTHONPATH=$HOME/InfiniLM/scripts:$PYTHONPATH

cd $HOME/InfiniLM

#srun python scripts/qwen3vl_test.py
#srun python scripts/qwen3vl.py --nvidia /data/shared/models/Qwen3-VL-2B-Instruct
srun python scripts/launch_server.py --model-path /data/shared/models/Qwen3-VL-2B-Instruct --dev nvidia --ndev 4
5 changes: 3 additions & 2 deletions scripts/infer_task.py
@@ -1,8 +1,9 @@
 class InferTask:
-    def __init__(self, id, tokens, max_tokens, temperature, topk, topp, end_tokens):
+    def __init__(self, id, inputs, max_tokens, temperature, topk, topp, end_tokens):
         self.id = id
         self.finish_reason = None
-        self.tokens = tokens
+        self.inputs = inputs
+        self.tokens = inputs['input_ids'][0].tolist()
         self.max_tokens = max_tokens
         self.temperature = temperature
         self.topk = topk
45 changes: 19 additions & 26 deletions scripts/launch_server.py
@@ -1,5 +1,4 @@
-from jiuge import JiugeForCauslLM
-from jiuge_awq import JiugeAWQForCausalLM
+from qwen3vl import Qwen3vlForCauslLM
 from libinfinicore_infer import DeviceType
 from infer_task import InferTask
 from kvcache_pool import KVCachePool
@@ -60,14 +59,9 @@ def parse_args():
         "--max-tokens",
         type=int,
         required=False,
-        default=None,
+        default=200,
         help="Max token sequence length that model will handle (follows model config if not provided)",
     )
-    parser.add_argument(
-        "--awq",
-        action="store_true",
-        help="Whether to use AWQ quantized model (default: False)",
-    )
     return parser.parse_args()


@@ -76,7 +70,6 @@ def parse_args():
 model_path = args.model_path
 ndev = args.ndev
 max_tokens = args.max_tokens
-USE_AWQ = args.awq
 MAX_BATCH = args.max_batch
 print(
     f"Using MAX_BATCH={MAX_BATCH}. Try reduce this value if out of memory error occurs."
@@ -93,7 +86,7 @@ def chunk_json(id_, content=None, role=None, finish_reason=None):
         "id": id_,
         "object": "chat.completion.chunk",
         "created": int(time.time()),
-        "model": "jiuge",
+        "model": "qwen3vl",
         "system_fingerprint": None,
         "choices": [
             {
@@ -109,8 +102,8 @@ def chunk_json(id_, content=None, role=None, finish_reason=None):
 
 # A wrapper for InferTask that supports async output queue
 class AsyncInferTask(InferTask):
-    def __init__(self, id, tokens, max_tokens, temperature, topk, topp, end_tokens):
-        super().__init__(id, tokens, max_tokens, temperature, topk, topp, end_tokens)
+    def __init__(self, id, inputs, max_tokens, temperature, topk, topp, end_tokens):
+        super().__init__(id, inputs, max_tokens, temperature, topk, topp, end_tokens)
         self.output_queue = janus.Queue()
         print(f"[INFO] Create InferTask {self.id}")
 
@@ -122,14 +115,9 @@ def output(self, out_token):
 @contextlib.asynccontextmanager
 async def lifespan(app: FastAPI):
     # Startup
-    if USE_AWQ:
-        app.state.model = JiugeAWQForCausalLM(
-            model_path, device_type, ndev, max_tokens=max_tokens
-        )
-    else:
-        app.state.model = JiugeForCauslLM(
-            model_path, device_type, ndev, max_tokens=max_tokens
-        )
+    app.state.model = Qwen3vlForCauslLM(
+        model_path, device_type, ndev, max_tokens=max_tokens
+    )
     app.state.kv_cache_pool = KVCachePool(app.state.model, MAX_BATCH)
     app.state.request_queue = janus.Queue()
     worker_thread = threading.Thread(target=worker_loop, args=(app,), daemon=True)
@@ -169,6 +157,8 @@ def worker_loop(app):
                 batch.append(req)
             except queue.Empty:
                 break
+
+        print(f"inferring {len(batch)} tasks")
         output_tokens = app.state.model.batch_infer_one_round(batch)
         for task, token in zip(batch, output_tokens):
             task.output(token)
@@ -181,15 +171,18 @@ def worker_loop(app):
 
 def build_task(id_, request_data, request: Request):
     messages = request_data.get("messages", [])
-    input_content = request.app.state.model.tokenizer.apply_chat_template(
-        conversation=messages,
+    inputs = request.app.state.model.processor.apply_chat_template(
+        messages,
+        tokenize=True,
         add_generation_prompt=True,
-        tokenize=False,
+        return_dict=True,
+        return_tensors="pt",
     )
-    tokens = request.app.state.model.tokenizer.encode(input_content)
+    inputs.pop("token_type_ids", None)
+
     return AsyncInferTask(
         id_,
-        tokens,
+        inputs,
         request_data.get("max_tokens", request.app.state.model.max_context_len()),
         request_data.get("temperature", 1.0),
         request_data.get("top_k", 1),
@@ -298,7 +291,7 @@ async def chat_completions(request: Request):
 
 
 if __name__ == "__main__":
-    uvicorn.run(App, host="0.0.0.0", port=8000)
+    uvicorn.run(App, host="0.0.0.0", port=8008)
 
 """
 curl -N -H "Content-Type: application/json" \