File tree Expand file tree Collapse file tree 1 file changed +26
-0
lines changed
Expand file tree Collapse file tree 1 file changed +26
-0
lines changed Original file line number Diff line number Diff line change @@ -158,3 +158,29 @@ yaml 配置文件
158158 --adapter_name_or_path saves/Qwen1.5-7B/lora/sft \
159159 --template qwen \
160160 --finetuning_type lora
161+
162+ 也可以使用vllm-ascend进行推理加速:
163+
164+ .. note::
165+    先安装 vllm-ascend,参见 `vllm-ascend 官方安装指南 <https://vllm-ascend.readthedocs.io/en/latest/installation.html>`_
166+
167+ .. code-block:: shell
168+
169+ # use modelscope
170+ export USE_MODELSCOPE_HUB=1
171+
172+ # specify NPU
173+ export ASCEND_RT_VISIBLE_DEVICES=0
174+
175+ # Set `max_split_size_mb` to reduce memory fragmentation and avoid out of memory
176+ export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256
177+
178+ # Since the vllm service is started by pulling up a child process, you need to use the spawn method to create a vllm-serve process
179+ export VLLM_WORKER_MULTIPROC_METHOD=spawn
180+
181+ ## inference -- chat
182+ llamafactory-cli chat --model_name_or_path qwen/Qwen1.5-7B \
183+ --adapter_name_or_path saves/Qwen1.5-7B/lora/sft \
184+ --template qwen \
185+ --finetuning_type lora \
186+ --infer_backend vllm
You can’t perform that action at this time.
0 commit comments