# fp16_mpi.sh - from the Qcompiler/MixQ_Tensorrt_LLM repository (main branch).


# Convert, build and evaluate FP16 TensorRT-LLM engines for every model
# listed in `models`.
#
# Usage: bash fp16_mpi.sh <gpu-ids>
#   <gpu-ids>  value assigned to CUDA_VISIBLE_DEVICES for every stage,
#              e.g. "0,1,2,3".
#
# Stages per model (each one runs only if the previous one succeeded):
#   1. quantize.py   - reads ${model_dir}, writes a float16 checkpoint to
#                      ${output_dir}
#   2. trtllm-build  - reads ${output_dir}, writes engines to ${engine_dir}
#   3. summarize.py  - run under mpirun against ${engine_dir}
#
# Exit status: 0 if every stage of every model succeeded, 1 if any stage
# failed, 2 if <gpu-ids> was not supplied.
#
# Requires bash (arrays are used).
set -x

# An empty CUDA_VISIBLE_DEVICES hides all GPUs, so a missing argument would
# only surface much later as an obscure failure inside the tools. Fail early.
if [[ -z "${1:-}" ]]; then
  printf 'Usage: %s <gpu-ids>   (e.g. 0,1,2,3)\n' "${0##*/}" >&2
  exit 2
fi
gpus=$1

# NOTE(review): CMD is not referenced anywhere below; presumably a prefix for
# launching the stages through Slurm by hand. Kept as-is - confirm before removing.
CMD="srun  -N 1 --pty --gres=gpu:a100:1 -p octave -A public  "
models=("Llama-2-70b")
type='fp16'

# Single source of truth for the rank count: passed to quantize.py as
# --pp_size and to mpirun as -n, so the two cannot drift apart.
pp_size=4

# Collects "<model>: <stage>" entries for every stage that failed.
failed=()

for model in "${models[@]}"; do

  model_dir="/dataset/${model}"
  # NOTE(review): the "1gpu" in these directory names does not match
  # pp_size=4; paths are left unchanged so existing artifacts still resolve.
  output_dir="/octave/checkpoint/checkpoint${type}/tllm_checkpoint_1gpu_fp16${model}"
  engine_dir="/octave/checkpoint/trt_engines${type}/tllm_checkpoint_1gpu_fp16${model}"

  # Stage 1: write the float16 checkpoint.
  CUDA_VISIBLE_DEVICES="${gpus}" python quantize.py \
    --model_dir "${model_dir}" \
    --output_dir "${output_dir}" \
    --dtype float16 \
    --pp_size "${pp_size}" \
    --load_model_on_cpu \
    || { failed+=("${model}: quantize.py"); continue; }

  # Stage 2: build engines from the checkpoint.
  CUDA_VISIBLE_DEVICES="${gpus}" trtllm-build \
    --checkpoint_dir "${output_dir}" \
    --output_dir "${engine_dir}" \
    --gemm_plugin float16 \
    || { failed+=("${model}: trtllm-build"); continue; }

  # Stage 3: run the summarization test with one MPI rank per pipeline stage.
  CUDA_VISIBLE_DEVICES="${gpus}" mpirun -n "${pp_size}" --allow-run-as-root \
    python summarize.py --test_trt_llm \
    --hf_model_dir "${model_dir}" \
    --data_type fp16 \
    --engine_dir "${engine_dir}" \
    || { failed+=("${model}: summarize.py"); continue; }

  # Disabled alternative: long-input generation with run.py.
#     CUDA_VISIBLE_DEVICES=4,5,6,7 mpirun -n 4 --allow-run-as-root \
#     python  run.py \
#     --max_output_len 128 \
#     --max_input_length 10240 \
#     --input_file pg64317_sanitized.txt \
#     --engine_dir ${engine_dir} \
#     --tokenizer_dir  ${model_dir}

done

# Report every failed stage and propagate the failure to the caller.
if (( ${#failed[@]} > 0 )); then
  printf 'FAILED: %s\n' "${failed[@]}" >&2
  exit 1
fi