diff --git a/.github/scripts/action_tools.py b/.github/scripts/action_tools.py index b21ded4bc4..e296c4d760 100644 --- a/.github/scripts/action_tools.py +++ b/.github/scripts/action_tools.py @@ -7,14 +7,13 @@ import subprocess import time from collections import OrderedDict -from typing import List import fire import pandas as pd from mmengine.config import Config -def run_cmd(cmd_lines: List[str], log_path: str, cwd: str = None): +def run_cmd(cmd_lines: list[str], log_path: str, cwd: str = None): """ Args: cmd_lines: (list[str]): A command in multiple line style. @@ -43,7 +42,7 @@ def run_cmd(cmd_lines: List[str], log_path: str, cwd: str = None): if return_code != 0: logging.error(f'Got shell abnormal return code={return_code}') - with open(log_path, 'r') as f: + with open(log_path) as f: content = f.read() logging.error(f'Log error message\n{content}') return return_code @@ -61,7 +60,7 @@ def add_summary(csv_path: str): Args: csv_path (str): Input csv file. """ - with open(csv_path, 'r') as fr: + with open(csv_path) as fr: lines = fr.readlines() header = lines[0].strip().split(',') n_col = len(header) @@ -75,8 +74,8 @@ def add_summary(csv_path: str): _append_summary('\n') -def evaluate(models: List[str], - datasets: List[str], +def evaluate(models: list[str], + datasets: list[str], workspace: str, evaluate_type: str, max_num_workers: int = 8, @@ -146,12 +145,12 @@ def evaluate(models: List[str], # print csv_txt to screen csv_txt = csv_file.replace('.csv', '.txt') if os.path.exists(csv_txt): - with open(csv_txt, 'r') as f: + with open(csv_txt) as f: print(f.read()) # parse evaluation results from csv file model_results = OrderedDict() - with open(csv_file, 'r') as f: + with open(csv_file) as f: lines = f.readlines() for line in lines[1:]: row = line.strip().split(',') @@ -160,7 +159,7 @@ def evaluate(models: List[str], model_results[row[0]] = row[-1] crows_pairs_json = glob.glob(os.path.join(work_dir, '*/results/*/crows_pairs.json'), recursive=True) if len(crows_pairs_json) == 1: - with open(crows_pairs_json[0], 'r') as f: + with open(crows_pairs_json[0]) as f: acc = json.load(f)['accuracy'] acc = f'{float(acc):.2f}' # noqa E231 model_results['crows_pairs'] = acc @@ -238,9 +237,9 @@ def generate_benchmark_report(report_path: str): grouped_df = merged_df.groupby(merged_df.columns[0]) if 'generation' not in backend_subfolder: - average_values = grouped_df.pipe((lambda group: { + average_values = grouped_df.pipe(lambda group: { 'mean': group.mean(numeric_only=True).round(decimals=3) - }))['mean'] + })['mean'] average_values.to_csv(average_csv_path, index=True) avg_df = pd.read_csv(average_csv_path) merged_df = pd.concat([merged_df, avg_df], ignore_index=True) @@ -253,7 +252,7 @@ def generate_benchmark_report(report_path: str): def generate_csv_from_profile_result(file_path: str, out_path: str): - with open(file_path, 'r') as f: + with open(file_path) as f: data = f.readlines() data = [json.loads(line) for line in data] diff --git a/.github/scripts/doc_link_checker.py b/.github/scripts/doc_link_checker.py index 8858b414dc..2b20d00f07 100644 --- a/.github/scripts/doc_link_checker.py +++ b/.github/scripts/doc_link_checker.py @@ -17,7 +17,7 @@ def make_parser(): def analyze_doc(home, path): - print('analyze {}'.format(path)) + print(f'analyze {path}') problem_list = [] code_block = 0 with open(path) as f: diff --git a/.github/scripts/eval_base_config.py b/.github/scripts/eval_base_config.py index 25e374639d..9b966f3b7c 100644 --- a/.github/scripts/eval_base_config.py +++ b/.github/scripts/eval_base_config.py @@ -11,39 +11,51 @@ from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import cmmlu_datasets # noqa: F401, E501 from opencompass.configs.datasets.crowspairs.crowspairs_ppl import crowspairs_datasets # noqa: F401, E501 from opencompass.configs.datasets.drop.drop_gen_a2697c import drop_datasets # noqa: F401, E501 + # Corebench v1.7 - from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \ - GaokaoBench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import ( + GaokaoBench_datasets, # noqa: F401, E501 + ) from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b5a83 import gpqa_datasets # noqa: F401, E501 from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import gsm8k_datasets # noqa: F401, E501 - from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \ - hellaswag_datasets # noqa: F401, E501 - from opencompass.configs.datasets.humaneval.internal_humaneval_gen_ce6b06 import \ - humaneval_datasets as humaneval_v2_datasets # noqa: F401, E501 - from opencompass.configs.datasets.humaneval.internal_humaneval_gen_d2537e import \ - humaneval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import ( + hellaswag_datasets, # noqa: F401, E501 + ) + from opencompass.configs.datasets.humaneval.internal_humaneval_gen_ce6b06 import ( + humaneval_datasets as humaneval_v2_datasets, # noqa: F401, E501 + ) + from opencompass.configs.datasets.humaneval.internal_humaneval_gen_d2537e import ( + humaneval_datasets, # noqa: F401, E501 + ) from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import math_datasets # noqa: F401, E501 - from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \ - mathbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import ( + mathbench_datasets, # noqa: F401, E501 + ) from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import sanitized_mbpp_datasets # noqa: F401, E501 from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets # noqa: F401, E501 from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import mmlu_pro_datasets # noqa: F401, E501 from opencompass.configs.datasets.nq.nq_open_1shot_gen_20a989 import nq_datasets # noqa: F401, E501 from opencompass.configs.datasets.race.race_few_shot_ppl import race_datasets # noqa: F401, E501 - from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_few_shot_ppl import \ - BoolQ_datasets # noqa: F401, E501 + from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_few_shot_ppl import ( + BoolQ_datasets, # noqa: F401, E501 + ) from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets # noqa: F401, E501 - from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import \ - triviaqa_datasets # noqa: F401, E501 - from opencompass.configs.datasets.wikibench.wikibench_few_shot_ppl_c23d79 import \ - wikibench_datasets # noqa: F401, E501 - from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ - winogrande_datasets # noqa: F401, E501 + from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import ( + triviaqa_datasets, # noqa: F401, E501 + ) + from opencompass.configs.datasets.wikibench.wikibench_few_shot_ppl_c23d79 import ( + wikibench_datasets, # noqa: F401, E501 + ) + from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import ( + winogrande_datasets, # noqa: F401, E501 + ) + # Summary Groups from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups # noqa: F401, E501 from opencompass.configs.summarizers.groups.GaokaoBench import GaokaoBench_summary_groups # noqa: F401, E501 - from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \ - mathbench_2024_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mathbench_v1_2024 import ( + mathbench_2024_summary_groups, # noqa: F401, E501 + ) from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups # noqa: F401, E501 from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups # noqa: F401, E501 diff --git a/.github/scripts/eval_chat_config.py b/.github/scripts/eval_chat_config.py index cbf5c51766..26caa0b103 100644 --- a/.github/scripts/eval_chat_config.py +++ b/.github/scripts/eval_chat_config.py @@ -10,88 +10,120 @@ from opencompass.configs.datasets.ceval.ceval_gen_2daf24 import ceval_datasets # noqa: F401, E501 from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets # noqa: F401, E501 from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets # noqa: F401, E501 - from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \ - GaokaoBench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import ( + GaokaoBench_datasets, # noqa: F401, E501 + ) from opencompass.configs.datasets.gpqa.gpqa_gen_4baadb import gpqa_datasets # noqa: F401, E501 from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets # noqa: F401, E501 - from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \ - hellaswag_datasets # noqa: F401, E501 + from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import ( + hellaswag_datasets, # noqa: F401, E501 + ) from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets # noqa: F401, E501 from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets # noqa: F401, E501 from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets # noqa: F401, E501 from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_a0fc46 import sanitized_mbpp_datasets # noqa: F401, E501 from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets # noqa: F401, E501 - from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \ - mmlu_pro_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import ( + mmlu_pro_datasets, # noqa: F401, E501 + ) from opencompass.configs.datasets.nq.nq_open_1shot_gen_01cf41 import nq_datasets # noqa: F401, E501 from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets # noqa: F401, E501 from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets # noqa: F401, E501 - from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_eaf81e import \ - triviaqa_datasets # noqa: F401, E501 - from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import \ - winogrande_datasets # noqa: F401, E501 + from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_eaf81e import ( + triviaqa_datasets, # noqa: F401, E501 + ) + from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import ( + winogrande_datasets, # noqa: F401, E501 + ) + # read models - from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import \ - models as hf_baichuan2_chat_7b # noqa: F401, E501 + from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import ( + models as hf_baichuan2_chat_7b, # noqa: F401, E501 + ) from opencompass.configs.models.gemma.hf_gemma2_9b_it import models as hf_gemma2_9b_it # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \ - models as hf_internlm2_5_7b_chat # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \ - models as hf_internlm2_5_20b_chat # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \ - models as hf_internlm2_chat_7b # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import \ - models as hf_internlm2_chat_20b # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \ - models as lmdeploy_internlm2_5_7b_chat # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \ - models as lmdeploy_internlm2_5_20b_chat # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import \ - models as lmdeploy_internlm2_chat_7b # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_20b import \ - models as lmdeploy_internlm2_chat_20b # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \ - models as lmdeploy_internlm3_8b_instruct # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.lmdeploy_internlm_chat_7b import \ - models as lmdeploy_internlm_chat_7b # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import ( + models as hf_internlm2_5_7b_chat, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import ( + models as hf_internlm2_5_20b_chat, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import ( + models as hf_internlm2_chat_7b, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import ( + models as hf_internlm2_chat_20b, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import ( + models as lmdeploy_internlm2_5_7b_chat, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import ( + models as lmdeploy_internlm2_5_20b_chat, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import ( + models as lmdeploy_internlm2_chat_7b, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_20b import ( + models as lmdeploy_internlm2_chat_20b, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import ( + models as lmdeploy_internlm3_8b_instruct, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_internlm.lmdeploy_internlm_chat_7b import ( + models as lmdeploy_internlm_chat_7b, # noqa: F401, E501 + ) from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import models as hf_llama2_chat_7b # noqa: F401, E501 - from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \ - models as hf_llama3_1_8b_instruct # noqa: F401, E501 - from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \ - models as hf_llama_3_8b_instruct # noqa: F401, E501 - from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import \ - models as lmdeploy_llama2_7b_chat # noqa: F401, E501 - from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \ - models as lmdeploy_llama3_1_8b_instruct # noqa: F401, E501 - from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \ - models as lmdeploy_llama3_8b_instruct # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_1 import \ - models as hf_mistral_chat_7b # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \ - models as hf_mixtral_chat_8x7b # noqa: F401, E501 - from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \ - models as lmdeploy_qwen2_5_7b_instruct # noqa: F401, E501 - from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import \ - models as lmdeploy_qwen2_5_32b_instruct # noqa: F401, E501 + from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import ( + models as hf_llama3_1_8b_instruct, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import ( + models as hf_llama_3_8b_instruct, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import ( + models as lmdeploy_llama2_7b_chat, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import ( + models as lmdeploy_llama3_1_8b_instruct, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import ( + models as lmdeploy_llama3_8b_instruct, # noqa: F401, E501 + ) + from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_1 import ( + models as hf_mistral_chat_7b, # noqa: F401, E501 + ) + from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import ( + models as hf_mixtral_chat_8x7b, # noqa: F401, E501 + ) from opencompass.configs.models.qwen.hf_qwen1_5_7b_chat import models as hf_qwen1_5_chat_7b # noqa: F401, E501 - from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b_chat import \ - models as hf_qwen1_5_moe_a2_7b_chat # noqa: F401, E501 + from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b_chat import ( + models as hf_qwen1_5_moe_a2_7b_chat, # noqa: F401, E501 + ) from opencompass.configs.models.qwen.hf_qwen2_7b_instruct import models as hf_qwen2_7b_instruct # noqa: F401, E501 from opencompass.configs.models.qwen.hf_qwen_7b_chat import models as hf_qwen_chat_7b # noqa: F401, E501 - from opencompass.configs.models.qwen.lmdeploy_qwen1_5_7b_chat import \ - models as lmdeploy_qwen1_5_7b_chat # noqa: F401, E501 - from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \ - models as lmdeploy_qwen2_7b_instruct # noqa: F401, E501 - from opencompass.configs.models.qwen.lmdeploy_qwen_7b_chat import \ - models as lmdeploy_qwen_7b_chat # noqa: F401, E501 + from opencompass.configs.models.qwen.lmdeploy_qwen1_5_7b_chat import ( + models as lmdeploy_qwen1_5_7b_chat, # noqa: F401, E501 + ) + from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import ( + models as lmdeploy_qwen2_7b_instruct, # noqa: F401, E501 + ) + from opencompass.configs.models.qwen.lmdeploy_qwen_7b_chat import ( + models as lmdeploy_qwen_7b_chat, # noqa: F401, E501 + ) + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import ( + models as lmdeploy_qwen2_5_7b_instruct, # noqa: F401, E501 + ) + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import ( + models as lmdeploy_qwen2_5_32b_instruct, # noqa: F401, E501 + ) + # Summary Groups from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups # noqa: F401, E501 from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups # noqa: F401, E501 from opencompass.configs.summarizers.groups.ds1000 import ds1000_summary_groups # noqa: F401, E501 from opencompass.configs.summarizers.groups.GaokaoBench import GaokaoBench_summary_groups # noqa: F401, E501 from opencompass.configs.summarizers.groups.humanevalx import humanevalx_summary_groups # noqa: F401, E501 - from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \ - mathbench_2024_summary_groups # noqa: F401, E501 + from opencompass.configs.summarizers.groups.mathbench_v1_2024 import ( + mathbench_2024_summary_groups, # noqa: F401, E501 + ) from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups # noqa: F401, E501 from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups # noqa: F401, E501 from opencompass.configs.summarizers.groups.scicode import scicode_summary_groups # noqa: F401, E501 diff --git a/.github/scripts/eval_regression_base_models.py b/.github/scripts/eval_regression_base_models.py index 235ac812a0..26c1cccccd 100644 --- a/.github/scripts/eval_regression_base_models.py +++ b/.github/scripts/eval_regression_base_models.py @@ -7,41 +7,57 @@ from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets # noqa: F401, E501 from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import gsm8k_datasets # noqa: F401, E501 from opencompass.configs.datasets.race.race_ppl import race_datasets # noqa: F401, E501 - from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ - winogrande_datasets # noqa: F401, E501 + from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import ( + winogrande_datasets, # noqa: F401, E501 + ) + # read hf models - chat models from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import models as lmdeploy_glm4_9b_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \ - models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \ - models as lmdeploy_deepseek_67b_base_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import ( + models as lmdeploy_deepseek_7b_base_model, # noqa: F401, E501 + ) + from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import ( + models as lmdeploy_deepseek_67b_base_model, # noqa: F401, E501 + ) from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2 import lmdeploy_deepseek_v2_model # noqa: F401, E501 from opencompass.configs.models.gemma.lmdeploy_gemma_9b import models as pytorch_gemma_9b_model # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \ - models as lmdeploy_internlm2_1_8b_model # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \ - models as lmdeploy_internlm2_5_7b_model # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_20b import \ - models as lmdeploy_internlm2_20b_model # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \ - models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501 - from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \ - models as lmdeploy_llama3_1_8b_model # noqa: F401, E501 - from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \ - models as lmdeploy_llama3_8b_model # noqa: F401, E501 - from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b import \ - models as lmdeploy_llama3_70b_model # noqa: F401, E501 - from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \ - models as lmdeploy_qwen2_5_1_5b_model # noqa: F401, E501 - from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import \ - models as lmdeploy_qwen2_5_7b_model # noqa: F401, E501 - from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b import \ - models as lmdeploy_qwen2_5_32b_model # noqa: F401, E501 - from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b import \ - models as lmdeploy_qwen2_5_72b_model # noqa: F401, E501 - from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b import \ - models as lmdeploy_qwen2_1_5b_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import ( + models as lmdeploy_internlm2_1_8b_model, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import ( + models as lmdeploy_internlm2_5_7b_model, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_20b import ( + models as lmdeploy_internlm2_20b_model, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import ( + models as lmdeploy_internlm2_base_7b_model, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import ( + models as lmdeploy_llama3_1_8b_model, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import ( + models as lmdeploy_llama3_8b_model, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b import ( + models as lmdeploy_llama3_70b_model, # noqa: F401, E501 + ) + from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b import ( + models as lmdeploy_qwen2_1_5b_model, # noqa: F401, E501 + ) from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import models as lmdeploy_qwen2_7b_model # noqa: F401, E501 + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import ( + models as lmdeploy_qwen2_5_1_5b_model, # noqa: F401, E501 + ) + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import ( + models as lmdeploy_qwen2_5_7b_model, # noqa: F401, E501 + ) + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b import ( + models as lmdeploy_qwen2_5_32b_model, # noqa: F401, E501 + ) + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b import ( + models as lmdeploy_qwen2_5_72b_model, # noqa: F401, E501 + ) from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b import models as lmdeploy_yi_1_5_9b_model # noqa: F401, E501 from .volc import infer as volc_infer # noqa: F401, E501 diff --git a/.github/scripts/eval_regression_chat_models.py b/.github/scripts/eval_regression_chat_models.py index de7edb8e2f..9495ca6b4f 100644 --- a/.github/scripts/eval_regression_chat_models.py +++ b/.github/scripts/eval_regression_chat_models.py @@ -7,71 +7,104 @@ from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets # noqa: F401, E501 from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import ifeval_datasets # noqa: F401, E501 from opencompass.configs.datasets.math.math_0shot_gen_11c4b5 import math_datasets # noqa: F401, E501 + # read hf models - chat models - from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \ - models as lmdeploy_glm4_9b_chat_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_32b import \ - models as lmdeploy_deepseek_r1_distill_qwen_32b_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \ - models as lmdeploy_deepseek_v2_5_1210_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_lite import \ - models as lmdeploy_deepseek_v2_lite_model # noqa: F401, E501 - from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \ - models as pytorch_gemma_9b_it_model # noqa: F401, E501 - from opencompass.configs.models.gemma.lmdeploy_gemma_27b_it import \ - models as pytorch_gemma_27b_it_model # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \ - models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \ - models as lmdeploy_internlm2_5_20b_chat_model # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \ - models as lmdeploy_internlm2_chat_1_8b_model # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \ - models as lmdeploy_internlm2_chat_1_8b_sft_model # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import \ - models as lmdeploy_internlm2_chat_7b_model # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \ - models as lmdeploy_internlm2_chat_7b_sft_model # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \ - models as lmdeploy_internlm3_8b_instruct_model # noqa: F401, E501 - from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import \ - models as lmdeploy_llama2_7b_chat_model # noqa: F401, E501 - from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \ - models as lmdeploy_llama3_1_8b_instruct_model # noqa: F401, E501 - from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \ - models as lmdeploy_llama3_2_3b_instruct_model # noqa: F401, E501 - from opencompass.configs.models.hf_llama.lmdeploy_llama3_3_70b_instruct import \ - models as lmdeploy_llama3_3_70b_instruct_model # noqa: F401, E501 - from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \ - models as lmdeploy_llama3_8b_instruct_model # noqa: F401, E501 - from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \ - models as lmdeploy_mistral_large_instruct_2411_model # noqa: F401, E501 - from opencompass.configs.models.mistral.lmdeploy_mistral_nemo_instruct_2407 import \ - models as lmdeploy_mistral_nemo_instruct_2407_model # noqa: F401, E501 - from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \ - models as lmdeploy_mistral_small_instruct_2409_model # noqa: F401, E501 - from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \ - models as lmdeploy_nemotron_70b_instruct_hf_model # noqa: F401, E501 - from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_0_5b_instruct import \ - models as lmdeploy_qwen2_5_0_5b_instruct_model # noqa: F401, E501 - from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_3b_instruct import \ - models as lmdeploy_qwen2_5_3b_instruct_model # noqa: F401, E501 - from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \ - models as lmdeploy_qwen2_5_14b_instruct_model # noqa: F401, E501 - from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import \ - models as lmdeploy_qwen2_5_32b_instruct_model # noqa: F401, E501 - from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import \ - models as lmdeploy_qwen2_5_72b_instruct_model # noqa: F401, E501 - from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \ - models as lmdeploy_qwen2_1_5b_instruct_model # noqa: F401, E501 - from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \ - models as lmdeploy_qwen2_7b_instruct_model # noqa: F401, E501 - from opencompass.configs.models.yi.lmdeploy_yi_1_5_6b_chat import \ - models as lmdeploy_yi_1_5_6b_chat_model # noqa: F401, E501 - from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b_chat import \ - models as lmdeploy_yi_1_5_9b_chat_model # noqa: F401, E501 - from opencompass.configs.models.yi.lmdeploy_yi_1_5_34b_chat import \ - models as lmdeploy_yi_1_5_34b_chat_model # noqa: F401, E501 + from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import ( + models as lmdeploy_glm4_9b_chat_model, # noqa: F401, E501 + ) + from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_32b import ( + models as lmdeploy_deepseek_r1_distill_qwen_32b_model, # noqa: F401, E501 + ) + from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import ( + models as lmdeploy_deepseek_v2_5_1210_model, # noqa: F401, E501 + ) + from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_lite import ( + models as lmdeploy_deepseek_v2_lite_model, # noqa: F401, E501 + ) + from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import ( + models as pytorch_gemma_9b_it_model, # noqa: F401, E501 + ) + from opencompass.configs.models.gemma.lmdeploy_gemma_27b_it import ( + models as pytorch_gemma_27b_it_model, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import ( + models as lmdeploy_internlm2_5_7b_chat_model, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import ( + models as lmdeploy_internlm2_5_20b_chat_model, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import ( + models as lmdeploy_internlm2_chat_1_8b_model, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import ( + models as lmdeploy_internlm2_chat_1_8b_sft_model, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import ( + models as lmdeploy_internlm2_chat_7b_model, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import ( + models as lmdeploy_internlm2_chat_7b_sft_model, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import ( + models as lmdeploy_internlm3_8b_instruct_model, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import ( + models as lmdeploy_llama2_7b_chat_model, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import ( + models as lmdeploy_llama3_1_8b_instruct_model, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import ( + models as lmdeploy_llama3_2_3b_instruct_model, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_llama.lmdeploy_llama3_3_70b_instruct import ( + models as lmdeploy_llama3_3_70b_instruct_model, # noqa: F401, E501 + ) + from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import ( + models as lmdeploy_llama3_8b_instruct_model, # noqa: F401, E501 + ) + from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import ( + models as lmdeploy_mistral_large_instruct_2411_model, # noqa: F401, E501 + ) + from opencompass.configs.models.mistral.lmdeploy_mistral_nemo_instruct_2407 import ( + models as lmdeploy_mistral_nemo_instruct_2407_model, # noqa: F401, E501 + ) + from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import ( + models as lmdeploy_mistral_small_instruct_2409_model, # noqa: F401, E501 + ) + from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import ( + models as lmdeploy_nemotron_70b_instruct_hf_model, # noqa: F401, E501 + ) + from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import ( + models as lmdeploy_qwen2_1_5b_instruct_model, # noqa: F401, E501 + ) + from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import ( + models as lmdeploy_qwen2_7b_instruct_model, # noqa: F401, E501 + ) + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_0_5b_instruct import ( + models as lmdeploy_qwen2_5_0_5b_instruct_model, # noqa: F401, E501 + ) + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_3b_instruct import ( + models as lmdeploy_qwen2_5_3b_instruct_model, # noqa: F401, E501 + ) + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import ( + models as lmdeploy_qwen2_5_14b_instruct_model, # noqa: F401, E501 + ) + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import ( + models as lmdeploy_qwen2_5_32b_instruct_model, # noqa: F401, E501 + ) + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import ( + models as lmdeploy_qwen2_5_72b_instruct_model, # noqa: F401, E501 + ) + from opencompass.configs.models.yi.lmdeploy_yi_1_5_6b_chat import ( + models as lmdeploy_yi_1_5_6b_chat_model, # noqa: F401, E501 + ) + from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b_chat import ( + models as lmdeploy_yi_1_5_9b_chat_model, # noqa: F401, E501 + ) + from opencompass.configs.models.yi.lmdeploy_yi_1_5_34b_chat import ( + models as lmdeploy_yi_1_5_34b_chat_model, # noqa: F401, E501 + ) from .volc import infer as volc_infer # noqa: F401, E501 diff --git a/.github/scripts/eval_stable_object_config.py b/.github/scripts/eval_stable_object_config.py index be20037806..328f25d75f 100644 --- a/.github/scripts/eval_stable_object_config.py +++ b/.github/scripts/eval_stable_object_config.py @@ -5,35 +5,43 @@ # choose a list of datasets from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import ARC_c_datasets # noqa: F401, E501 from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets # noqa: F401, E501 - from opencompass.configs.datasets.CHARM.charm_reason_cot_only_gen_f7b7d3 import \ - charm_reason_datasets # noqa: F401, E501 + from opencompass.configs.datasets.CHARM.charm_reason_cot_only_gen_f7b7d3 import ( + charm_reason_datasets, # noqa: F401, E501 + ) from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import cmmlu_datasets # noqa: F401, E501 from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import drop_datasets # noqa: F401, E501 from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import ds1000_datasets # noqa: F401, E501 from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets # noqa: F401, E501 from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import gsm8k_datasets # noqa: F401, E501 - from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \ - hellaswag_datasets # noqa: F401, E501 - from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import \ - humaneval_datasets # noqa: F401, E501 + from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import ( + hellaswag_datasets, # noqa: F401, E501 + ) + from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import ( + humaneval_datasets, # noqa: F401, E501 + ) from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import humanevalx_datasets # noqa: F401, E501 from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets # noqa: F401, E501 from opencompass.configs.datasets.LCBench.lcbench_gen_5ff288 import LCBench_datasets # noqa: F401, E501 from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets # noqa: F401, E501 from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import mathbench_datasets # noqa: F401, E501 - from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \ - sanitized_mbpp_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import ( + sanitized_mbpp_datasets, # noqa: F401, E501 + ) from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import mmlu_datasets # noqa: F401, E501 - from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \ - mmlu_pro_datasets # noqa: F401, E501 + from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import ( + mmlu_pro_datasets, # noqa: F401, E501 + ) from opencompass.configs.datasets.race.race_cot_gen_d95929 import race_datasets # noqa: F401, E501 from opencompass.configs.datasets.scicode.scicode_gen_085b98 import SciCode_datasets # noqa: F401, E501 - from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_cot_gen_1d56df import \ - BoolQ_datasets # noqa: F401, E501 - from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import \ - teval_datasets as teval_en_datasets # noqa: F401, E501 - from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import \ - teval_datasets as teval_zh_datasets # noqa: F401, E501 + from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_cot_gen_1d56df import ( + BoolQ_datasets, # noqa: F401, E501 + ) + from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import ( + teval_datasets as teval_en_datasets, # noqa: F401, E501 + ) + from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import ( + teval_datasets as teval_zh_datasets, # noqa: F401, E501 + ) from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets # noqa: F401, E501 from opencompass.configs.datasets.wikibench.wikibench_gen_0978ad import wikibench_datasets # noqa: F401, E501 diff --git a/.github/scripts/eval_stable_subject_config.py b/.github/scripts/eval_stable_subject_config.py index c868c7b1e3..e829815221 100644 --- a/.github/scripts/eval_stable_subject_config.py +++ b/.github/scripts/eval_stable_subject_config.py @@ -6,19 +6,25 @@ with read_base(): # choose a list of datasets - from opencompass.configs.datasets.subjective.alignbench.alignbench_judgeby_critiquellm import \ - alignbench_datasets # noqa: F401, E501 - from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import \ - alpacav2_datasets # noqa: F401, E501 - from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \ - arenahard_datasets # noqa: F401, E501 - from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import \ - compassarena_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.alignbench.alignbench_judgeby_critiquellm import ( + alignbench_datasets, # noqa: F401, E501 + ) + from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import ( + alpacav2_datasets, # noqa: F401, E501 + ) + from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import ( + arenahard_datasets, # noqa: F401, E501 + ) + from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import ( + compassarena_datasets, # noqa: F401, E501 + ) from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import fofo_datasets # noqa: F401, E501 - from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \ - mtbench101_datasets # noqa: F401, E501 - from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \ - wildbench_datasets # noqa: F401, E501 + from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import ( + mtbench101_datasets, # noqa: F401, E501 + ) + from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import ( + wildbench_datasets, # noqa: F401, E501 + ) datasets = sum((v for k, v in locals().items() if k.endswith('_datasets') and 'wildbench' not in k), []) datasets += wildbench_datasets diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d23faca57a..0d4cf15d30 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,21 +1,11 @@ repos: - - repo: https://github.com/PyCQA/flake8 - rev: 5.0.4 + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.15.4 hooks: - - id: flake8 - args: ['--extend-ignore=E231', "--max-line-length=120"] - - repo: https://github.com/PyCQA/isort - rev: 5.11.5 - hooks: - - id: isort - args: ["--line-length=120"] - - repo: https://github.com/google/yapf - rev: v0.43.0 - hooks: - - id: yapf - args: ['-i', '--style={based_on_style: pep8, column_limit: 120}'] + - id: ruff-check + args: ["--fix", "--exit-non-zero-on-fix"] - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.3.0 + rev: v6.0.0 hooks: - id: trailing-whitespace - id: check-yaml @@ -23,8 +13,6 @@ repos: - id: requirements-txt-fixer - id: double-quote-string-fixer - id: check-merge-conflict - - id: fix-encoding-pragma - args: ["--remove"] - id: mixed-line-ending args: ["--fix=lf"] diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py index 7ea918415d..abd7f49d4e 100644 --- a/autotest/interface/pipeline/test_pipeline_func.py +++ b/autotest/interface/pipeline/test_pipeline_func.py @@ -3,9 +3,14 @@ import pydantic import pytest from utils.config_utils import set_device_env_variable, unset_device_env_variable -from utils.pipeline_chat import (assert_pipeline_batch_return, assert_pipeline_batch_stream_return, - assert_pipeline_common_log, assert_pipeline_single_return, - assert_pipeline_single_stream_return, save_pipeline_common_log) +from utils.pipeline_chat import ( + assert_pipeline_batch_return, + assert_pipeline_batch_stream_return, + assert_pipeline_common_log, + assert_pipeline_single_return, + assert_pipeline_single_stream_return, + save_pipeline_common_log, +) from utils.restful_return_check import has_repeated_fragment from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline @@ -206,15 +211,15 @@ def run_pipeline_testcase_special_words_false(config, model, backend, file_name) model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) pipe = init_pipeline(model_path, backend_config=backend_config) - prompt = '<|im_start|>system\n当开启工具以及代码时,根据需求选择合适的工具进行调用\n' + \ - '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' + \ - '能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' + \ - '发送含有 Python >代码的消息时,它将在该环境中执行。这个工具适用于多种场景,' + \ - '如数据分析或处理(包括数据操作、统计分析、图表绘制),复杂的计算问题(解决数学和物理' + \ - '难题),编程示例(理解编程概念或特性),文本处理和分析(比如文本解析和自然语言处理),机器学习和数据科学(用于' + \ - '展示模型训练和数据可视化),以及文件操作和数据导入(处理CSV、JSON等格式的文件)。<|im_end|>\n' + \ - '<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$,计算曲线积分:$I=\\int_L' + \ - '{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant' + prompt = '<|im_start|>system\n当开启工具以及代码时,根据需求选择合适的工具进行调用\n' \ + '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' \ + '能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' \ + '发送含有 Python >代码的消息时,它将在该环境中执行。这个工具适用于多种场景,' \ + '如数据分析或处理(包括数据操作、统计分析、图表绘制),复杂的计算问题(解决数学和物理' \ + '难题),编程示例(理解编程概念或特性),文本处理和分析(比如文本解析和自然语言处理),' \ + '机器学习和数据科学(用于展示模型训练和数据可视化),以及文件操作和数据导入(处理CSV、' \ + 'JSON等格式的文件)。<|im_end|>\n<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$,' \ + '计算曲线积分:$I=\\int_L{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant' gen_config = GenerationConfig(skip_special_tokens=False) response = pipe(prompt, gen_config=gen_config) result = '<|action_start|><|interpreter|>' in response.text @@ -226,15 +231,15 @@ def run_pipeline_testcase_special_words_true(config, model, backend, file_name): model_path = '/'.join([config.get('model_path'), model]) backend_config = backend(tp=2) pipe = init_pipeline(model_path, backend_config=backend_config) - prompt = '<|im_start|>system\n当开启工具以及代码时,根据需求选择合适的工具进行调用\n' + \ - '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' + \ - '能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' + \ - '发送含有 Python >代码的消息时,它将在该环境中执行。这个工具适用于多种场景,' + \ - '如数据分析或处理(包括数据操作、统计分析、图表绘制),复杂的计算问题(解决数学和物理' + \ - '难题),编程示例(理解编程概念或特性),文本处理和分析(比如文本解析和自然语言处理),机器学习和数据科学(用于' + \ - '展示模型训练和数据可视化),以及文件操作和数据导入(处理CSV、JSON等格式的文件)。<|im_end|>\n' + \ - '<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$,计算曲线积分:$I=\\int_L' + \ - '{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant' + prompt = '<|im_start|>system\n当开启工具以及代码时,根据需求选择合适的工具进行调用\n' \ + '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' \ + '能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' \ + '发送含有 Python >代码的消息时,它将在该环境中执行。这个工具适用于多种场景,' \ + '如数据分析或处理(包括数据操作、统计分析、图表绘制),复杂的计算问题(解决数学和物理' \ + '难题),编程示例(理解编程概念或特性),文本处理和分析(比如文本解析和自然语言处理),' \ + '机器学习和数据科学(用于展示模型训练和数据可视化),以及文件操作和数据导入(处理CSV、' \ + 'JSON等格式的文件)。<|im_end|>\n<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$,' \ + '计算曲线积分:$I=\\int_L{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant' gen_config = GenerationConfig(skip_special_tokens=True) response = pipe(prompt, gen_config=gen_config) result = '<|action_start|><|interpreter|>' not in response.text diff --git a/autotest/interface/restful/test_restful_chat_completions_v1.py b/autotest/interface/restful/test_restful_chat_completions_v1.py index 464c915a53..67e166a568 100644 --- a/autotest/interface/restful/test_restful_chat_completions_v1.py +++ b/autotest/interface/restful/test_restful_chat_completions_v1.py @@ -3,8 +3,11 @@ import pytest from openai import OpenAI from utils.constant import BACKEND_LIST, RESTFUL_MODEL_LIST -from utils.restful_return_check import (assert_chat_completions_batch_return, assert_chat_completions_stream_return, - has_repeated_fragment) +from utils.restful_return_check import ( + assert_chat_completions_batch_return, + assert_chat_completions_stream_return, + has_repeated_fragment, +) from lmdeploy.serve.openai.api_client import APIClient, get_model_list @@ -223,15 +226,15 @@ def test_array_stopwords_streaming(self, backend, model_case): @pytest.mark.internlm2_5 def test_special_words(self, backend, model_case): - message = '<|im_start|>system\n当开启工具以及代码时,根据需求选择合适的工具进行调用\n' + \ - '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' + \ - '能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' + \ - '发送含有 Python >代码的消息时,它将在该环境中执行。这个工具适用于多种场景,' + \ - '如数据分析或处理(包括数据操作、统计分析、图表绘制),复杂的计算问题(解决数学和物理' + \ - '难题),编程示例(理解编程概念或特性),文本处理和分析(比如文本解析和自然语言处理),机器学习和数据科学(用于' + \ - '展示模型训练和数据可视化),以及文件操作和数据导入(处理CSV、JSON等格式的文件)。<|im_end|>\n' + \ - '<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$,计算曲线积分:$I=\\int_L' + \ - '{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant' + message = '<|im_start|>system\n当开启工具以及代码时,根据需求选择合适的工具进行调用\n' \ + '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' \ + '能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' \ + '发送含有 Python >代码的消息时,它将在该环境中执行。这个工具适用于多种场景,' \ + '如数据分析或处理(包括数据操作、统计分析、图表绘制),复杂的计算问题(解决数学和物理' \ + '难题),编程示例(理解编程概念或特性),文本处理和分析(比如文本解析和自然语言处理),' \ + '机器学习和数据科学(用于展示模型训练和数据可视化),以及文件操作和数据导入(处理CSV、' \ + 'JSON等格式的文件)。<|im_end|>\n<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$,' \ + '计算曲线积分:$I=\\int_L{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant' api_client = APIClient(BASE_URL) model_name = api_client.available_models[0] for output in api_client.chat_completions_v1(model=model_name, diff --git a/autotest/interface/restful/test_restful_completions_v1.py b/autotest/interface/restful/test_restful_completions_v1.py index 03316f6679..d3be161dfe 100644 --- a/autotest/interface/restful/test_restful_completions_v1.py +++ b/autotest/interface/restful/test_restful_completions_v1.py @@ -178,7 +178,8 @@ def test_batch_prompt_order(self, backend, model_case): api_client = APIClient(BASE_URL) model_name = api_client.available_models[0] for item in api_client.completions_v1(model=model_name, - prompt=['你好', '今天天气怎么样', '你是谁', '帮我写一首以梅花为主题的五言律诗', '5+2等于多少'], + prompt=['你好', '今天天气怎么样', '你是谁', + '帮我写一首以梅花为主题的五言律诗', '5+2等于多少'], max_tokens=400, min_tokens=50): print(str(item)) diff --git a/autotest/interface/restful/test_restful_generate.py b/autotest/interface/restful/test_restful_generate.py index cf4c9a463e..389ee47adb 100644 --- a/autotest/interface/restful/test_restful_generate.py +++ b/autotest/interface/restful/test_restful_generate.py @@ -4,7 +4,7 @@ import time from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime -from typing import Any, Dict, List +from typing import Any import pytest import requests @@ -115,8 +115,8 @@ def status_code(self): return resp def _validate_generation_response(self, - data: Dict[str, Any], - expected_fields: List[str] = None, + data: dict[str, Any], + expected_fields: list[str] = None, validate_tokens: bool = True, expect_logprobs: bool = False, validate_experts: bool = False) -> None: diff --git a/autotest/tools/chat/test_command_chat_hf_pytorch.py b/autotest/tools/chat/test_command_chat_hf_pytorch.py index 970eb34469..317a23b3d7 100644 --- a/autotest/tools/chat/test_command_chat_hf_pytorch.py +++ b/autotest/tools/chat/test_command_chat_hf_pytorch.py @@ -1,6 +1,11 @@ import pytest -from tools.common_case_config import (MODELSCOPE_CONFIG, PYTORCH_LORA_TEST_LLM_GPU1, PYTORCH_LORA_TEST_LLM_GPU2, - PYTORCH_PR_TEST_LLM_GPU1, PYTORCH_PR_TEST_LLM_GPU2) +from tools.common_case_config import ( + MODELSCOPE_CONFIG, + PYTORCH_LORA_TEST_LLM_GPU1, + PYTORCH_LORA_TEST_LLM_GPU2, + PYTORCH_PR_TEST_LLM_GPU1, + PYTORCH_PR_TEST_LLM_GPU2, +) from utils.config_utils import get_func_config_list, get_workerid from utils.run_client_chat import run_tests diff --git a/autotest/tools/chat/test_command_chat_hf_turbomind.py b/autotest/tools/chat/test_command_chat_hf_turbomind.py index 46eda4af4b..d8ba6a9472 100644 --- a/autotest/tools/chat/test_command_chat_hf_turbomind.py +++ b/autotest/tools/chat/test_command_chat_hf_turbomind.py @@ -1,7 +1,11 @@ import pytest -from tools.common_case_config import (MODELSCOPE_CONFIG, TURBOMIND_FALLBACK_TEST_LLM_GPU1, - TURBOMIND_FALLBACK_TEST_LLM_GPU2, TURBOMIND_PR_TEST_LLM_GPU1, - TURBOMIND_PR_TEST_LLM_GPU2) +from tools.common_case_config import ( + MODELSCOPE_CONFIG, + TURBOMIND_FALLBACK_TEST_LLM_GPU1, + TURBOMIND_FALLBACK_TEST_LLM_GPU2, + TURBOMIND_PR_TEST_LLM_GPU1, + TURBOMIND_PR_TEST_LLM_GPU2, +) from utils.config_utils import get_func_config_list, get_workerid from utils.run_client_chat import run_tests diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py index bc41a8156c..9443be79e1 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py @@ -1,7 +1,12 @@ import pytest -from tools.common_case_config import (MODELSCOPE_CONFIG, PYTORCH_LORA_TEST_LLM_GPU1, PYTORCH_LORA_TEST_LLM_GPU2, - PYTORCH_PR_TEST_LLM_GPU1, PYTORCH_PR_TEST_LLM_GPU2, - SPECULATIVE_DECODING_PIPELINE_TEST_LLM) +from tools.common_case_config import ( + MODELSCOPE_CONFIG, + PYTORCH_LORA_TEST_LLM_GPU1, + PYTORCH_LORA_TEST_LLM_GPU2, + PYTORCH_PR_TEST_LLM_GPU1, + PYTORCH_PR_TEST_LLM_GPU2, + SPECULATIVE_DECODING_PIPELINE_TEST_LLM, +) from utils.config_utils import get_func_config_list, get_workerid from utils.pipeline_chat import run_pipeline_llm_test diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py index 42801eabb9..894ac1bb59 100644 --- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py +++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py @@ -1,7 +1,11 @@ import pytest -from tools.common_case_config import (MODELSCOPE_CONFIG, TURBOMIND_FALLBACK_TEST_LLM_GPU1, - TURBOMIND_FALLBACK_TEST_LLM_GPU2, TURBOMIND_PR_TEST_LLM_GPU1, - TURBOMIND_PR_TEST_LLM_GPU2) +from tools.common_case_config import ( + MODELSCOPE_CONFIG, + TURBOMIND_FALLBACK_TEST_LLM_GPU1, + TURBOMIND_FALLBACK_TEST_LLM_GPU2, + TURBOMIND_PR_TEST_LLM_GPU1, + TURBOMIND_PR_TEST_LLM_GPU2, +) from utils.config_utils import get_func_config_list, get_workerid from utils.pipeline_chat import run_pipeline_llm_test diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py index c27822eb47..f0c4d7bf07 100644 --- a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py @@ -1,9 +1,16 @@ import time import pytest -from tools.common_case_config import (MODELSCOPE_CONFIG, PYTORCH_LORA_TEST_LLM_GPU1, PYTORCH_LORA_TEST_LLM_GPU2, - PYTORCH_PR_TEST_LLM_GPU1, PYTORCH_PR_TEST_LLM_GPU2, REASONING_TEST_LLM, - SPECULATIVE_DECODING_RESTFUL_TEST_LLM, TOOLCALL_TEST_LLM) +from tools.common_case_config import ( + MODELSCOPE_CONFIG, + PYTORCH_LORA_TEST_LLM_GPU1, + PYTORCH_LORA_TEST_LLM_GPU2, + PYTORCH_PR_TEST_LLM_GPU1, + PYTORCH_PR_TEST_LLM_GPU2, + REASONING_TEST_LLM, + SPECULATIVE_DECODING_RESTFUL_TEST_LLM, + TOOLCALL_TEST_LLM, +) from utils.config_utils import get_case_str_by_config, get_func_config_list, get_workerid from utils.constant import PROXY_PORT from utils.proxy_distributed_utils import ApiServerPerTest, proxy_worker_node_wait diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py index 8d2cd95c9c..a7460b6e72 100644 --- a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py +++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py @@ -1,8 +1,14 @@ import pytest -from tools.common_case_config import (MODELSCOPE_CONFIG, REASONING_TEST_LLM, TOOLCALL_TEST_LLM, - TURBOMIND_FALLBACK_TEST_LLM_GPU1, TURBOMIND_FALLBACK_TEST_LLM_GPU2, - TURBOMIND_LOGPROBS_TEST_LLM_GPU2, TURBOMIND_PR_TEST_LLM_GPU1, - TURBOMIND_PR_TEST_LLM_GPU2) +from tools.common_case_config import ( + MODELSCOPE_CONFIG, + REASONING_TEST_LLM, + TOOLCALL_TEST_LLM, + TURBOMIND_FALLBACK_TEST_LLM_GPU1, + TURBOMIND_FALLBACK_TEST_LLM_GPU2, + TURBOMIND_LOGPROBS_TEST_LLM_GPU2, + TURBOMIND_PR_TEST_LLM_GPU1, + TURBOMIND_PR_TEST_LLM_GPU2, +) from utils.config_utils import get_func_config_list, get_workerid from utils.run_restful_chat import run_llm_test, run_logprob_test, run_reasoning_case, run_tools_case diff --git a/autotest/utils/common_utils.py b/autotest/utils/common_utils.py index 3a7fcd473f..f54c3aa489 100644 --- a/autotest/utils/common_utils.py +++ b/autotest/utils/common_utils.py @@ -1,14 +1,13 @@ import os import subprocess import sys -from typing import Tuple def execute_command_with_logging(cmd, log_file_path: str, timeout: int = 3600, env=None, - should_print=True) -> Tuple[bool, str]: + should_print=True) -> tuple[bool, str]: if env is None: env = os.environ.copy() diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py index 362a97ac67..5e65681546 100644 --- a/autotest/utils/config_utils.py +++ b/autotest/utils/config_utils.py @@ -1,7 +1,7 @@ import copy import os from collections import OrderedDict -from typing import Any, Dict, List, Optional +from typing import Any import yaml @@ -12,7 +12,7 @@ SUFFIX_INNER_W8A8 = '-inner-w8a8' -def resolve_extra_params(extra_params: Dict[str, Any], model_base_path: str) -> None: +def resolve_extra_params(extra_params: dict[str, Any], model_base_path: str) -> None: """Resolve relative model paths in extra_params to absolute paths. Centralised helper so that every call-site does not need its own @@ -37,10 +37,10 @@ def resolve_extra_params(extra_params: Dict[str, Any], model_base_path: str) -> def get_func_config_list(backend: str, - parallel_config: Dict[str, int], + parallel_config: dict[str, int], model_type: str = 'chat_model', func_type: str = 'func', - extra: Optional[Dict[str, Any]] = None) -> List[Dict]: + extra: dict[str, Any] | None = None) -> list[dict]: """Generate all valid running config combinations (communicator + quant policy + model). @@ -51,7 +51,7 @@ def get_func_config_list(backend: str, func_type: Test func type filter, default: func extra: extra config to update in each run config dict Returns: - List[Dict]: All valid run config dicts + list[dict]: All valid run config dicts """ config = get_config() device = config.get('device', 'cuda') @@ -127,7 +127,7 @@ def get_func_config_list(backend: str, return run_configs -def get_cli_common_param(run_config: Dict[str, Any]) -> str: +def get_cli_common_param(run_config: dict[str, Any]) -> str: """Generate cli common params string by run config dict.""" backend = run_config.get('backend') model = run_config.get('model') @@ -162,7 +162,7 @@ def get_cli_common_param(run_config: Dict[str, Any]) -> str: return ' '.join(cli_params).strip() -def get_cli_str(config: Dict[str, Any]) -> str: +def get_cli_str(config: dict[str, Any]) -> str: cli_str = [] # Extra params for key, value in config.items(): @@ -181,7 +181,7 @@ def get_cli_str(config: Dict[str, Any]) -> str: return ' '.join(cli_str) -def get_parallel_config(config: Dict, model_name: str) -> List[Dict[str, int]]: +def get_parallel_config(config: dict, model_name: str) -> list[dict[str, int]]: """Get matched parallel config dict by model name, default tp:1 if no match.""" result = [] @@ -201,23 +201,23 @@ def get_parallel_config(config: Dict, model_name: str) -> List[Dict[str, int]]: return result if result else [{'tp': 1}] -def _extract_models_from_config(config_value: Any) -> List[str]: +def _extract_models_from_config(config_value: Any) -> list[str]: """Extract flat model name list from config value (dict/list supported)""" models = [] - if isinstance(config_value, Dict): + if isinstance(config_value, dict): for model_list in config_value.values(): - if isinstance(model_list, List): + if isinstance(model_list, list): models.extend([m for m in model_list if isinstance(m, str)]) - elif isinstance(config_value, List): + elif isinstance(config_value, list): models.extend([m for m in config_value if isinstance(m, str)]) return models -def get_model_list(config: Dict, +def get_model_list(config: dict, backend: str, - parallel_config: Dict[str, int] = None, + parallel_config: dict[str, int] = None, model_type: str = 'chat_model', - func_type: str = 'func') -> List[str]: + func_type: str = 'func') -> list[str]: """Get filtered model list with quantization extended models by backend/parallel config/model type/func type. @@ -228,7 +228,7 @@ def get_model_list(config: Dict, model_type: Model type, default: chat_model func_type: Test func type filter, default: func Returns: - List[str]: Base models + quantization extended models + list[str]: Base models + quantization extended models """ model_config_key = f'{backend}_{model_type}' all_models = [] @@ -252,7 +252,7 @@ def get_model_list(config: Dict, return extended_models -def _filter_by_test_func_type(config: Dict, model_list: List[str], func_type: str) -> List[str]: +def _filter_by_test_func_type(config: dict, model_list: list[str], func_type: str) -> list[str]: """Filter model list by test function type, return intersection of two model sets.""" if func_type == 'func': @@ -292,7 +292,7 @@ def _extend_pytorch_quant_models(quant_config: dict, base_models: list, target_l target_list.append(model_name + SUFFIX_INNER_W8A8) -def _is_kvint_model(config: Dict, backend: str, model: str, quant_policy: int) -> bool: +def _is_kvint_model(config: dict, backend: str, model: str, quant_policy: int) -> bool: """Check if model supports the kv quantization policy, quant_policy=0 always return True.""" if quant_policy == 0: @@ -308,7 +308,7 @@ def _base_model_name(model: str) -> str: return model.replace('-inner-4bits', '').replace('-inner-w8a8', '').replace('-inner-gptq', '') -def get_quantization_model_list(type: str) -> List[str]: +def get_quantization_model_list(type: str) -> list[str]: """Get quantization model list by specified quant type(awq/gptq/w8a8)""" config = get_config() quant_model_list = [] @@ -340,7 +340,7 @@ def get_quantization_model_list(type: str) -> List[str]: return quant_model_list -def get_config() -> Dict[str, Any]: +def get_config() -> dict[str, Any]: """Load & get yaml config file, auto adapt device env & update log path.""" # Get device env & match config file path env_tag = os.environ.get('TEST_ENV') @@ -350,7 +350,7 @@ def get_config() -> Dict[str, Any]: if env_tag and not os.path.exists(config_path): config_path = 'autotest/config.yml' # Load yaml config file safely - with open(config_path, 'r', encoding='utf-8') as f: + with open(config_path, encoding='utf-8') as f: config = yaml.load(f.read(), Loader=yaml.SafeLoader) # Deep copy config to avoid modify raw data, update log path with github run id @@ -370,7 +370,7 @@ def get_config() -> Dict[str, Any]: return config_copy -def get_cuda_prefix_by_workerid(worker_id: Optional[str], parallel_config: Dict[str, int] = None) -> Optional[str]: +def get_cuda_prefix_by_workerid(worker_id: str | None, parallel_config: dict[str, int] = None) -> str | None: """Get cuda/ascend visible devices env prefix by worker id & parallel config.""" para_conf = parallel_config or {} @@ -387,7 +387,7 @@ def get_cuda_prefix_by_workerid(worker_id: Optional[str], parallel_config: Dict[ return f'ASCEND_RT_VISIBLE_DEVICES={cuda_id}' if device_type == 'ascend' else f'CUDA_VISIBLE_DEVICES={cuda_id}' -def get_cuda_id_by_workerid(worker_id: Optional[str], tp_num: int = 1) -> Optional[str]: +def get_cuda_id_by_workerid(worker_id: str | None, tp_num: int = 1) -> str | None: """Get cuda id str by worker id and tp num, return None if invalid worker id.""" if worker_id is None or 'gw' not in worker_id: @@ -398,7 +398,7 @@ def get_cuda_id_by_workerid(worker_id: Optional[str], tp_num: int = 1) -> Option return ','.join([str(cuda_num + i) for i in range(tp_num)]) -def get_workerid(worker_id: Optional[str]) -> int: +def get_workerid(worker_id: str | None) -> int: """Parse numeric worker id from worker id str, return 0 if invalid worker id.""" if worker_id is None or 'gw' not in worker_id: @@ -413,7 +413,7 @@ def is_quantization_model(model: str) -> bool: return any(key in lower_name for key in ('awq', '4bits', 'w4', 'int4')) -def _get_communicator_list(config: Dict, backend: str, parallel_config: Dict[str, int] = None) -> List[str]: +def _get_communicator_list(config: dict, backend: str, parallel_config: dict[str, int] = None) -> list[str]: """Get available communicator list by device and parallel config.""" device = config.get('device', None) @@ -429,7 +429,7 @@ def _get_communicator_list(config: Dict, backend: str, parallel_config: Dict[str return ['nccl', 'cuda-ipc'] -def set_device_env_variable(worker_id, parallel_config: Dict[str, int] = None): +def set_device_env_variable(worker_id, parallel_config: dict[str, int] = None): """Set device environment variable based on the device type.""" device = os.environ.get('DEVICE', 'cuda') @@ -460,13 +460,13 @@ def unset_device_env_variable(): del os.environ['CUDA_VISIBLE_DEVICES'] -def is_model_in_list(config: Dict, parallel_config: Dict[str, int], model: str) -> bool: +def is_model_in_list(config: dict, parallel_config: dict[str, int], model: str) -> bool: """Check if model matches the target parallel config.""" model_config = get_parallel_config(config, model) return parallel_config in model_config -def get_case_str_by_config(run_config: Dict[str, Any], is_simple: bool = True) -> str: +def get_case_str_by_config(run_config: dict[str, Any], is_simple: bool = True) -> str: """Generate case name string by run config dict.""" model_name = run_config['model'] backend_type = run_config['backend'] @@ -491,7 +491,7 @@ def get_case_str_by_config(run_config: Dict[str, Any], is_simple: bool = True) - return f'{backend_type}_{pure_model_name}_{communicator}_{parallel_str}_{quant_policy}{extra_params_case}' -def parse_config_by_case(case_str: str) -> Dict[str, Any]: +def parse_config_by_case(case_str: str) -> dict[str, Any]: """Parse run config dict from case name string (fix split & type convert bug)""" case_parts = case_str.split('_') diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py index 3c0a0e91b7..8535e805bf 100644 --- a/autotest/utils/evaluate_utils.py +++ b/autotest/utils/evaluate_utils.py @@ -83,7 +83,7 @@ def llm_summary(case_name, result, msg, work_dir, result_dir=None): if not os.path.exists(csv_file): raise FileNotFoundError('CSV file does not exist') - with open(csv_file, 'r') as f: + with open(csv_file) as f: reader = csv.reader(f) next(reader) for row in reader: @@ -126,7 +126,7 @@ def mllm_summary(case_name, if dataset == 'OCRBench_MINI': score_file = f'{latest_dir}/{case_name}_{dataset}_score.json' cur_score = 0 - with open(score_file, 'r') as f: + with open(score_file) as f: total_score = json.load(f) cur_score = total_score['Final Score Norm'] metrics[dataset] = f'{cur_score:.2f}' # noqa: E231 diff --git a/autotest/utils/mp_log_utils.py b/autotest/utils/mp_log_utils.py index a80bbaa8ff..fdd7d4f1c9 100644 --- a/autotest/utils/mp_log_utils.py +++ b/autotest/utils/mp_log_utils.py @@ -22,7 +22,7 @@ def write_log(config, result, msg, is_new: bool = True, case_path_tag: str = 'de def assert_log(config, case_path_tag: str = 'default'): log_path = os.path.join(config.get('log_path'), case_path_tag) - with open(log_path, 'r') as f: + with open(log_path) as f: lines = f.readlines() for line in lines: diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py index f3e6694840..77a45aa6df 100644 --- a/autotest/utils/pipeline_chat.py +++ b/autotest/utils/pipeline_chat.py @@ -43,7 +43,7 @@ def run_pipeline_llm_test(config, run_config, common_case_config, worker_id: str with assume: assert result, stderr - with open(pipeline_log, 'r', encoding='utf-8') as file: + with open(pipeline_log, encoding='utf-8') as file: output_text = file.read() with open(pipeline_log, 'a') as file: @@ -101,7 +101,7 @@ def run_pipeline_mllm_test(config, run_config, worker_id: str = '', is_smoke: bo with assume: assert result, stderr - with open(pipeline_log, 'r', encoding='utf-8') as file: + with open(pipeline_log, encoding='utf-8') as file: output_text = file.read() with open(pipeline_log, 'a') as file: @@ -156,7 +156,7 @@ def run_pipeline_mllm_test(config, run_config, worker_id: str = '', is_smoke: bo if 'qwen' in model.lower(): Qwen_vl_testcase(output_text, file) - with open(pipeline_log, 'r', encoding='utf-8') as file: + with open(pipeline_log, encoding='utf-8') as file: output_text = file.read() print(output_text) allure.attach.file(pipeline_log, name=pipeline_log, attachment_type=allure.attachment_type.TEXT) @@ -356,7 +356,7 @@ def assert_pipeline_common_log(config, log_name): msg = 'result is empty, please check again' result = False - with open(config_log, 'r') as f: + with open(config_log) as f: lines = f.readlines() for line in lines: diff --git a/autotest/utils/proxy_distributed_utils.py b/autotest/utils/proxy_distributed_utils.py index dc4efdebad..fa8afe7997 100644 --- a/autotest/utils/proxy_distributed_utils.py +++ b/autotest/utils/proxy_distributed_utils.py @@ -3,7 +3,7 @@ import socket import subprocess import time -from typing import Any, Dict, Tuple +from typing import Any import requests from utils.config_utils import get_case_str_by_config, get_cli_common_param, resolve_extra_params @@ -22,13 +22,13 @@ def is_port_open(host: str, port: int, timeout: float = 1.0) -> bool: try: s.connect((host, port)) return True - except (socket.timeout, ConnectionRefusedError, OSError): + except (TimeoutError, ConnectionRefusedError, OSError): return False def check_nodes_status(host: str, proxy_port: int, model_name: str, expected_instances: int, check_count: int, current_time: float, last_progress_print: float, - progress_print_interval: int) -> Tuple[bool, int]: + progress_print_interval: int) -> tuple[bool, int]: try: nodes_url = f'http://{host}:{proxy_port}/nodes/status' resp = requests.get(nodes_url, timeout=10) @@ -215,7 +215,7 @@ def cleanup(self): class ApiServerPerTest: - def __init__(self, proxy_manager: ProxyDistributedManager, config: Dict[str, Any], run_config: Dict[str, Any]): + def __init__(self, proxy_manager: ProxyDistributedManager, config: dict[str, Any], run_config: dict[str, Any]): self.proxy_manager = proxy_manager self.config = config self.run_config = run_config diff --git a/autotest/utils/ray_distributed_utils.py b/autotest/utils/ray_distributed_utils.py index 2b87a4bb41..6b26c91a7b 100644 --- a/autotest/utils/ray_distributed_utils.py +++ b/autotest/utils/ray_distributed_utils.py @@ -4,7 +4,7 @@ import subprocess import time from time import time as time_time -from typing import Any, Dict +from typing import Any import requests from utils.config_utils import get_case_str_by_config, get_cli_common_param, resolve_extra_params @@ -252,7 +252,7 @@ def cleanup(self, force: bool = True): print(f'⚠️ Ray stop exception: {e}') self._cleaned = True # Only mark as "fully cleaned" when force=True - def get_cluster_info(self) -> Dict[str, Any]: + def get_cluster_info(self) -> dict[str, Any]: return { 'node_rank': self.node_rank, 'node_count': self.node_count, diff --git a/autotest/utils/restful_return_check.py b/autotest/utils/restful_return_check.py index 3dbbd2902a..b425b809da 100644 --- a/autotest/utils/restful_return_check.py +++ b/autotest/utils/restful_return_check.py @@ -46,16 +46,16 @@ def assert_usage(usage): def assert_logprobs(logprobs, logprobs_num): assert_logprob_element(logprobs) assert len(logprobs.get('top_logprobs')) >= 0 - assert type(logprobs.get('top_logprobs')) == list + assert type(logprobs.get('top_logprobs')) is list assert len(logprobs.get('top_logprobs')) <= logprobs_num for logprob_element in logprobs.get('top_logprobs'): assert_logprob_element(logprob_element) def assert_logprob_element(logprob): - assert len(logprob.get('token')) > 0 and type(logprob.get('token')) == str - assert len(logprob.get('bytes')) > 0 and type(logprob.get('bytes')) == list - assert type(logprob.get('logprob')) == float + assert len(logprob.get('token')) > 0 and type(logprob.get('token')) is str + assert len(logprob.get('bytes')) > 0 and type(logprob.get('bytes')) is list + assert type(logprob.get('logprob')) is float def assert_chat_completions_stream_return(output, diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py index 13192d37c5..94a605d176 100644 --- a/autotest/utils/run_restful_chat.py +++ b/autotest/utils/run_restful_chat.py @@ -8,8 +8,13 @@ import requests from openai import OpenAI from pytest_assume.plugin import assume -from utils.config_utils import (get_case_str_by_config, get_cli_common_param, get_cuda_prefix_by_workerid, get_workerid, - resolve_extra_params) +from utils.config_utils import ( + get_case_str_by_config, + get_cli_common_param, + get_cuda_prefix_by_workerid, + get_workerid, + resolve_extra_params, +) from utils.constant import DEFAULT_PORT, DEFAULT_SERVER from utils.restful_return_check import assert_chat_completions_batch_return from utils.rule_condition_assert import assert_result @@ -82,7 +87,7 @@ def start_openai_service(config, run_config, worker_id, timeout: int = 1200): # Check if process is still running return_code = startRes.wait(timeout=1) # Small timeout to check status if return_code != 0: - with open(server_log, 'r') as f: + with open(server_log) as f: content = f.read() print(content) return 0, content @@ -576,8 +581,6 @@ def _run_tools_case(log_path, port: int = DEFAULT_PORT): timestamp = time.strftime('%Y%m%d_%H%M%S') restful_log = os.path.join(log_path, f'restful_toolcall_{model}_{str(port)}_{timestamp}.log') - file = open(restful_log, 'w') - client = OpenAI(api_key='YOUR_API_KEY', base_url=http_url + '/v1') model_name = client.models.list().data[0].id @@ -729,7 +732,7 @@ def start_proxy_server(log_path, port, case_name: str = 'default'): # Check if process is still running return_code = proxy_process.wait(timeout=1) # Small timeout to check status if return_code != 0: - with open(proxy_log, 'r') as f: + with open(proxy_log) as f: content = f.read() print(content) return 0, proxy_process diff --git a/autotest/utils/toolkit.py b/autotest/utils/toolkit.py index 7341c9d044..606609870f 100644 --- a/autotest/utils/toolkit.py +++ b/autotest/utils/toolkit.py @@ -1,5 +1,4 @@ from functools import lru_cache -from typing import List from transformers import AutoTokenizer @@ -31,7 +30,7 @@ def _load_tokenizer_cached(model_path: str): raise RuntimeError(f"Failed to load tokenizer from '{model_path}': {e}") -def encode_text(model_path: str, text: str) -> List[int]: +def encode_text(model_path: str, text: str) -> list[int]: tokenizer = _load_tokenizer_cached(model_path) encoded = tokenizer.encode(text) diff --git a/benchmark/benchmark_decode.py b/benchmark/benchmark_decode.py index 3dd20a90c0..d74dadb908 100644 --- a/benchmark/benchmark_decode.py +++ b/benchmark/benchmark_decode.py @@ -5,9 +5,8 @@ import fire import numpy as np -from transformers import AutoTokenizer - from lmdeploy.pytorch.decode import Engine +from transformers import AutoTokenizer def benchmark(model_path, share_gpt_path, downsample=100, accel=None, save_to='decode_result'): @@ -17,7 +16,7 @@ def benchmark(model_path, share_gpt_path, downsample=100, accel=None, save_to='d """ start = time.monotonic() - content = json.load(open(share_gpt_path, 'r')) + content = json.load(open(share_gpt_path)) texts = [] for c in content: diff --git a/benchmark/benchmark_pipeline.py b/benchmark/benchmark_pipeline.py index 64a3deb721..63d2a2bee8 100644 --- a/benchmark/benchmark_pipeline.py +++ b/benchmark/benchmark_pipeline.py @@ -1,6 +1,5 @@ import os import subprocess -from typing import Dict, List import fire import yaml @@ -48,9 +47,9 @@ def benchmark(model_path, backend, engine_config, data_config): tp = engine_config.get('tp', 1) output_file = f'benchmark_pipeline_{model_name}_{backend}_bs{bs}_tp{tp}_cache{cach_ratio}.csv' try: - if isinstance(data_config, Dict): + if isinstance(data_config, dict): data_config = [data_config] - assert isinstance(data_config, List) and all(isinstance(d, Dict) for d in data_config) + assert isinstance(data_config, list) and all(isinstance(d, dict) for d in data_config) for _data_config in data_config: _data_config['csv'] = output_file cmd = get_cmd(model_path, backend, engine_config, _data_config) @@ -61,13 +60,13 @@ def benchmark(model_path, backend, engine_config, data_config): def main(model_path=None, backend=None, config_path=None): - with open(config_path, 'r') as f: + with open(config_path) as f: config = yaml.safe_load(f) engine_configs = config['engine'] data_config = config['data'] - if isinstance(engine_configs, Dict): + if isinstance(engine_configs, dict): engine_configs = [engine_configs] - assert isinstance(engine_configs, List) and all(isinstance(s, Dict) for s in engine_configs) + assert isinstance(engine_configs, list) and all(isinstance(s, dict) for s in engine_configs) for engine_config in engine_configs: # The model_path provided by the user will override the model_path in the config file. model_path = model_path or engine_config.pop('model_path') diff --git a/benchmark/benchmark_serving.py b/benchmark/benchmark_serving.py index 2527507d33..be6785cfd5 100644 --- a/benchmark/benchmark_serving.py +++ b/benchmark/benchmark_serving.py @@ -1,7 +1,6 @@ import os import subprocess import time -from typing import Dict, List, Optional, Tuple import fire import yaml @@ -55,7 +54,7 @@ def get_output_file(model_path, backend, server_config): return f'benchmark_{model_name}_{backend}_{params_str}.csv' -def get_server_ip_port(backend: str, server_config: Dict) -> Tuple[str, int]: +def get_server_ip_port(backend: str, server_config: dict) -> tuple[str, int]: if backend in ['turbomind', 'pytorch']: if server_config.get('proxy_url'): # If proxy_url is set, we use the proxy server's IP and port @@ -90,7 +89,7 @@ def wait_server_ready(server_ip: str, server_port: int) -> bool: time.sleep(5) -def get_client_cmd(backend: str, server_ip: str, server_port: int, client_config: Dict) -> List[str]: +def get_client_cmd(backend: str, server_ip: str, server_port: int, client_config: dict) -> list[str]: """Generate the client benchmark command.""" current_dir = os.path.dirname(os.path.abspath(__file__)) if backend in ['turbomind', 'pytorch']: @@ -112,7 +111,7 @@ def get_client_cmd(backend: str, server_ip: str, server_port: int, client_config return cmd -def benchmark(model_path: str, backend: str, server_config: Dict, data_config: Dict | List[Dict]): +def benchmark(model_path: str, backend: str, server_config: dict, data_config: dict | list[dict]): """Benchmark the server with the given configuration. Args: @@ -121,9 +120,9 @@ def benchmark(model_path: str, backend: str, server_config: Dict, data_config: D server_config: Configuration for the server and the inference engine. data_config: Configuration for the data. """ - if isinstance(data_config, Dict): + if isinstance(data_config, dict): data_config = [data_config] - if not (isinstance(data_config, List) and all(isinstance(d, Dict) for d in data_config)): + if not (isinstance(data_config, list) and all(isinstance(d, dict) for d in data_config)): raise ValueError('data_config must be a dict or list of dicts') server_cmd = get_launching_server_cmd(model_path, backend, server_config) @@ -166,7 +165,7 @@ def benchmark(model_path: str, backend: str, server_config: Dict, data_config: D proc.kill() -def validate_config(config: Dict) -> None: +def validate_config(config: dict) -> None: """Validate the configuration structure. Args: @@ -180,14 +179,14 @@ def validate_config(config: Dict) -> None: if section not in config: raise ValueError(f'Missing required config section: {section}') - if not isinstance(config['engine'], (Dict, List)): + if not isinstance(config['engine'], (dict, list)): raise ValueError('engine config must be a dict or list of dicts') - if not isinstance(config['data'], (Dict, List)): + if not isinstance(config['data'], (dict, list)): raise ValueError('data config must be a dict or list of dicts') -def main(backend: str, config_path: str, model_path: Optional[str] = None): +def main(backend: str, config_path: str, model_path: str | None = None): """Main entry point for the benchmark script. Args: @@ -197,14 +196,14 @@ def main(backend: str, config_path: str, model_path: Optional[str] = None): Raises: BenchmarkConfigError: If required parameters are missing or config is invalid """ - with open(config_path, 'r') as f: + with open(config_path) as f: config = yaml.safe_load(f) server_config = config['server'] engine_configs = config['engine'] data_config = config['data'] - if isinstance(engine_configs, Dict): + if isinstance(engine_configs, dict): engine_configs = [engine_configs] - assert isinstance(engine_configs, List) and all(isinstance(s, Dict) for s in engine_configs) + assert isinstance(engine_configs, list) and all(isinstance(s, dict) for s in engine_configs) for engine_config in engine_configs: server_config = server_config.copy() server_config.update(engine_config) # Merge engine config with server config diff --git a/benchmark/benchmark_throughput.py b/benchmark/benchmark_throughput.py index 49747d96c5..e8fc57d8f3 100644 --- a/benchmark/benchmark_throughput.py +++ b/benchmark/benchmark_throughput.py @@ -1,6 +1,5 @@ import os import subprocess -from typing import Dict, List import fire import yaml @@ -48,9 +47,9 @@ def benchmark(model_path, backend, engine_config, data_config): tp = engine_config.get('tp', 1) output_file = f'benchmark_throughput_{model_name}_{backend}_bs{bs}_tp{tp}_cache{cach_ratio}.csv' try: - if isinstance(data_config, Dict): + if isinstance(data_config, dict): data_config = [data_config] - assert isinstance(data_config, List) and all(isinstance(d, Dict) for d in data_config) + assert isinstance(data_config, list) and all(isinstance(d, dict) for d in data_config) for _data_config in data_config: _data_config['csv'] = output_file cmd = get_cmd(model_path, backend, engine_config, _data_config) @@ -61,13 +60,13 @@ def benchmark(model_path, backend, engine_config, data_config): def main(model_path=None, backend=None, config_path=None): - with open(config_path, 'r') as f: + with open(config_path) as f: config = yaml.safe_load(f) engine_configs = config['engine'] data_config = config['data'] - if isinstance(engine_configs, Dict): + if isinstance(engine_configs, dict): engine_configs = [engine_configs] - assert isinstance(engine_configs, List) and all(isinstance(s, Dict) for s in engine_configs) + assert isinstance(engine_configs, list) and all(isinstance(s, dict) for s in engine_configs) for engine_config in engine_configs: # The model_path provided by the user will override the model_path in the config file. model_path = model_path or engine_config.pop('model_path') diff --git a/benchmark/profile_pipeline_api.py b/benchmark/profile_pipeline_api.py index 57cef20384..e8a67e1fb8 100644 --- a/benchmark/profile_pipeline_api.py +++ b/benchmark/profile_pipeline_api.py @@ -3,7 +3,6 @@ import json import os import random -from typing import List, Optional, Tuple import numpy as np from tqdm import tqdm @@ -21,8 +20,8 @@ def sample_sharegpt_requests( dataset_path: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int] = None, -) -> List[Tuple[str, int, int]]: + fixed_output_len: int | None = None, +) -> list[tuple[str, int, int]]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError('output_len too small') @@ -38,7 +37,7 @@ def sample_sharegpt_requests( random.shuffle(dataset) # Filter out sequences that are too long or too short - filtered_dataset: List[Tuple[str, int, int]] = [] + filtered_dataset: list[tuple[str, int, int]] = [] for i in range(len(dataset)): if len(filtered_dataset) == num_requests: break @@ -70,7 +69,7 @@ def sample_random_requests( range_ratio: float, tokenizer: PreTrainedTokenizerBase, dataset_path: str, -) -> List[Tuple[str, int, int]]: +) -> list[tuple[str, int, int]]: input_lens = np.random.randint( max(int(input_len * range_ratio), 1), @@ -101,7 +100,7 @@ def sample_random_requests( random.shuffle(dataset) # Filter out sequences that are too long or too short - input_requests: List[Tuple[str, int, int]] = [] + input_requests: list[tuple[str, int, int]] = [] for i in range(num_prompts): # Tokenize the prompts and completions. prompt = dataset[i][0] @@ -150,7 +149,7 @@ def process_request(self, requests, profiler: Profiler, temperature, top_p, top_ max_new_tokens=output_len) for _, _, output_len in requests ] - sess: List[Session] = [] + sess: list[Session] = [] for _, input_len, output_len in requests: sess.append(profiler.new_session(input_len, output_len)) diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py index 127e420125..34a8853b52 100644 --- a/benchmark/profile_restful_api.py +++ b/benchmark/profile_restful_api.py @@ -22,9 +22,10 @@ import traceback import warnings from argparse import ArgumentParser +from collections.abc import AsyncGenerator from dataclasses import dataclass, field from datetime import datetime -from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union +from typing import Any import aiohttp import numpy as np @@ -32,8 +33,13 @@ import requests from PIL import Image from tqdm.asyncio import tqdm -from transformers import (AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerBase, - PreTrainedTokenizerFast) +from transformers import ( + AutoProcessor, + AutoTokenizer, + PreTrainedTokenizer, + PreTrainedTokenizerBase, + PreTrainedTokenizerFast, +) AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=None) @@ -58,8 +64,8 @@ class RequestFuncInput: prompt_len: int output_len: int model: str - image_data: Optional[List[str]] - extra_request_body: Dict[str, Any] + image_data: list[str] | None + extra_request_body: dict[str, Any] @dataclass @@ -68,7 +74,7 @@ class RequestFuncOutput: success: bool = False latency: float = 0.0 ttft: float = 0.0 # Time to first token - itl: List[float] = field(default_factory=list) # List of inter-token latencies + itl: list[float] = field(default_factory=list) # List of inter-token latencies prompt_len: int = 0 output_len: int = 0 error: str = '' @@ -82,7 +88,7 @@ def remove_prefix(text: str, prefix: str) -> str: # https://github.com/triton-inference-server/tensorrtllm_backend/issues/505 async def async_request_trt_llm( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith('generate_stream') @@ -152,7 +158,7 @@ async def async_request_trt_llm( # set ignore_eos True by default async def async_request_openai_completions( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith('completions'), "OpenAI Completions API URL must end with 'completions'." @@ -230,7 +236,7 @@ async def async_request_openai_completions( async def async_request_openai_chat_completions( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url assert api_url.endswith('chat/completions'), "OpenAI Chat Completions API URL must end with 'chat/completions'." @@ -338,7 +344,7 @@ async def async_request_openai_chat_completions( async def async_request_sglang_generate( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: api_url = request_func_input.api_url prompt = request_func_input.prompt @@ -415,7 +421,7 @@ async def async_request_sglang_generate( async def async_request_gserver( request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, + pbar: tqdm | None = None, ) -> RequestFuncOutput: raise NotImplementedError() @@ -435,7 +441,7 @@ def get_model(pretrained_model_name_or_path: str) -> str: return pretrained_model_name_or_path -def get_tokenizer(pretrained_model_name_or_path: str, ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: +def get_tokenizer(pretrained_model_name_or_path: str, ) -> PreTrainedTokenizer | PreTrainedTokenizerFast: if pretrained_model_name_or_path.endswith('.json') or pretrained_model_name_or_path.endswith('.model'): from sglang.srt.hf_transformers_utils import get_tokenizer @@ -446,7 +452,7 @@ def get_tokenizer(pretrained_model_name_or_path: str, ) -> Union[PreTrainedToken return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True) -def get_processor(pretrained_model_name_or_path: str, ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: +def get_processor(pretrained_model_name_or_path: str, ) -> PreTrainedTokenizer | PreTrainedTokenizerFast: assert (pretrained_model_name_or_path is not None and pretrained_model_name_or_path != '') if pretrained_model_name_or_path.endswith('.json') or pretrained_model_name_or_path.endswith('.model'): from sglang.srt.utils.hf_transformers_utils import get_processor @@ -503,7 +509,7 @@ class BenchmarkMetrics: SHAREGPT_URL = 'https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json' # noqa -def download_and_cache_file(url: str, filename: Optional[str] = None): +def download_and_cache_file(url: str, filename: str | None = None): """Read and cache a file from a url.""" if filename is None: filename = os.path.join('/tmp', url.split('/')[-1]) @@ -542,9 +548,9 @@ class DatasetRow: prompt: str prompt_len: int output_len: int - text_prompt_len: Optional[int] = None - vision_prompt_len: Optional[int] = None - image_data: Optional[List[str]] = None + text_prompt_len: int | None = None + vision_prompt_len: int | None = None + image_data: list[str] | None = None def __post_init__(self): if self.text_prompt_len is None: @@ -556,7 +562,7 @@ def __post_init__(self): def sample_sharegpt_requests(dataset_path: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int] = None) -> List[DatasetRow]: + fixed_output_len: int | None = None) -> list[DatasetRow]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError('output_len too small') @@ -576,7 +582,7 @@ def sample_sharegpt_requests(dataset_path: str, random.shuffle(dataset) # Filter out sequences that are too long or too short - filtered_dataset: List[DatasetRow] = [] + filtered_dataset: list[DatasetRow] = [] for i in range(len(dataset)): if len(filtered_dataset) == num_requests: break @@ -621,7 +627,7 @@ def sample_random_requests( range_ratio: float, tokenizer: PreTrainedTokenizerBase, dataset_path: str, -) -> List[DatasetRow]: +) -> list[DatasetRow]: input_lens = compute_random_lens( full_len=input_len, @@ -655,8 +661,8 @@ def sample_random_requests( random.shuffle(dataset) # Filter out sequences that are too long or too short - input_requests: List[DatasetRow] = [] - origin_output_lens: List[int] = [] + input_requests: list[DatasetRow] = [] + origin_output_lens: list[int] = [] for i in range(num_prompts): # Tokenize the prompts and completions. prompt = dataset[i][0] @@ -683,7 +689,7 @@ def sample_random_requests( return input_requests -def parse_image_resolution(image_resolution: str) -> Tuple[int, int]: +def parse_image_resolution(image_resolution: str) -> tuple[int, int]: """Parse image resolution into (width, height). Supports presets '1080p', '720p', '360p'. And custom 'heightxwidth' format (e.g., '1080x1920' means height=1080, @@ -802,7 +808,7 @@ def sample_image_requests( image_format: str, image_resolution: str, backend: str, -) -> List[DatasetRow]: +) -> list[DatasetRow]: """Generate requests with images. - Each request includes ``image_count`` images. @@ -836,7 +842,7 @@ def sample_image_requests( num=num_requests, ) - def _gen_random_image_data_uri(width: int = width, height: int = height) -> Tuple[Image.Image, str, int]: + def _gen_random_image_data_uri(width: int = width, height: int = height) -> tuple[Image.Image, str, int]: if image_content == 'blank': # Generate blank white image arr = np.full((height, width, 3), 255, dtype=np.uint8) @@ -851,7 +857,7 @@ def _gen_random_image_data_uri(width: int = width, height: int = height) -> Tupl image_bytes = len(image_data.encode('utf-8')) return img, image_data, image_bytes - dataset: List[DatasetRow] = [] + dataset: list[DatasetRow] = [] total_image_bytes = 0 for i in range(num_requests): # Generate text prompt @@ -885,7 +891,7 @@ def _gen_random_image_data_uri(width: int = width, height: int = height) -> Tupl async def get_request( - input_requests: List[DatasetRow], + input_requests: list[DatasetRow], request_rate: float, ) -> AsyncGenerator[DatasetRow, None]: input_requests = iter(input_requests) @@ -903,22 +909,22 @@ async def get_request( def calculate_metrics( - input_requests: List[DatasetRow], - outputs: List[RequestFuncOutput], + input_requests: list[DatasetRow], + outputs: list[RequestFuncOutput], dur_s: float, tokenizer: PreTrainedTokenizerBase, backend: str, -) -> Tuple[BenchmarkMetrics, List[int]]: - output_lens: List[int] = [] - retokenized_output_lens: List[int] = [] +) -> tuple[BenchmarkMetrics, list[int]]: + output_lens: list[int] = [] + retokenized_output_lens: list[int] = [] total_input = 0 total_input_text = 0 total_input_vision = 0 completed = 0 - itls: List[float] = [] - tpots: List[float] = [] - ttfts: List[float] = [] - e2e_latencies: List[float] = [] + itls: list[float] = [] + tpots: list[float] = [] + ttfts: list[float] = [] + e2e_latencies: list[float] = [] for i in range(len(outputs)): if outputs[i].success: @@ -982,10 +988,10 @@ async def benchmark( api_url: str, model_id: str, tokenizer: PreTrainedTokenizerBase, - input_requests: List[DatasetRow], + input_requests: list[DatasetRow], request_rate: float, disable_tqdm: bool, - extra_request_body: Dict[str, Any], + extra_request_body: dict[str, Any], ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -1018,7 +1024,7 @@ async def benchmark( pbar = None if disable_tqdm else tqdm(total=len(input_requests)) benchmark_start_time = time.perf_counter() - tasks: List[asyncio.Task] = [] + tasks: list[asyncio.Task] = [] async for request in get_request(input_requests, request_rate): request_func_input = RequestFuncInput( model=model_id, @@ -1030,7 +1036,7 @@ async def benchmark( extra_request_body=extra_request_body, ) tasks.append(asyncio.create_task(request_func(request_func_input=request_func_input, pbar=pbar))) - outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) if pbar is not None: pbar.close() diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py index 78f545072b..c56b14d5c1 100644 --- a/benchmark/profile_throughput.py +++ b/benchmark/profile_throughput.py @@ -5,7 +5,6 @@ import os import random from queue import Queue -from typing import List, Optional, Tuple, Union import numpy as np from tqdm import tqdm @@ -25,8 +24,8 @@ def sample_sharegpt_requests( dataset_path: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int] = None, -) -> List[Tuple[str, int, int]]: + fixed_output_len: int | None = None, +) -> list[tuple[str, int, int]]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError('output_len too small') # Load the dataset. @@ -41,7 +40,7 @@ def sample_sharegpt_requests( random.shuffle(dataset) # Filter out sequences that are too long or too short - filtered_dataset: List[Tuple[str, int, int]] = [] + filtered_dataset: list[tuple[str, int, int]] = [] for i in range(len(dataset)): if len(filtered_dataset) == num_requests: break @@ -73,7 +72,7 @@ def sample_random_requests( range_ratio: float, tokenizer: PreTrainedTokenizerBase, dataset_path: str, -) -> List[Tuple[str, int, int]]: +) -> list[tuple[str, int, int]]: input_lens = np.random.randint( max(int(input_len * range_ratio), 1), @@ -104,7 +103,7 @@ def sample_random_requests( random.shuffle(dataset) # Filter out sequences that are too long or too short - input_requests: List[Tuple[str, int, int]] = [] + input_requests: list[tuple[str, int, int]] = [] for i in range(num_prompts): # Tokenize the prompts and completions. prompt = dataset[i][0] @@ -134,7 +133,7 @@ def sample_random_requests( class Engine: - def __init__(self, model_path: str, engine_config: Union[PytorchEngineConfig, TurbomindEngineConfig]): + def __init__(self, model_path: str, engine_config: PytorchEngineConfig | TurbomindEngineConfig): self.tokenizer = Tokenizer(model_path) if isinstance(engine_config, TurbomindEngineConfig): from lmdeploy.turbomind import TurboMind diff --git a/docs/en/conf.py b/docs/en/conf.py index 94ca2a4def..095173d32b 100644 --- a/docs/en/conf.py +++ b/docs/en/conf.py @@ -25,7 +25,7 @@ from lmdeploy.serve.proxy.proxy import app as proxy_server # noqa: E402 version_file = '../../lmdeploy/version.py' -with open(version_file, 'r') as f: +with open(version_file) as f: exec(compile(f.read(), version_file, 'exec')) __version__ = locals()['__version__'] diff --git a/docs/zh_cn/conf.py b/docs/zh_cn/conf.py index 5db30be50d..202fb138a5 100644 --- a/docs/zh_cn/conf.py +++ b/docs/zh_cn/conf.py @@ -25,7 +25,7 @@ from lmdeploy.serve.proxy.proxy import app as proxy_server # noqa: E402 version_file = '../../lmdeploy/version.py' -with open(version_file, 'r') as f: +with open(version_file) as f: exec(compile(f.read(), version_file, 'exec')) __version__ = locals()['__version__'] diff --git a/eval/eval.py b/eval/eval.py index 53a2bdb9af..c0b4ea2dd6 100644 --- a/eval/eval.py +++ b/eval/eval.py @@ -66,7 +66,7 @@ def read_config(): # Read config file content try: - with open(config_path, 'r', encoding='utf-8') as f: + with open(config_path, encoding='utf-8') as f: config_content = f.read() return config_content except FileNotFoundError: diff --git a/lmdeploy/api.py b/lmdeploy/api.py index 4f0ff34315..11f31c1de4 100644 --- a/lmdeploy/api.py +++ b/lmdeploy/api.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from __future__ import annotations -from typing import TYPE_CHECKING, List, Literal +from typing import TYPE_CHECKING, Literal from typing_extensions import deprecated @@ -13,13 +13,14 @@ def pipeline(model_path: str, - backend_config: 'TurbomindEngineConfig' | 'PytorchEngineConfig' | None = None, - chat_template_config: 'ChatTemplateConfig' | None = None, + backend_config: TurbomindEngineConfig | PytorchEngineConfig | None = None, + chat_template_config: ChatTemplateConfig | None = None, log_level: str = 'WARNING', max_log_len: int | None = None, - speculative_config: 'SpeculativeConfig' | None = None, + speculative_config: SpeculativeConfig | None = None, **kwargs): - """ + """Create a pipeline for inference. + Args: model_path: the path of a model. It could be one of the following options: @@ -34,14 +35,17 @@ def pipeline(model_path: str, on huggingface.co, such as ``internlm/internlm-chat-7b``, ``Qwen/Qwen-7B-Chat``, ``baichuan-inc/Baichuan2-7B-Chat`` and so on. - backend_config: backend - config instance. Default to None. - chat_template_config: chat template configuration. - Default to None. + backend_config: backend config instance. Default to None. + chat_template_config: chat template configuration. Default to None. log_level: set log level whose value among [``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``] max_log_len: Max number of prompt characters or prompt tokens - being printed in log + being printed in log. + speculative_config: speculative decoding configuration. + **kwargs: additional keyword arguments passed to the pipeline. + + Returns: + Pipeline: a pipeline instance for inference. Examples: @@ -62,8 +66,7 @@ def pipeline(model_path: str, im = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg') response = pipe([('describe this image', [im])]) print(response) - - """ # noqa E501 + """ # noqa E501 return Pipeline(model_path, backend_config=backend_config, @@ -78,12 +81,12 @@ def pipeline(model_path: str, def serve(model_path: str, model_name: str | None = None, backend: Literal['turbomind', 'pytorch'] = 'turbomind', - backend_config: 'TurbomindEngineConfig' | 'PytorchEngineConfig' | None = None, - chat_template_config: 'ChatTemplateConfig' | None = None, + backend_config: TurbomindEngineConfig | PytorchEngineConfig | None = None, + chat_template_config: ChatTemplateConfig | None = None, server_name: str = '0.0.0.0', server_port: int = 23333, log_level: str = 'ERROR', - api_keys: List[str] | str | None = None, + api_keys: list[str] | str | None = None, ssl: bool = False, **kwargs): """This function is deprecated and no longer available. @@ -106,11 +109,13 @@ def client(api_server_url: str = 'http://0.0.0.0:23333', api_key: str | None = N Args: api_server_url: communicating address ``http://:`` of - api_server + api_server. api_key: api key. Default to None, which means no api key will be used. - Return: - Chatbot for LLaMA series models with turbomind as inference engine. + + Raises: + NotImplementedError: This function has been deprecated and removed. + Use ``from lmdeploy.serve import APIClient`` instead. """ raise NotImplementedError("The 'client' function is no longer available. This function has been deprecated. " ' Please use "from lmdeploy.serve import APIClient" instead.') diff --git a/lmdeploy/archs.py b/lmdeploy/archs.py index a4fe0d2333..80707cd41f 100644 --- a/lmdeploy/archs.py +++ b/lmdeploy/archs.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import os -from typing import Dict, List, Literal, Tuple +from typing import Literal from transformers import AutoConfig @@ -58,7 +58,7 @@ def autoget_backend(model_path: str) -> Literal['turbomind', 'pytorch']: def autoget_backend_config( model_path: str, backend_config: PytorchEngineConfig | TurbomindEngineConfig | None = None -) -> Tuple[Literal['turbomind', 'pytorch'], PytorchEngineConfig | TurbomindEngineConfig]: +) -> tuple[Literal['turbomind', 'pytorch'], PytorchEngineConfig | TurbomindEngineConfig]: """Get backend config automatically. Args: @@ -78,7 +78,7 @@ def autoget_backend_config( backend = autoget_backend(model_path) config = PytorchEngineConfig() if backend == 'pytorch' else TurbomindEngineConfig() if backend_config is not None: - if type(backend_config) == type(config): + if type(backend_config) is type(config): config = backend_config else: data = asdict(backend_config) @@ -176,15 +176,15 @@ def get_model_arch(model_path: str): def search_nested_config(config, key): """Recursively searches for the value associated with the given key in a nested configuration of a model.""" - if isinstance(config, Dict): + if isinstance(config, dict): for k, v in config.items(): if k == key: return v - if isinstance(v, (Dict, List)): + if isinstance(v, (dict, list)): result = search_nested_config(v, key) if result is not None: return result - elif isinstance(config, List): + elif isinstance(config, list): for item in config: result = search_nested_config(item, key) if result is not None: diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py index 76adaca501..e0702aeeaf 100644 --- a/lmdeploy/cli/cli.py +++ b/lmdeploy/cli/cli.py @@ -3,11 +3,16 @@ import os from ..version import __version__ -from .utils import (ArgumentHelper, DefaultsAndTypesHelpFormatter, FlexibleArgumentParser, convert_args, - get_speculative_config) +from .utils import ( + ArgumentHelper, + DefaultsAndTypesHelpFormatter, + FlexibleArgumentParser, + convert_args, + get_speculative_config, +) -class CLI(object): +class CLI: _desc = 'The CLI provides a unified API for converting, ' \ 'compressing and deploying large language models.' parser = FlexibleArgumentParser(prog='lmdeploy', description=_desc, add_help=True) @@ -124,8 +129,7 @@ def get_gpu_topo(): if sys.platform.startswith('linux'): try: res = subprocess.run(['nvidia-smi', 'topo', '-m'], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + capture_output=True, text=True, check=True) if res.returncode == 0: diff --git a/lmdeploy/cli/lite.py b/lmdeploy/cli/lite.py index 768ef47544..0143453aab 100644 --- a/lmdeploy/cli/lite.py +++ b/lmdeploy/cli/lite.py @@ -3,7 +3,7 @@ from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args -class SubCliLite(object): +class SubCliLite: """CLI for compressing LLMs.""" _help = 'Compressing and accelerating LLMs with lmdeploy.lite module' _desc = _help diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py index 3488281f42..155392f4a7 100644 --- a/lmdeploy/cli/serve.py +++ b/lmdeploy/cli/serve.py @@ -3,8 +3,14 @@ from lmdeploy.utils import get_max_batch_size from .cli import CLI -from .utils import (ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args, get_chat_template, get_lora_adapters, - get_speculative_config) +from .utils import ( + ArgumentHelper, + DefaultsAndTypesHelpFormatter, + convert_args, + get_chat_template, + get_lora_adapters, + get_speculative_config, +) class SubCliServe: diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py index 2ccc2c6f2a..6dfeb61aa3 100644 --- a/lmdeploy/cli/utils.py +++ b/lmdeploy/cli/utils.py @@ -5,7 +5,7 @@ import re import sys from collections import defaultdict -from typing import Any, List +from typing import Any from lmdeploy.utils import get_logger @@ -39,14 +39,14 @@ def convert_args(args): return kwargs -def get_lora_adapters(adapters: List[str]): +def get_lora_adapters(adapters: list[str]): """Parse lora adapers from cli input. Args: - adapters (List[str]): CLI input string of lora adapter path(s). + adapters (list[str]): CLI input string of lora adapter path(s). Returns: - Dict[str,str] or None: Parsed lora adapter path(s). + dict[str, str] | None: Parsed lora adapter path(s). """ if not adapters: return None @@ -435,7 +435,7 @@ def calib_search_scale(parser): ) @staticmethod - def device(parser, default: str = 'cuda', choices: List[str] = ['cuda', 'ascend', 'maca', 'camb']): + def device(parser, default: str = 'cuda', choices: list[str] = ['cuda', 'ascend', 'maca', 'camb']): """Add argument device to parser.""" return parser.add_argument('--device', diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py index e8dd1fca23..b5553df10b 100644 --- a/lmdeploy/lite/apis/calibrate.py +++ b/lmdeploy/lite/apis/calibrate.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from pathlib import Path -from typing import Literal, Union +from typing import Literal import torch from torch import nn @@ -76,7 +76,7 @@ def _prepare_for_calibrate(model: nn.Module, - layer_type: Union[str, type], + layer_type: str | type, head_name: str = 'lm_head', device: str = 'cuda', prefix: str = '') -> None: @@ -95,7 +95,7 @@ def _prepare_for_calibrate(model: nn.Module, ---------- model : nn.Module The PyTorch model to prepare for calibration. - layer_type : Union[str, Type] + layer_type : str | type The type of the layer to be moved to CPU. Can be either a string of class name or the class type itself. head_name : str, optional diff --git a/lmdeploy/lite/apis/get_small_sharded_hf.py b/lmdeploy/lite/apis/get_small_sharded_hf.py index 7c5ce8eba3..2d1bebaac1 100644 --- a/lmdeploy/lite/apis/get_small_sharded_hf.py +++ b/lmdeploy/lite/apis/get_small_sharded_hf.py @@ -41,7 +41,7 @@ def main(): state_dict = torch.load(os.path.join(args.src_dir, ckpt), map_location='cuda', weights_only=True) keys = sorted(list(state_dict.keys())) for k in keys: - new_state_dict_name = 'pytorch_model-{:05d}-of-{:05d}.bin'.format(cnt, n_shard) + new_state_dict_name = f'pytorch_model-{cnt:05d}-of-{n_shard:05d}.bin' new_index['weight_map'][k] = new_state_dict_name new_state_dict = {k: state_dict[k]} torch.save(new_state_dict, os.path.join(args.dst_dir, new_state_dict_name)) diff --git a/lmdeploy/lite/quantization/awq.py b/lmdeploy/lite/quantization/awq.py index c1acafe601..8ee3990409 100644 --- a/lmdeploy/lite/quantization/awq.py +++ b/lmdeploy/lite/quantization/awq.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import List import torch @@ -151,7 +150,7 @@ def get_weight_scale(weight, q_group_size=-1): @torch.no_grad() def smooth_ln_fcs(ln: torch.nn.Module, - fcs: List[torch.nn.Module], + fcs: list[torch.nn.Module], act_scales: torch.Tensor, group_size: int = -1, alpha: float = 0.5) -> torch.Tensor: @@ -204,7 +203,7 @@ def smooth_ln_fcs(ln: torch.nn.Module, @torch.no_grad() def smooth_fc_fcs(pre_fc: torch.nn.Module, - fcs: List[torch.nn.Module], + fcs: list[torch.nn.Module], act_scales: torch.Tensor, group_size: int = -1, alpha: float = 0.5) -> torch.Tensor: diff --git a/lmdeploy/lite/quantization/calibration.py b/lmdeploy/lite/quantization/calibration.py index de83f29d87..44575d9c7b 100644 --- a/lmdeploy/lite/quantization/calibration.py +++ b/lmdeploy/lite/quantization/calibration.py @@ -1,7 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from functools import partial -from typing import Union import torch from torch import nn @@ -9,11 +8,15 @@ from lmdeploy.lite.quantization.activation import ActivationObserver from lmdeploy.lite.quantization.awq import FC_FCS_MAP, NORM_FCS_MAP -from lmdeploy.lite.utils import (bimap_name_mod, collect_target_modules, concat_decoder_layer_outputs, - split_decoder_layer_inputs) +from lmdeploy.lite.utils import ( + bimap_name_mod, + collect_target_modules, + concat_decoder_layer_outputs, + split_decoder_layer_inputs, +) -class CalibrationContext(): +class CalibrationContext: """Calibration context manager for model quantization. Parameters: @@ -30,8 +33,8 @@ class CalibrationContext(): def __init__(self, model: nn.Module, tokenizer: PreTrainedTokenizer, - layer_type: Union[str, type], - norm_type: Union[str, type], + layer_type: str | type, + norm_type: str | type, batch_size: int = 1, device: str = 'cuda', **kwargs) -> None: @@ -40,8 +43,8 @@ def __init__(self, Args: model (nn.Module): Model to be calibrated. tokenizer (PreTrainedTokenizer): Tokenizer of the given model. - layer_type (Union[str, type]): Type of the layers to be observed. - norm_type (Union[str, type]): Norm type used in the model. + layer_type (str | type): Type of the layers to be observed. + norm_type (str | type): Norm type used in the model. batch_size (int): The batch size for running the calib samples. Low GPU mem requires small batch_size. Large batch_size reduces the calibration time while costs more VRAM. @@ -201,7 +204,7 @@ def export(self, out_dir): to specified directory. Args: - out_dir (Union[str, Path]): The directory path where the stats + out_dir (str | Path): The directory path where the stats will be saved. """ @@ -339,8 +342,8 @@ class CalibrationContextV2(CalibrationContext): def __init__(self, model: nn.Module, tokenizer: PreTrainedTokenizer, - layer_type: Union[str, type], - norm_type: Union[str, type], + layer_type: str | type, + norm_type: str | type, batch_size: int = 1, device: str = 'cuda', search_scale: bool = True, @@ -374,7 +377,7 @@ def export(self, out_dir): to specified directory. Args: - out_dir (Union[str, Path]): The directory path where the stats + out_dir (str | Path): The directory path where the stats will be saved. """ inputs_stats = { diff --git a/lmdeploy/lite/quantization/modules/linear.py b/lmdeploy/lite/quantization/modules/linear.py index 854d4cc51f..8041020201 100644 --- a/lmdeploy/lite/quantization/modules/linear.py +++ b/lmdeploy/lite/quantization/modules/linear.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional, Type, TypeVar +from typing import TypeVar import torch from torch import nn @@ -22,14 +22,14 @@ class WeightOnlyQLinear(nn.Module): group_size (int): size of the quantization group. in_features (int): size of each input sample. out_features (int): size of each output sample. - bias (Tensor, optional): Defaults to None. + bias (bool): Defaults to True. """ def __init__( self, in_features: int, out_features: int, - bias: Optional[torch.Tensor] = True, + bias: bool = True, w_bit: int = 4, symmetry: bool = False, group_size: int = 128, @@ -71,11 +71,11 @@ def __init__( self.qzeros = None @classmethod - def from_linear(cls: Type['WeightOnlyQLinear'], + def from_linear(cls: type['WeightOnlyQLinear'], linear: nn.Linear, quantizer: TypeVar('Quantizer'), awq_layout: bool = True, - qparams: Optional[QParams] = None) -> 'WeightOnlyQLinear': + qparams: QParams | None = None) -> 'WeightOnlyQLinear': """Create a WeightOnlyQLinear object from a PyTorch Linear object. Args: diff --git a/lmdeploy/lite/quantization/weight/quant_utils.py b/lmdeploy/lite/quantization/weight/quant_utils.py index 934a569578..1d873b6ed3 100644 --- a/lmdeploy/lite/quantization/weight/quant_utils.py +++ b/lmdeploy/lite/quantization/weight/quant_utils.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional, Sequence, Union +from collections.abc import Sequence import torch @@ -29,8 +29,8 @@ def fast_round_scale_torch(amax: torch.Tensor, fp8_max: torch.Tensor) -> torch.T def _get_quant_scaling(weight: torch.Tensor, fp8_dtype: torch.dtype, - dim: Union[int, Sequence[int]], - scale_fmt: Optional[str] = None): + dim: int | Sequence[int], + scale_fmt: str | None = None): """Get the scaling factor for FP8 quantization.""" finfo = torch.finfo(fp8_dtype) fmax = finfo.max @@ -47,7 +47,7 @@ def _get_quant_scaling(weight: torch.Tensor, def quant_blocked_fp8(weight: torch.Tensor, fp8_dtype: torch.dtype, block_size: int = 128, - scale_fmt: Optional[str] = None): + scale_fmt: str | None = None): """Quantize the weight tensor to blocked FP8 format.""" assert scale_fmt in (None, 'ue8m0'), f'Unsupported scale_fmt: {scale_fmt}' diff --git a/lmdeploy/lite/quantization/weight/quantizer.py b/lmdeploy/lite/quantization/weight/quantizer.py index 0e492ad413..2bbc3fd122 100644 --- a/lmdeploy/lite/quantization/weight/quantizer.py +++ b/lmdeploy/lite/quantization/weight/quantizer.py @@ -1,12 +1,19 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Callable, Dict, Optional +from collections.abc import Callable import torch -from lmdeploy.lite.utils import (QParams, cal_qparams_per_channel_absmax, cal_qparams_per_channel_minmax, - cal_qparams_per_group_absmax, cal_qparams_per_group_minmax, - cal_qparams_per_tensor_absmax, cal_qparams_per_tensor_minmax, precise_round) +from lmdeploy.lite.utils import ( + QParams, + cal_qparams_per_channel_absmax, + cal_qparams_per_channel_minmax, + cal_qparams_per_group_absmax, + cal_qparams_per_group_minmax, + cal_qparams_per_tensor_absmax, + cal_qparams_per_tensor_minmax, + precise_round, +) from lmdeploy.lite.utils.global_avail import GlobalAvailMixin @@ -24,7 +31,7 @@ class WeightQuantizer(GlobalAvailMixin): use min-max scaling. granularity (str): The granularity of quantization. Available options are 'per_channel', 'per_tensor', and 'per_group'. - group_size (Optional[int]): If using 'per_group' quantization, this is + group_size (int | None): If using 'per_group' quantization, this is the number of channels in each group. Example: @@ -41,7 +48,7 @@ class WeightQuantizer(GlobalAvailMixin): quantized_weights = quantizer.fake_quant(weights, qparams) """ - CAL_FUNC_MAP: Dict[str, Dict[str, Callable]] = { + CAL_FUNC_MAP: dict[str, dict[str, Callable]] = { 'per_group': { 'absmax': cal_qparams_per_group_absmax, 'minmax': cal_qparams_per_group_minmax, @@ -56,7 +63,7 @@ class WeightQuantizer(GlobalAvailMixin): }, } - def __init__(self, bits: int, symmetry: bool, granularity: str, group_size: Optional[int] = -1): + def __init__(self, bits: int, symmetry: bool, granularity: str, group_size: int | None = -1): assert bits in [4, 8], "The 'bits' argument must be either 4 or 8." self.bits = bits @@ -95,13 +102,13 @@ def calculate_qparams(self, weight: torch.Tensor) -> QParams: else: return cal_func(weight, self.bits) - def quant(self, weight: torch.Tensor, qparams: Optional[QParams] = None, real: bool = False) -> torch.Tensor: + def quant(self, weight: torch.Tensor, qparams: QParams | None = None, real: bool = False) -> torch.Tensor: """Perform fake quantization on the given weight tensor. Args: weight (torch.Tensor): The weight tensor with shape (out_features, in_features). - qparams (Optional[QParams]): A namedtuple containing 'scales' + qparams (QParams | None): A namedtuple containing 'scales' and 'zero_points'. real (bool): If True, return the tensor with quantized type. diff --git a/lmdeploy/lite/utils/__init__.py b/lmdeploy/lite/utils/__init__.py index 846964fb22..d801a1f447 100644 --- a/lmdeploy/lite/utils/__init__.py +++ b/lmdeploy/lite/utils/__init__.py @@ -1,9 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. from .batch_split import concat_decoder_layer_outputs, split_decoder_layer_inputs -from .cal_qparams import (QParams, cal_qparams_per_channel_absmax, cal_qparams_per_channel_minmax, - cal_qparams_per_group_absmax, cal_qparams_per_group_minmax, cal_qparams_per_tensor_absmax, - cal_qparams_per_tensor_minmax, precise_round) +from .cal_qparams import ( + QParams, + cal_qparams_per_channel_absmax, + cal_qparams_per_channel_minmax, + cal_qparams_per_group_absmax, + cal_qparams_per_group_minmax, + cal_qparams_per_tensor_absmax, + cal_qparams_per_tensor_minmax, + precise_round, +) from .calib_dataloader import get_calib_loaders from .collect import bimap_name_mod, collect_target_modules, collect_target_weights from .global_avail import GlobalAvailMixin diff --git a/lmdeploy/lite/utils/batch_split.py b/lmdeploy/lite/utils/batch_split.py index 5390a7f7d9..f06efaee62 100644 --- a/lmdeploy/lite/utils/batch_split.py +++ b/lmdeploy/lite/utils/batch_split.py @@ -1,22 +1,22 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, List, Tuple, Union +from typing import Any import torch -def split_decoder_layer_inputs(batch_size, *args: Union[torch.Tensor, Any], - **kwargs: Union[torch.Tensor, Any]) -> Tuple[List[List[Any]], List[Dict[str, Any]]]: +def split_decoder_layer_inputs(batch_size, *args: torch.Tensor | Any, + **kwargs: torch.Tensor | Any) -> tuple[list[list[Any]], list[dict[str, Any]]]: """This function splits batched decoder layer inputs into individual elements. Args: - *args (Union[torch.Tensor, Any]): Positional arguments which could + *args (torch.Tensor | Any): Positional arguments which could be a mix of tensors and other types. - **kwargs (Union[torch.Tensor, Any]): Keyword arguments which could + **kwargs (torch.Tensor | Any): Keyword arguments which could be a mix of tensors and other types. Returns: - Tuple[List[List[Any]], List[Dict[str, Any]]]: A tuple containing two + tuple[list[list[Any]], list[dict[str, Any]]]: A tuple containing two lists, one for positional arguments, one for keyword arguments. Each list contains individual elements from the batch. """ @@ -46,7 +46,7 @@ def split_decoder_layer_inputs(batch_size, *args: Union[torch.Tensor, Any], new_kwargs[name] = val[i:i + batch_size] elif isinstance(val, torch.Tensor) and len(val.shape) > 1 and val.size(1) == bs: # qwen2-vl new_kwargs[name] = val[:, i:i + batch_size] - elif name == 'position_embeddings' and isinstance(val, Tuple) and len( + elif name == 'position_embeddings' and isinstance(val, tuple) and len( val[0].shape) > 1 and val[0].size(1) == bs: # qwen2-vl new_kwargs[name] = (val[0][:, i:i + batch_size], val[1][:, i:i + batch_size]) else: @@ -58,12 +58,12 @@ def split_decoder_layer_inputs(batch_size, *args: Union[torch.Tensor, Any], return batch_args, batch_kwargs -def concat_decoder_layer_outputs(batch_outputs: List[Any]) -> Any: +def concat_decoder_layer_outputs(batch_outputs: list[Any]) -> Any: """This function concatenates individual decoder layer outputs into a batched output. Args: - batch_outputs (List[Any]): A list, where each tuple + batch_outputs (list[Any]): A list, where each tuple represents the output from an individual element in the batch. Returns: diff --git a/lmdeploy/lite/utils/cal_qparams.py b/lmdeploy/lite/utils/cal_qparams.py index 38a21b8dd0..33326e13a1 100644 --- a/lmdeploy/lite/utils/cal_qparams.py +++ b/lmdeploy/lite/utils/cal_qparams.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import NamedTuple, Optional +from typing import NamedTuple import torch @@ -8,7 +8,7 @@ class QParams(NamedTuple): """A class to hold the quantization parameters.""" scales: torch.Tensor - zero_points: Optional[torch.Tensor] + zero_points: torch.Tensor | None @torch.no_grad() diff --git a/lmdeploy/lite/utils/collect.py b/lmdeploy/lite/utils/collect.py index 3351bfb5a0..8d421049c1 100644 --- a/lmdeploy/lite/utils/collect.py +++ b/lmdeploy/lite/utils/collect.py @@ -1,13 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, List, Tuple, Union from torch import nn def collect_target_modules(model: nn.Module, - target: Union[str, type], - skip_names: List[str] = [], - prefix: str = '') -> Dict[str, nn.Module]: + target: str | type, + skip_names: list[str] = [], + prefix: str = '') -> dict[str, nn.Module]: """Collects the specific target modules from the model. Args: @@ -38,7 +37,7 @@ def _is_target(n, m): return name2mod -def collect_target_weights(model: nn.Module, target: Union[str, type], skip_names: List[str]) -> Dict[str, nn.Module]: +def collect_target_weights(model: nn.Module, target: str | type, skip_names: list[str]) -> dict[str, nn.Module]: """Collects weights of the specific target modules from the model. Args: @@ -61,7 +60,7 @@ def collect_target_weights(model: nn.Module, target: Union[str, type], skip_name return mod2weight -def bimap_name_mod(name2mod_mappings: List[Dict[str, nn.Module]]) -> Tuple[Dict[str, nn.Module], Dict[nn.Module, str]]: +def bimap_name_mod(name2mod_mappings: list[dict[str, nn.Module]]) -> tuple[dict[str, nn.Module], dict[nn.Module, str]]: """Generates bidirectional maps from module names to module instances and vice versa. diff --git a/lmdeploy/lite/utils/global_avail.py b/lmdeploy/lite/utils/global_avail.py index 3b608afa23..462125a676 100644 --- a/lmdeploy/lite/utils/global_avail.py +++ b/lmdeploy/lite/utils/global_avail.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, Union +from typing import Union from torch import nn @@ -7,13 +7,13 @@ class GlobalAvailMixin: """Mixin class to make instances globally available.""" - _instances: Dict[str, Dict[Union[str, nn.Module], 'GlobalAvailMixin']] = {'default': {}} + _instances: dict[str, dict[str | nn.Module, 'GlobalAvailMixin']] = {'default': {}} - def global_available(self, key: Union[str, nn.Module] = 'default', group: str = 'default') -> None: + def global_available(self, key: str | nn.Module = 'default', group: str = 'default') -> None: """Make the instance globally available. Args: - key (Union[str, nn.Module], optional): Key to save the instance. + key (str | nn.Module, optional): Key to save the instance. Defaults to 'default'. group (str, optional): Group to save the instance. Defaults to 'default'. @@ -23,13 +23,13 @@ def global_available(self, key: Union[str, nn.Module] = 'default', group: str = @classmethod def _save_instance(cls, instance: 'GlobalAvailMixin', - key: Union[str, nn.Module] = 'default', + key: str | nn.Module = 'default', group: str = 'default') -> None: """Save the instance. Args: instance (GlobalAvailMixin): Instance to save. - key (Union[str, nn.Module], optional): Key to save the instance. + key (str | nn.Module, optional): Key to save the instance. Defaults to 'default'. group (str, optional): Group to save the instance. Defaults to 'default'. @@ -41,35 +41,35 @@ def _save_instance(cls, cls._instances[group][key] = instance @classmethod - def find(cls, key: Union[str, nn.Module] = 'default', group: str = 'default') -> Union[None, 'GlobalAvailMixin']: + def find(cls, key: str | nn.Module = 'default', group: str = 'default') -> Union[None, 'GlobalAvailMixin']: """Find an instance by its key and group. Args: - key (Union[str, nn.Module], optional): Key of the instance. + key (str | nn.Module, optional): Key of the instance. Defaults to 'default'. group (str, optional): Group of the instance. Defaults to 'default'. Returns: - Union[None, GlobalAvailMixin]: The found instance, or None if + None | GlobalAvailMixin: The found instance, or None if it does not exist. """ return cls._instances.get(group, {}).get(key) @classmethod - def find_group(cls, group: str) -> Dict[Union[str, nn.Module], 'GlobalAvailMixin']: + def find_group(cls, group: str) -> dict[str | nn.Module, 'GlobalAvailMixin']: """Find all instances in a group. Args: group (str): Group of the instances. Returns: - Dict[Union[str, nn.Module], GlobalAvailMixin]: All instances in + dict[str | nn.Module, GlobalAvailMixin]: All instances in the group. """ return cls._instances.get(group, {}) @classmethod - def instances(cls) -> Dict[str, Dict[Union[str, nn.Module], 'GlobalAvailMixin']]: + def instances(cls) -> dict[str, dict[str | nn.Module, 'GlobalAvailMixin']]: """Get all instances.""" return cls._instances diff --git a/lmdeploy/lite/utils/memory_efficient.py b/lmdeploy/lite/utils/memory_efficient.py index ae201f82e2..5c6431bcc4 100644 --- a/lmdeploy/lite/utils/memory_efficient.py +++ b/lmdeploy/lite/utils/memory_efficient.py @@ -4,7 +4,6 @@ import warnings from contextlib import contextmanager from functools import partial -from typing import List import torch from torch import nn @@ -12,7 +11,7 @@ from lmdeploy.lite.defaults import KV_CACHE_SIGNATURE, OFFLOAD_MOD -def extract_return_values(module: nn.Module) -> List[str]: +def extract_return_values(module: nn.Module) -> list[str]: """Extracts return values from given module's forward method. Args: @@ -43,7 +42,7 @@ def find_kv_cache_idx(module: nn.Module) -> int: return signatures.index(KV_CACHE_SIGNATURE) -def find_modules_by_return_value(model: nn.Module, value: str) -> List[nn.Module]: +def find_modules_by_return_value(model: nn.Module, value: str) -> list[nn.Module]: """Finds modules in model that return given value. Args: diff --git a/lmdeploy/logger.py b/lmdeploy/logger.py index b52e586590..d758a69faa 100644 --- a/lmdeploy/logger.py +++ b/lmdeploy/logger.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. # modify from https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/logger.py # noqa -from typing import List, Optional from .messages import GenerationConfig from .utils import get_logger @@ -13,11 +12,11 @@ class RequestLogger: exceed a specified maximum length. Args: - max_log_len (Optional[int]): The maximum length of the log entries. + max_log_len (int | None): The maximum length of the log entries. If None, no maximum length is enforced. """ - def __init__(self, max_log_len: Optional[int]) -> None: + def __init__(self, max_log_len: int | None) -> None: self.max_log_len = max_log_len def log_prompt(self, session_id: int, prompt: str) -> None: @@ -31,7 +30,7 @@ def log_prompt(self, session_id: int, prompt: str) -> None: logger.info(f'session={session_id}, ' f'prompt={prompt!r}') - def log_inputs(self, session_id: int, prompt: Optional[str], prompt_token_ids: Optional[List[int]], + def log_inputs(self, session_id: int, prompt: str | None, prompt_token_ids: list[int] | None, gen_config: GenerationConfig, adapter_name: str) -> None: max_log_len = self.max_log_len input_tokens = len(prompt_token_ids) diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index b029d98c26..6dd34c8a6b 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -1,8 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. import enum import time +from collections.abc import Callable from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Literal, Optional +from typing import Any, Literal import torch from pydantic.dataclasses import dataclass as pydantic_dataclass @@ -50,10 +51,10 @@ class GenerationConfig: random_seed: Seed used when sampling a token stop_words: Words that stop generating further tokens bad_words: Words that the engine will never generate - stop_token_ids: List of tokens that stop the generation + stop_token_ids: list of tokens that stop the generation when they are generated. The returned output will not contain the stop tokens. - bad_token_ids: List of tokens that the engine will never + bad_token_ids: list of tokens that the engine will never generate. min_new_tokens: The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt. @@ -109,16 +110,16 @@ class GenerationConfig: repetition_penalty: float = 1.0 ignore_eos: bool = False random_seed: int = None - stop_words: List[str] = None - bad_words: List[str] = None - stop_token_ids: List[int] = None - bad_token_ids: List[int] = None + stop_words: list[str] = None + bad_words: list[str] = None + stop_token_ids: list[int] = None + bad_token_ids: list[int] = None min_new_tokens: int = None skip_special_tokens: bool = True spaces_between_special_tokens: bool = True logprobs: int = None - response_format: Optional[Dict] = None - logits_processors: Optional[List[LogitsProcessor]] = None + response_format: dict | None = None + logits_processors: list[LogitsProcessor] | None = None output_logits: Literal['all', 'generation'] = None output_last_hidden_state: Literal['all', 'generation'] = None include_stop_str_in_output: bool = False @@ -126,7 +127,7 @@ class GenerationConfig: # for disaggregation with_cache: bool = False preserve_cache: bool = False - migration_request: Optional[MigrationRequest] = None + migration_request: MigrationRequest | None = None # router replay return_routed_experts: bool = False @@ -141,7 +142,7 @@ def convert_stop_bad_words_to_ids(self, tokenizer: Tokenizer): def special_word_token_ids(words): if words is not None: - assert isinstance(words, List) and \ + assert isinstance(words, list) and \ all(isinstance(elem, str) for elem in words), \ f'stop_words must be a list of str but got {type(words)}' indexes = [] @@ -178,7 +179,7 @@ def update_from_hf_gen_cfg(self, generation_config, tokenizer_eos_token_id): def __post_init__(self): """Check input validation.""" - assert type(self.n) == int and self.n > 0, 'n is not a positive integer' + assert type(self.n) is int and self.n > 0, 'n is not a positive integer' assert self.top_p >= 0 and self.top_p <= 1 # [0, 1] assert self.top_k >= 0, 'top_k can not be a negative integer' assert self.temperature >= 0 and self.temperature <= 2 # [0,2] @@ -251,7 +252,7 @@ class TurbomindEngineConfig: """ dtype: str = 'auto' - model_format: Optional[str] = None + model_format: str | None = None tp: int = 1 dp: int = 1 cp: int = 1 @@ -264,9 +265,9 @@ class TurbomindEngineConfig: outer_dp_size: int = None nnodes: int = 1 node_rank: int = 0 - dist_init_addr: Optional[str] = None - devices: List[int] = None - session_len: Optional[int] = None + dist_init_addr: str | None = None + devices: list[int] = None + session_len: int | None = None max_batch_size: int = None cache_max_entry_count: float = 0.8 cache_chunk_size: int = -1 @@ -275,16 +276,16 @@ class TurbomindEngineConfig: quant_policy: int = 0 rope_scaling_factor: float = 0.0 use_logn_attn: bool = False - download_dir: Optional[str] = None - revision: Optional[str] = None + download_dir: str | None = None + revision: str | None = None max_prefill_token_num: int = 8192 num_tokens_per_iter: int = 0 max_prefill_iters: int = 1 async_: int = 1 - devices: Optional[List[int]] = None + devices: list[int] | None = None empty_init: bool = False communicator: str = 'nccl' - hf_overrides: Optional[Dict[str, Any]] = None + hf_overrides: dict[str, Any] | None = None enable_metrics: bool = True def __post_init__(self): @@ -388,13 +389,13 @@ class PytorchEngineConfig: block_size: int = 64 num_cpu_blocks: int = 0 num_gpu_blocks: int = 0 - adapters: Dict[str, str] = None + adapters: dict[str, str] = None max_prefill_token_num: int = 4096 thread_safe: bool = False enable_prefix_caching: bool = False device_type: str = 'cuda' eager_mode: bool = False - custom_module_map: Dict[str, str] = None + custom_module_map: dict[str, str] = None download_dir: str = None revision: str = None quant_policy: Literal[0, 4, 8] = 0 @@ -406,7 +407,7 @@ class PytorchEngineConfig: mp_engine_backend: str = 'mp' model_format: str = None enable_metrics: bool = True - hf_overrides: Optional[Dict[str, Any]] = None + hf_overrides: dict[str, Any] | None = None disable_vision_encoder: bool = False logprobs_mode: str = None # router replay @@ -474,23 +475,20 @@ class Response: generate_token_len: the response token length. input_token_len: the input prompt token length. Note that it may contains chat template part. - session_id: the id for running the session. finish_reason: the reason the model stopped generating tokens. This will be 'stop' if the model hit a natural stop point or a provided stop sequence, 'length' if the maximum number of tokens specified in the request was reached. - token_ids:: the output token ids. - logprobs:: the top logprobs for each output - position. - index: it refers to the position index of the input request - batch + token_ids: the output token ids. + logprobs: the top logprobs for each output position. + index: it refers to the position index of the input request batch. """ text: str generate_token_len: int input_token_len: int - finish_reason: Optional[Literal['stop', 'length']] = None - token_ids: List[int] = field(default_factory=list) - logprobs: List[Dict[int, float]] = None + finish_reason: Literal['stop', 'length'] | None = None + token_ids: list[int] = field(default_factory=list) + logprobs: list[dict[int, float]] = None logits: torch.Tensor = None last_hidden_state: torch.Tensor = None index: int = 0 @@ -511,7 +509,7 @@ def _format_none_text_fields(self): fields.append(f'logprobs={self.logprobs}') # Helper function to format tensor information - def _format_tensor(name: str, tensor: Optional[torch.Tensor]) -> List[str]: + def _format_tensor(name: str, tensor: torch.Tensor | None) -> list[str]: if tensor is None: return [f'{name}=None'] try: @@ -580,7 +578,7 @@ class EngineEvent: timestamp: float @classmethod - def new_event(cls, event_type: EventType, timestamp: Optional[float] = None) -> 'EngineEvent': + def new_event(cls, event_type: EventType, timestamp: float | None = None) -> 'EngineEvent': # Timestamps MUST use wall-clock time (time.time()) to maintain consistency # between csrc(std::chrono::system_clock) and python timestamp = time.time() if timestamp is None else timestamp @@ -604,11 +602,11 @@ class RequestMetrics: Attributes: token_timestamp: A wall-clock time when a token is generated. - engine_events: List of engine events during inference. + engine_events: list of engine events during inference. """ token_timestamp: float = 0.0 - engine_events: List[EngineEvent] = field(default_factory=list) - spec_info: Optional[Dict[str, Any]] = None + engine_events: list[EngineEvent] = field(default_factory=list) + spec_info: dict[str, Any] | None = None @dataclass @@ -625,12 +623,12 @@ class EngineOutput: req_metrics: request metrics information """ status: ResponseType - token_ids: List[int] - logprobs: List[Dict[int, float]] = None + token_ids: list[int] + logprobs: list[dict[int, float]] = None logits: torch.Tensor = None last_hidden_state: torch.Tensor = None - cache_block_ids: Optional[List[int]] = None - req_metrics: Optional[RequestMetrics] = None + cache_block_ids: list[int] | None = None + req_metrics: RequestMetrics | None = None routed_experts: torch.Tensor = None diff --git a/lmdeploy/metrics/loggers.py b/lmdeploy/metrics/loggers.py index ac63cab1c6..29411c7cd5 100644 --- a/lmdeploy/metrics/loggers.py +++ b/lmdeploy/metrics/loggers.py @@ -4,7 +4,6 @@ import time from abc import ABC, abstractmethod from datetime import datetime -from typing import List import numpy as np @@ -346,11 +345,11 @@ def record_specdecode(self, stats: SpeculativeDecodingStats) -> None: pass -def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]: +def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]: """Builds a list of buckets with increasing powers of 10 multiplied by mantissa values until the value exceeds the specified maximum.""" exponent = 0 - buckets: List[int] = [] + buckets: list[int] = [] while True: for m in mantissa_lst: value = m * 10**exponent @@ -361,7 +360,7 @@ def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]: exponent += 1 -def build_1_2_5_buckets(max_value: int) -> List[int]: +def build_1_2_5_buckets(max_value: int) -> list[int]: """ Example: >>> build_1_2_5_buckets(100) diff --git a/lmdeploy/metrics/metrics_processor.py b/lmdeploy/metrics/metrics_processor.py index dd8eaeb0c6..9059ca4a1d 100644 --- a/lmdeploy/metrics/metrics_processor.py +++ b/lmdeploy/metrics/metrics_processor.py @@ -11,7 +11,7 @@ @singleton -class MetricsProcessor(): +class MetricsProcessor: """Metrics processor.""" def __init__(self): diff --git a/lmdeploy/metrics/stats.py b/lmdeploy/metrics/stats.py index 2b9367a94d..bd98bb14a0 100644 --- a/lmdeploy/metrics/stats.py +++ b/lmdeploy/metrics/stats.py @@ -3,7 +3,6 @@ import time from dataclasses import dataclass -from typing import List, Optional import numpy as np @@ -108,7 +107,7 @@ def __repr__(self): f' latest_token_time={self.lastest_token_time:.6f},\n' ')') - def update_from_events(self, engine_events: List[EngineEvent]): + def update_from_events(self, engine_events: list[EngineEvent]): # avoid circular dependency from lmdeploy.messages import EventType @@ -174,9 +173,9 @@ def __init__(self): self.iteration_timestamp = time.time() self.new_generation_tokens = 0 self.prompt_tokens = 0 - self.ttft: Optional[float] = None - self.tpot: Optional[float] = None - self.itl: Optional[float] = None + self.ttft: float | None = None + self.tpot: float | None = None + self.itl: float | None = None def __repr__(self): return ('IterationStats(\n' diff --git a/lmdeploy/model.py b/lmdeploy/model.py index 981e4b80b1..dbd8939ecf 100644 --- a/lmdeploy/model.py +++ b/lmdeploy/model.py @@ -2,7 +2,7 @@ import dataclasses import json import uuid -from typing import List, Literal, Optional, Union +from typing import Literal from mmengine import Registry @@ -18,7 +18,7 @@ def random_uuid() -> str: return str(uuid.uuid4().hex) -def get_text(content: Union[str, List[dict]]): +def get_text(content: str | list[dict]): """Within the OpenAI API, the content field may be specified as either a string or a list of ChatCompletionContentPartTextParam (defined in openai). @@ -36,34 +36,37 @@ class ChatTemplateConfig: """Parameters for chat template. Args: - model_name (str): the name of the deployed model. Determine which chat template will be applied. - All the chat template names: `lmdeploy list` - system (str | None): begin of the system prompt - meta_instruction (str | None): system prompt - eosys (str | None): end of the system prompt - user (str | None): begin of the user prompt - eoh (str | None): end of the user prompt - assistant (str | None): begin of the assistant prompt - eoa (str | None): end of the assistant prompt - tool (str | None): begin of the tool prompt - eotool (str | None): end of the tool prompt - capability: ('completion' | 'infilling' | 'chat' | 'python') = None - """ # noqa: E501 + model_name: the name of the deployed model. Determine which chat template will be applied. + All the chat template names: ``lmdeploy list`` + system: begin of the system prompt. + meta_instruction: system prompt. + eosys: end of the system prompt. + user: begin of the user prompt. + eoh: end of the user prompt. + assistant: begin of the assistant prompt. + eoa: end of the assistant prompt. + tool: begin of the tool prompt. + eotool: end of the tool prompt. + capability: the capability of the model, one of + ``'completion'``, ``'infilling'``, ``'chat'``, ``'python'``. + Default to None. + stop_words: list of stop words. Default to None. + """ model_name: str - model_path: Optional[str] = None - system: Optional[str] = None - meta_instruction: Optional[str] = None - eosys: Optional[str] = None - user: Optional[str] = None - eoh: Optional[str] = None - assistant: Optional[str] = None - eoa: Optional[str] = None - tool: Optional[str] = None - eotool: Optional[str] = None - separator: Optional[str] = None - capability: Optional[Literal['completion', 'infilling', 'chat', 'python']] = None - stop_words: Optional[List[str]] = None + model_path: str | None = None + system: str | None = None + meta_instruction: str | None = None + eosys: str | None = None + user: str | None = None + eoh: str | None = None + assistant: str | None = None + eoa: str | None = None + tool: str | None = None + eotool: str | None = None + separator: str | None = None + capability: Literal['completion', 'infilling', 'chat', 'python'] | None = None + stop_words: list[str] | None = None @property def chat_template(self): @@ -91,12 +94,12 @@ def from_json(cls, file_or_string): """Construct a dataclass instance from a JSON file or JSON string.""" try: # Try to open the input_data as a file path - with open(file_or_string, 'r', encoding='utf-8') as file: + with open(file_or_string, encoding='utf-8') as file: json_data = file.read() except FileNotFoundError: # If it's not a file path, assume it's a JSON string json_data = file_or_string - except IOError: + except OSError: # If it's not a file path and not a valid JSON string, raise error raise ValueError('Invalid input. Must be a file path or a valid JSON string.') json_data = json.loads(json_data) @@ -169,7 +172,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs): chat template. Args: - messages (str | List): user's input prompt + messages (str | list): user's input prompt Returns: str: the concatenated prompt """ @@ -191,7 +194,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs): return ret @classmethod - def match(cls, model_path: str) -> Optional[str]: + def match(cls, model_path: str) -> str | None: """Return the model_name that was registered to MODELS. Args: @@ -225,7 +228,7 @@ def __init__(self, **kwargs) @classmethod - def match(cls, model_path: str) -> Optional[str]: + def match(cls, model_path: str) -> str | None: """Return the model_name that was registered to MODELS. Args: @@ -270,7 +273,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs): return super().messages2prompt(messages, sequence_start, **kwargs)[:-1] @classmethod - def match(cls, model_path: str) -> Optional[str]: + def match(cls, model_path: str) -> str | None: """Return the model_name that was registered to MODELS. Args: @@ -294,7 +297,7 @@ def __init__( super().__init__(meta_instruction=meta_instruction, **kwargs) @classmethod - def match(cls, model_path: str) -> Optional[str]: + def match(cls, model_path: str) -> str | None: """Return the model_name that was registered to MODELS. Args: @@ -339,7 +342,7 @@ def __init__( **kwargs) @classmethod - def match(cls, model_path: str) -> Optional[str]: + def match(cls, model_path: str) -> str | None: """Return the model_name that was registered to MODELS. Args: @@ -360,7 +363,7 @@ def __init__(self, user='', assistant='', **kwargs): super().__init__(user=user, assistant=assistant, **kwargs) @classmethod - def match(cls, model_path: str) -> Optional[str]: + def match(cls, model_path: str) -> str | None: """Return the model_name that was registered to MODELS. Args: @@ -398,7 +401,7 @@ def __init__( **kwargs) @classmethod - def match(cls, model_path: str) -> Optional[str]: + def match(cls, model_path: str) -> str | None: """Return the model_name that was registered to MODELS. Args: @@ -443,7 +446,7 @@ def _infill_prompt(self, prompt): return prompt @classmethod - def match(cls, model_path: str) -> Optional[str]: + def match(cls, model_path: str) -> str | None: """Return the model_name that was registered to MODELS. Args: @@ -494,7 +497,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs): return ret @classmethod - def match(cls, model_path: str) -> Optional[str]: + def match(cls, model_path: str) -> str | None: """Return the model_name that was registered to MODELS. Args: @@ -517,7 +520,7 @@ def __init__(self, user='[INST] ', eoh=' [/INST]', eoa='', **kwargs): super().__init__(user=user, eoh=eoh, eoa=eoa, **kwargs) @classmethod - def match(cls, model_path: str) -> Optional[str]: + def match(cls, model_path: str) -> str | None: """Return the model_name that was registered to MODELS. Args: @@ -548,7 +551,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs): return super().messages2prompt(messages, sequence_start, **kwargs)[:-1] @classmethod - def match(cls, model_path: str) -> Optional[str]: + def match(cls, model_path: str) -> str | None: """Return the model_name that was registered to MODELS. Args: @@ -590,7 +593,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs): return super().messages2prompt(messages, sequence_start, **kwargs)[:-1] @classmethod - def match(cls, model_path: str) -> Optional[str]: + def match(cls, model_path: str) -> str | None: """Return the model_name that was registered to MODELS. Args: @@ -629,7 +632,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs): return super().messages2prompt(messages, sequence_start, **kwargs)[:-1] @classmethod - def match(cls, model_path: str) -> Optional[str]: + def match(cls, model_path: str) -> str | None: """Return the model_name that was registered to MODELS. Args: @@ -664,7 +667,7 @@ def __init__(self, **kwargs) @classmethod - def match(cls, model_path: str) -> Optional[str]: + def match(cls, model_path: str) -> str | None: """Return the model_name that was registered to MODELS. Args: @@ -787,7 +790,7 @@ def _system_instruction(self): return None, None, [], self.tokenizer.bos_token or '' @classmethod - def match(cls, model_path: str) -> Optional[str]: + def match(cls, model_path: str) -> str | None: try: cls(model_path) except Exception: @@ -795,12 +798,12 @@ def match(cls, model_path: str) -> Optional[str]: return True -def get_chat_template(model_path: str, config: Optional[ChatTemplateConfig] = None) -> BaseChatTemplate: +def get_chat_template(model_path: str, config: ChatTemplateConfig | None = None) -> BaseChatTemplate: """Get the chat template for the model. Args: model_path (str): the model path. - config (Optional[ChatTemplateConfig]): the chat template config. + config (ChatTemplateConfig | None): the chat template config. Returns: BaseChatTemplate: the chat template. """ diff --git a/lmdeploy/pipeline.py b/lmdeploy/pipeline.py index ab1902e134..40238f5653 100644 --- a/lmdeploy/pipeline.py +++ b/lmdeploy/pipeline.py @@ -1,13 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. +from __future__ import annotations + import asyncio import atexit import concurrent.futures import os +from collections.abc import Iterator from contextlib import closing from functools import partial from queue import Queue from threading import Thread -from typing import TYPE_CHECKING, Dict, Iterator, List, Tuple +from typing import TYPE_CHECKING import torch import tqdm @@ -81,8 +84,8 @@ def __init__(self, self.async_engine.start_loop(self.internal_thread.loop, use_async_api=False) def infer(self, - prompts: List[str] | str | List[Dict] | List[List[Dict]] | Tuple | List[Tuple], - gen_config: GenerationConfig | List[GenerationConfig] | None = None, + prompts: list[str] | str | list[dict] | list[list[dict]] | tuple | list[tuple], + gen_config: GenerationConfig | list[GenerationConfig] | None = None, do_preprocess: bool = True, adapter_name: str | None = None, use_tqdm: bool = False, @@ -90,13 +93,16 @@ def infer(self, """Inference prompts. Args: - prompts: Prompts to inference. It can be a single prompt, a list of prompts, a list of tuples, or a tuple. - Tuple can be (prompt, image or [images]) or (image or [images], prompt). - gen_config(GenerationConfig | List[GenerationConfig] | None): Generation configuration(s). - do_preprocess(bool): Whether to pre-process messages. - adapter_name(str | None): Adapter name. - use_tqdm(bool): Whether to use progress bar. - **kwargs(dict): Additional keyword arguments. + prompts: Prompts for inference. It can be a single prompt, a list of prompts, a list of tuples, or a tuple. + tuple can be (prompt, image or [images]) or (image or [images], prompt). + gen_config: Generation configuration(s). + do_preprocess: Whether to pre-process messages. + adapter_name: Adapter name. + use_tqdm: Whether to use progress bar. + **kwargs: Additional keyword arguments. + + Returns: + Response | list[Response]: A single response or a list of responses. """ is_single = self._is_single(prompts) # format prompts to openai message format, which is a list of dicts @@ -126,9 +132,9 @@ def batch_infer(self, *args, **kwargs): return self.infer(*args, **kwargs) def stream_infer(self, - prompts: List[str] | str | List[Dict] | List[List[Dict]] | Tuple | List[Tuple], - sessions: 'Session' | List['Session'] | None = None, - gen_config: GenerationConfig | List[GenerationConfig] | None = None, + prompts: list[str] | str | list[dict] | list[list[dict]] | tuple | list[tuple], + sessions: Session | list[Session] | None = None, + gen_config: GenerationConfig | list[GenerationConfig] | None = None, do_preprocess: bool = True, adapter_name: str | None = None, stream_response: bool = True, @@ -136,20 +142,19 @@ def stream_infer(self, """Stream inference. Args: - prompts(List[str] | str | List[Dict] | List[List[Dict]] | Tuple | List[Tuple]): Prompts to inference. - It can be a single prompt, a list of prompts, a list of tuples, or a tuple. - Tuple can be (prompt, image or [images]) or (image or [images], prompt). - sessions(Session | List[Session] | None): Sessions. Each of which corresponds to a prompt. - gen_config(GenerationConfig | List[GenerationConfig] | None): Generation configuration(s). - do_preprocess(bool): Whether to pre-process messages. - adapter_name(str | None): Adapter name. - stream_response(bool): Whether to stream the response. If True, the generator will stream the response. + prompts: Prompts to inference. It can be a single prompt, a list of prompts, a list of tuples, or a tuple. + tuple can be (prompt, image or [images]) or (image or [images], prompt). + sessions: Sessions. Each of which corresponds to a prompt. + gen_config: Generation configuration(s). + do_preprocess: Whether to pre-process messages. + adapter_name: Adapter name. + stream_response: Whether to stream the response. If True, the generator will stream the response. Otherwise, the generator will run until finish and return the final response. This argument is introduced to support the streaming and non-streaming modes of Pipeline.chat. - **kwargs(dict): Additional keyword arguments. + **kwargs: Additional keyword arguments. Returns: - Generator: A generator that yields the output (i.e. instance of class `Response`) of the inference. + Iterator: A generator that yields the output (i.e. instance of class ``Response``) of the inference. """ prompts = MultimodalProcessor.format_prompts(prompts) requests = self._request_generator(prompts, @@ -167,22 +172,24 @@ def close(self): self.async_engine.close() def chat(self, - prompt: str | Tuple[str, 'Image' | List['Image']], + prompt: str | tuple[str, Image | list[Image]], session=None, gen_config: GenerationConfig | None = None, stream_response=False, adapter_name=None, - **kwargs) -> 'Session' | Iterator: + **kwargs) -> Session | Iterator: """Chat. Args: - prompt (str): prompt - session (Session): the chat session - gen_config (GenerationConfig | None): a instance of - GenerationConfig. Default to None. - stream_response (bool): whether to stream the response. - adapter_name (str): adapter name. - **kwargs (dict): additional keyword arguments. + prompt: prompt string or a tuple of (prompt, image or [images]). + session: the chat session. + gen_config: an instance of GenerationConfig. Default to None. + stream_response: whether to stream the response. + adapter_name: adapter name. + **kwargs: additional keyword arguments. + + Returns: + Session | Iterator: the updated session, or a streaming iterator if stream_response is True. """ if session is None: session = self.session_mgr.get() @@ -227,25 +234,26 @@ def _gen(): return session - def session(self) -> 'Session': + def session(self) -> Session: """Create a new session.""" return self.session_mgr.get() - def get_reward_score(self, input_ids: List) -> List[float]: + def get_reward_score(self, input_ids: list) -> list[float]: """Get reward score. Args: - input_ids(List): a list of token_id or a list of token_id list or token_id tensor - Return: - reward score in a list. If the input_ids is a list of token_id, the return value - is still a list with length 1. + input_ids: a list of token_id or a list of token_id list or token_id tensor. + + Returns: + list[float]: reward score in a list. If the input_ids is a list of token_id, + the return value is still a list with length 1. """ supported_reward_models = ['InternLM2ForRewardModel', 'Qwen2ForRewardModel'] arch = self.async_engine.arch if arch not in supported_reward_models: raise ValueError(f'{arch} is not in reward model list: {supported_reward_models}') - assert isinstance(input_ids, List) - assert all(isinstance(x, int) for x in input_ids) or all(isinstance(x, List) for x in input_ids) + assert isinstance(input_ids, list) + assert all(isinstance(x, int) for x in input_ids) or all(isinstance(x, list) for x in input_ids) # Make input_ids a list of token_id list input_ids = [input_ids] if isinstance(input_ids[0], int) else input_ids logits = self._run(coro=self.async_engine.async_get_logits(input_ids=input_ids)).result() @@ -253,17 +261,17 @@ def get_reward_score(self, input_ids: List) -> List[float]: scores = [x[-1].cpu().item() for x in logits] return scores - def get_ppl(self, input_ids: List[int] | List[List[int]]) -> List[float]: + def get_ppl(self, input_ids: list[int] | list[list[int]]) -> list[float]: """Get perplexity scores given a list of input tokens that have to be of the same length. Args: - input_ids (List[int] | List[List[int]]): the batch of input token ids + input_ids: the batch of input token ids. Returns: - List[float]: A list of perplexity scores. + list[float]: A list of perplexity scores. """ - assert isinstance(input_ids, List) + assert isinstance(input_ids, list) if isinstance(input_ids[0], int): input_ids = [input_ids] assert all(len(_) > 1 for _ in input_ids) @@ -304,8 +312,8 @@ def get_ppl(self, input_ids: List[int] | List[List[int]]) -> List[float]: return output def __call__(self, - prompts: List[str] | str | List[Dict] | List[List[Dict]], - gen_config: GenerationConfig | List[GenerationConfig] | None = None, + prompts: list[str] | str | list[dict] | list[list[dict]], + gen_config: GenerationConfig | list[GenerationConfig] | None = None, **kwargs): return self.infer(prompts, gen_config=gen_config, **kwargs) @@ -328,12 +336,12 @@ async def generate(self, *args, **kwargs): def _is_single(prompts): """Check if prompts is a single prompt.""" return (isinstance(prompts, str) or (isinstance(prompts, tuple) and len(prompts) == 2) - or (isinstance(prompts, list) and len(prompts) > 0 and isinstance(prompts[0], Dict))) + or (isinstance(prompts, list) and len(prompts) > 0 and isinstance(prompts[0], dict))) def _request_generator(self, - prompts: List[str] | str | List[Dict] | List[List[Dict]], - sessions: List['Session'] | 'Session' | None = None, - gen_config: GenerationConfig | List[GenerationConfig] | None = None, + prompts: list[str] | str | list[dict] | list[list[dict]], + sessions: list[Session] | Session | None = None, + gen_config: GenerationConfig | list[GenerationConfig] | None = None, **kwargs): """Generate requests.""" is_single = self._is_single(prompts) @@ -372,7 +380,7 @@ def _get_limiter(self): self.limiter = asyncio.Semaphore(self.backend_config.max_batch_size) return self.limiter - def _infer(self, requests: Iterator[Dict], multiplex: bool, pbar=None, loop=None) -> Iterator[Iterator[Response]]: + def _infer(self, requests: Iterator[dict], multiplex: bool, pbar=None, loop=None) -> Iterator[Iterator[Response]]: async def _sync_resp(g, que: Queue, idx: int, sem: asyncio.Semaphore): async for out in g: @@ -470,13 +478,13 @@ def _get_long_text_ppl(self, session, input_ids, max_input_len): return loss_sum / target_count def _get_ppl(self, - sessions: List['Session'], - input_ids: List[List[int]], + sessions: list[Session], + input_ids: list[list[int]], max_input_len: int, target_ids=None, sequence_start: bool = True, sequence_end: bool = True): - assert (isinstance(input_ids, List) and all(isinstance(_, List) for _ in input_ids)) + assert (isinstance(input_ids, list) and all(isinstance(_, list) for _ in input_ids)) assert target_ids is None or len(target_ids) == len(input_ids) assert len(sessions) == len(input_ids) diff --git a/lmdeploy/profiler.py b/lmdeploy/profiler.py index 51895d8ddf..d687d5a485 100644 --- a/lmdeploy/profiler.py +++ b/lmdeploy/profiler.py @@ -2,7 +2,6 @@ import csv import os import time -from typing import List import numpy as np @@ -30,8 +29,8 @@ def finish(self, status): class Profiler: - def __init__(self, stream_output: bool, percentages: List[int]): - self.sessions: List[Session] = [] + def __init__(self, stream_output: bool, percentages: list[int]): + self.sessions: list[Session] = [] self.stream_output = stream_output self.percentages = percentages @@ -47,11 +46,11 @@ def finish(self): self.elapsed_time = time.perf_counter() - self.t_start def compute_metrics(self): - self.ttfts: List[float] = [] - self.tpots: List[float] = [] - self.e2es: List[float] = [] - self.itls: List[float] = [] - self.tpts: List[int] = [] + self.ttfts: list[float] = [] + self.tpots: list[float] = [] + self.e2es: list[float] = [] + self.itls: list[float] = [] + self.tpts: list[int] = [] self.total_output = 0 self.total_input = 0 self.success = 0 @@ -103,7 +102,7 @@ def compute_metrics(self): self.rps = self.success / self.elapsed_time - def summarize(self, title: str, hyperparams: List = None, header=40, digits=10): + def summarize(self, title: str, hyperparams: list = None, header=40, digits=10): width = header + digits * (1 + len(self.percentages)) diff --git a/lmdeploy/pytorch/adapter/adapter.py b/lmdeploy/pytorch/adapter/adapter.py index 8eb102ba9a..ca93b5fc89 100644 --- a/lmdeploy/pytorch/adapter/adapter.py +++ b/lmdeploy/pytorch/adapter/adapter.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import re -from typing import Dict, Iterable, List, Tuple +from collections.abc import Iterable import torch from torch import nn @@ -70,7 +70,7 @@ def _get_reverse_pack_map(model: nn.Module): return reverse_map -def _get_key_map(reverse_map: Dict[str, str]): +def _get_key_map(reverse_map: dict[str, str]): """Get key map.""" key_map = dict() for name, pack_name in reverse_map.items(): @@ -81,7 +81,7 @@ def _get_key_map(reverse_map: Dict[str, str]): return key_map -def load_lora_weights(model: nn.Module, weights: Iterable[Tuple[str, torch.Tensor]], adapter_id: int): +def load_lora_weights(model: nn.Module, weights: Iterable[tuple[str, torch.Tensor]], adapter_id: int): """Load lora weights.""" from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight prefix_len = len('base_model.model.') @@ -111,7 +111,7 @@ def load_lora_weights(model: nn.Module, weights: Iterable[Tuple[str, torch.Tenso class AdapterManager: """Adapter manager.""" - def __init__(self, adapters: Dict[str, str]): + def __init__(self, adapters: dict[str, str]): if adapters is None: adapters = dict() @@ -122,7 +122,7 @@ def __init__(self, adapters: Dict[str, str]): adapter_id_map = dict(zip(adapter_names, range(len(adapter_names)))) self.adapter_id_map = adapter_id_map - def get_adapter_ids(self, names: List[str]): + def get_adapter_ids(self, names: list[str]): return [self.adapter_id_map[name] for name in names] def num_adapters(self): diff --git a/lmdeploy/pytorch/backends/awq_modules.py b/lmdeploy/pytorch/backends/awq_modules.py index 1a9815c423..02bdcb0069 100644 --- a/lmdeploy/pytorch/backends/awq_modules.py +++ b/lmdeploy/pytorch/backends/awq_modules.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABC, abstractmethod -from typing import Optional import torch @@ -12,7 +11,7 @@ def update_weights(self, qweight: torch.Tensor, scales: torch.Tensor, qzeros: torch.Tensor, - bias: Optional[torch.Tensor] = None): + bias: torch.Tensor | None = None): """Update weights.""" return qweight, scales, qzeros, bias @@ -20,9 +19,9 @@ def update_weights(self, def forward(self, x, weight: torch.Tensor, - bias: Optional[torch.Tensor] = None, + bias: torch.Tensor | None = None, all_reduce: bool = False, - group: Optional[torch.distributed.ProcessGroup] = None): + group: torch.distributed.ProcessGroup | None = None): """forward.""" raise NotImplementedError diff --git a/lmdeploy/pytorch/backends/base.py b/lmdeploy/pytorch/backends/base.py index 603448c6c8..590727ff53 100644 --- a/lmdeploy/pytorch/backends/base.py +++ b/lmdeploy/pytorch/backends/base.py @@ -3,7 +3,6 @@ # https://github.com/vllm-project/vllm/blob/main/vllm/attention/backends/abstract.py from abc import ABC, abstractmethod from enum import Enum, auto -from typing import Tuple import torch @@ -70,7 +69,7 @@ def get_k_block_shape( num_heads: int, head_size: int, dtype: torch.dtype, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: """Get block shape of k.""" raise NotImplementedError @@ -81,7 +80,7 @@ def get_v_block_shape( num_heads: int, head_size: int, dtype: torch.dtype, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: """Get block shape of v.""" raise NotImplementedError diff --git a/lmdeploy/pytorch/backends/blockedf8_modules.py b/lmdeploy/pytorch/backends/blockedf8_modules.py index 0d7a5e422a..dd3f360f80 100644 --- a/lmdeploy/pytorch/backends/blockedf8_modules.py +++ b/lmdeploy/pytorch/backends/blockedf8_modules.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABC, abstractmethod -from typing import List, Optional import torch import torch.distributed as dist @@ -10,13 +9,13 @@ class LinearBlockedF8Impl(ABC): """Linear BlockedF8 implementation api.""" def __init__(self): - self.scale_fmt: Optional[str] = None + self.scale_fmt: str | None = None - def update_weights(self, weight: torch.Tensor, scale: torch.Tensor, bias: Optional[torch.Tensor] = None): + def update_weights(self, weight: torch.Tensor, scale: torch.Tensor, bias: torch.Tensor | None = None): """Update weights.""" return weight, scale, bias - def set_scale_fmt(self, scale_fmt: Optional[str]): + def set_scale_fmt(self, scale_fmt: str | None): """Set scale fmt.""" self.scale_fmt = scale_fmt @@ -25,11 +24,11 @@ def forward(self, x, weight: torch.Tensor, scale: torch.Tensor, - bias: Optional[torch.Tensor] = None, + bias: torch.Tensor | None = None, all_reduce: bool = False, - group: Optional[dist.ProcessGroup] = None, + group: dist.ProcessGroup | None = None, rank: int = 0, - scatter_size: List[int] = None): + scatter_size: list[int] = None): """forward.""" raise NotImplementedError diff --git a/lmdeploy/pytorch/backends/cuda/attention/default.py b/lmdeploy/pytorch/backends/cuda/attention/default.py index aca4510e5b..26886b5ce5 100644 --- a/lmdeploy/pytorch/backends/cuda/attention/default.py +++ b/lmdeploy/pytorch/backends/cuda/attention/default.py @@ -98,8 +98,12 @@ def __init__( self.logit_softcapping = -1 if self.logit_softcapping <= 0.0 else self.logit_softcapping assert not (alibi and not causal) - from lmdeploy.pytorch.kernels.cuda import (fill_kv_cache, flash_attn_varlen_func, flash_attn_with_kvcache, - flatten_kv_cache) + from lmdeploy.pytorch.kernels.cuda import ( + fill_kv_cache, + flash_attn_varlen_func, + flash_attn_with_kvcache, + flatten_kv_cache, + ) self.fill_kv_cache = fill_kv_cache self.paged_attention_fwd = flash_attn_with_kvcache diff --git a/lmdeploy/pytorch/backends/cuda/attention/mla.py b/lmdeploy/pytorch/backends/cuda/attention/mla.py index 624e8f169f..6381e843ef 100644 --- a/lmdeploy/pytorch/backends/cuda/attention/mla.py +++ b/lmdeploy/pytorch/backends/cuda/attention/mla.py @@ -74,7 +74,7 @@ def update_prefill(self, nsa_indices: torch.Tensor, q_seqlens: torch.Tensor, cu_ return self._update_prefill_func(nsa_indices, q_seqlens, cu_seqlens_k) @staticmethod - @functools.lru_cache(maxsize=None) + @functools.cache def build(): return NSAIndicesUpdater() diff --git a/lmdeploy/pytorch/backends/cuda/awq_modules.py b/lmdeploy/pytorch/backends/cuda/awq_modules.py index 01516a8aca..667ff77f6c 100644 --- a/lmdeploy/pytorch/backends/cuda/awq_modules.py +++ b/lmdeploy/pytorch/backends/cuda/awq_modules.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional import torch @@ -54,9 +53,9 @@ def forward(self, qweight: torch.Tensor, scales: torch.Tensor, qzeros: torch.Tensor, - bias: Optional[torch.Tensor] = None, + bias: torch.Tensor | None = None, all_reduce: bool = False, - group: Optional[torch.distributed.ProcessGroup] = None): + group: torch.distributed.ProcessGroup | None = None): """forward.""" out_features = scales.size(1) out = wq_gemm_forward(x, qweight, qzeros, scales, self.w_bit, self.group_size, bias, out_features) diff --git a/lmdeploy/pytorch/backends/cuda/blockedf8_modules.py b/lmdeploy/pytorch/backends/cuda/blockedf8_modules.py index ed8715edca..463d26bb69 100644 --- a/lmdeploy/pytorch/backends/cuda/blockedf8_modules.py +++ b/lmdeploy/pytorch/backends/cuda/blockedf8_modules.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import List, Optional import torch @@ -27,11 +26,11 @@ def forward(self, x, weight: torch.Tensor, scale: torch.Tensor, - bias: Optional[torch.Tensor] = None, + bias: torch.Tensor | None = None, all_reduce: bool = False, - group: Optional[dist.ProcessGroup] = None, + group: dist.ProcessGroup | None = None, rank: int = 0, - scatter_size: List[int] = None): + scatter_size: list[int] = None): """forward.""" x_shape = x.shape x = x.flatten(0, -2) @@ -113,11 +112,11 @@ def forward(self, x, weight: torch.Tensor, scale: torch.Tensor, - bias: Optional[torch.Tensor] = None, + bias: torch.Tensor | None = None, all_reduce: bool = False, - group: Optional[dist.ProcessGroup] = None, + group: dist.ProcessGroup | None = None, rank: int = 0, - scatter_size: List[int] = None): + scatter_size: list[int] = None): """forward.""" x_shape = x.shape x = x.flatten(0, -2) diff --git a/lmdeploy/pytorch/backends/cuda/graph_runner.py b/lmdeploy/pytorch/backends/cuda/graph_runner.py index 58d093cf9b..9e11444887 100644 --- a/lmdeploy/pytorch/backends/cuda/graph_runner.py +++ b/lmdeploy/pytorch/backends/cuda/graph_runner.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import functools -from typing import Any, Dict, List, Tuple +from typing import Any import torch from torch.profiler import record_function @@ -66,7 +66,7 @@ def __init__( max_tokens: int, num_blocks: int, is_decoding: bool, - pool: Tuple[int, int], + pool: tuple[int, int], model_config: ModelConfig, device: torch.device, decode_query_len: int = 1, @@ -153,7 +153,7 @@ def __init__(self, model: torch.nn.Module, model_config: ModelConfig, cache_conf self.enable_graph = self.check_enable_graph() self.graph_pool_handle = torch.cuda.graph_pool_handle() - self._runner_map: Dict[Any, CUDASingleGraphRunner] = dict() + self._runner_map: dict[Any, CUDASingleGraphRunner] = dict() self.has_try_compile_model: bool = False # strategy factory @@ -187,7 +187,7 @@ def _get_capture_tokens(self, batch_size: int): return size assert False, f'Unsupported batch_size={batch_size}' - def get_graph_key(self, input_ids: torch.Tensor, position_ids: torch.Tensor, past_key_values: List, + def get_graph_key(self, input_ids: torch.Tensor, position_ids: torch.Tensor, past_key_values: list, attn_metadata: TritonAttentionMetadata, inputs_embeds: torch.Tensor, **kwargs): """Get graph key.""" context = self.ctx_mgr.current_context() @@ -261,7 +261,7 @@ def __call__(self, **kwargs): @record_function('prepare_inputs_for_generation') def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], inputs_embeds: torch.Tensor = None, context: StepContext = None, ): @@ -303,6 +303,6 @@ def update_inputs(self, inputs): dp_meta.sync_tp_size(tp_size) return inputs - def get_capture_batch_sizes(self) -> List[int]: + def get_capture_batch_sizes(self) -> list[int]: """Capture batch sizes.""" return _get_capture_batch_size_impl(self.cache_config.max_batches) diff --git a/lmdeploy/pytorch/backends/cuda/moe/blocked_fp8.py b/lmdeploy/pytorch/backends/cuda/moe/blocked_fp8.py index 8810f57a7e..997cb286d1 100644 --- a/lmdeploy/pytorch/backends/cuda/moe/blocked_fp8.py +++ b/lmdeploy/pytorch/backends/cuda/moe/blocked_fp8.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Callable, List +from collections.abc import Callable import torch import torch.distributed as dist @@ -53,7 +53,7 @@ def forward(self, down_scale: torch.Tensor, gate_up_bias: torch.Tensor = None, down_bias: torch.Tensor = None, - expert_list: List[int] = None, + expert_list: list[int] = None, act_func: Callable = None): """forward.""" input_size = hidden_states.shape @@ -148,7 +148,7 @@ def forward(self, down_scale: torch.Tensor, gate_up_bias: torch.Tensor = None, down_bias: torch.Tensor = None, - expert_list: List[int] = None, + expert_list: list[int] = None, act_func: Callable = None, **kwargs): """forward.""" diff --git a/lmdeploy/pytorch/backends/cuda/moe/default.py b/lmdeploy/pytorch/backends/cuda/moe/default.py index ceef74a2b1..c47ba6edf8 100644 --- a/lmdeploy/pytorch/backends/cuda/moe/default.py +++ b/lmdeploy/pytorch/backends/cuda/moe/default.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Callable, List, Optional +from collections.abc import Callable import torch @@ -47,7 +47,7 @@ def forward(self, down_weights: torch.Tensor, gate_up_bias: torch.Tensor = None, down_bias: torch.Tensor = None, - expert_list: List[int] = None, + expert_list: list[int] = None, act_func: Callable = None): """forward.""" expert_offset = 0 @@ -103,7 +103,7 @@ def forward( topk_ids: torch.LongTensor, up_weights: torch.Tensor, down_weights: torch.Tensor, - expert_list: List[int] = None, + expert_list: list[int] = None, ): """forward.""" from lmdeploy.pytorch.kernels.cuda.fused_moe_ep import fused_moe_v3 @@ -129,7 +129,7 @@ def dispatch_async(self, x: torch.Tensor, topk_idx: torch.Tensor, topk_weights: torch.Tensor, - num_experts: Optional[int] = None, + num_experts: int | None = None, previous_event=None, async_finish=True): return self.token_dispatcher.dispatch_normal_async(x, topk_idx, topk_weights, num_experts, previous_event, @@ -201,7 +201,7 @@ def dispatch_async_ll( self, hidden_states: torch.Tensor, topk_idx: torch.Tensor, - num_experts: Optional[int] = None, + num_experts: int | None = None, use_fp8: bool = True, async_finish: bool = True, ): @@ -282,7 +282,7 @@ def forward(self, topk_ids: torch.LongTensor, up_weights: torch.Tensor, down_weights: torch.Tensor, - expert_list: List[int] = None): + expert_list: list[int] = None): """forward.""" recv_hidden_states, topk_idx, topk_weights, masked_m, expected_m = dispatch_ll( self.token_dispatcher, @@ -304,7 +304,7 @@ def dispatch_async( self, hidden_states: torch.Tensor, topk_idx: torch.Tensor, - num_experts: Optional[int] = None, + num_experts: int | None = None, use_fp8: bool = False, async_finish: bool = True, ): @@ -406,7 +406,7 @@ def forward(self, down_weights: torch.Tensor, gate_up_bias: torch.Tensor = None, down_bias: torch.Tensor = None, - expert_list: List[int] = None, + expert_list: list[int] = None, act_func: Callable = None): """forward.""" assert act_func is None, 'Activation function is not supported in DeepEP MoE.' diff --git a/lmdeploy/pytorch/backends/cuda/moe/ep_utils.py b/lmdeploy/pytorch/backends/cuda/moe/ep_utils.py index f4c596a99c..68a2889b09 100644 --- a/lmdeploy/pytorch/backends/cuda/moe/ep_utils.py +++ b/lmdeploy/pytorch/backends/cuda/moe/ep_utils.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import List import torch from torch import distributed as dist @@ -34,7 +33,7 @@ def split_inputs_by_attn_tp( return hidden_states, topk_weights, topk_ids, split_size -def gather_outputs_by_attn_tp(out_states: torch.Tensor, split_size: List[int]): +def gather_outputs_by_attn_tp(out_states: torch.Tensor, split_size: list[int]): """Gather output by attn tp.""" if split_size is None: return out_states diff --git a/lmdeploy/pytorch/backends/cuda/moe/w8a8.py b/lmdeploy/pytorch/backends/cuda/moe/w8a8.py index 19358f9751..d103d5f270 100644 --- a/lmdeploy/pytorch/backends/cuda/moe/w8a8.py +++ b/lmdeploy/pytorch/backends/cuda/moe/w8a8.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import List import torch @@ -43,7 +42,7 @@ def forward(self, gate_up_scale: torch.Tensor, down_weights: torch.Tensor, down_scale: torch.Tensor, - expert_list: List[int] = None): + expert_list: list[int] = None): """forward.""" if isinstance(hidden_states, torch.Tensor): diff --git a/lmdeploy/pytorch/backends/cuda/moe_router.py b/lmdeploy/pytorch/backends/cuda/moe_router.py index c0fbcd2a1a..a56f7d9df2 100644 --- a/lmdeploy/pytorch/backends/cuda/moe_router.py +++ b/lmdeploy/pytorch/backends/cuda/moe_router.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Tuple import torch @@ -57,7 +56,7 @@ def should_enable_custom_kernel(self) -> bool: return True - def forward(self, logits: torch.Tensor, bias: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + def forward(self, logits: torch.Tensor, bias: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: """Router forward.""" if self.enable_custom_kernel: return fused_noaux_tc_routing( diff --git a/lmdeploy/pytorch/backends/cuda/op_backend.py b/lmdeploy/pytorch/backends/cuda/op_backend.py index ff3075f255..853db099d9 100644 --- a/lmdeploy/pytorch/backends/cuda/op_backend.py +++ b/lmdeploy/pytorch/backends/cuda/op_backend.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Tuple import torch @@ -93,7 +92,7 @@ def get_k_block_shape( num_heads: int, head_size: int, dtype: torch.dtype, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: """Get k block shape.""" return ( block_size, @@ -107,7 +106,7 @@ def get_v_block_shape( num_heads: int, head_size: int, dtype: torch.dtype, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: """Get v block shape.""" return ( block_size, diff --git a/lmdeploy/pytorch/backends/cuda/qmodules.py b/lmdeploy/pytorch/backends/cuda/qmodules.py index dc61787731..96f73eb28c 100644 --- a/lmdeploy/pytorch/backends/cuda/qmodules.py +++ b/lmdeploy/pytorch/backends/cuda/qmodules.py @@ -1,11 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional import torch import lmdeploy.pytorch.distributed as dist -from lmdeploy.pytorch.kernels.cuda.w8a8_triton_kernels import (matmul_kernel_dynamic_quant, per_token_quant_int8, - rms_norm_dynamic_quant) +from lmdeploy.pytorch.kernels.cuda.w8a8_triton_kernels import ( + matmul_kernel_dynamic_quant, + per_token_quant_int8, + rms_norm_dynamic_quant, +) from lmdeploy.pytorch.models.q_modules import QTensor from ..qmodules import LinearW8A8Builder, LinearW8A8Impl, RMSNormW8A8Builder, RMSNormW8A8Impl @@ -62,9 +64,9 @@ def forward(self, x, weight: torch.Tensor, scale: torch.Tensor, - bias: Optional[torch.Tensor] = None, + bias: torch.Tensor | None = None, all_reduce: bool = False, - group: Optional[torch.distributed.ProcessGroup] = None): + group: torch.distributed.ProcessGroup | None = None): """forward.""" if isinstance(x, torch.Tensor): input_quant, input_scale = per_token_quant_int8(x, 1e-7, quant_dtype=self.quant_dtype) diff --git a/lmdeploy/pytorch/backends/cuda/token_dispatcher.py b/lmdeploy/pytorch/backends/cuda/token_dispatcher.py index ce4c9307d6..15176d14e2 100644 --- a/lmdeploy/pytorch/backends/cuda/token_dispatcher.py +++ b/lmdeploy/pytorch/backends/cuda/token_dispatcher.py @@ -9,7 +9,6 @@ except ImportError: use_deepep = False -from typing import List, Optional, Tuple import torch import torch.distributed as dist @@ -137,9 +136,9 @@ def dispatch( hidden_states: torch.Tensor, topk_idx: torch.Tensor, topk_weights: torch.Tensor, - expert_list: List[int] = None, + expert_list: list[int] = None, previous_event=None, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: self.hidden_shape = hidden_states.shape topk_idx = topk_idx.to(torch.int64) ( @@ -218,7 +217,7 @@ def dispatch_normal_async(self, x: torch.Tensor, topk_idx: torch.Tensor, topk_weights: torch.Tensor, - num_experts: Optional[int] = None, + num_experts: int | None = None, previous_event=None, async_finish=True): ( @@ -271,7 +270,7 @@ def combine(self, hidden_states: torch.Tensor) -> torch.Tensor: self.handle = None return hidden_states.view(self.hidden_shape) - def combine_normal(self, x: torch.Tensor, handle: Tuple, previous_event=None): + def combine_normal(self, x: torch.Tensor, handle: tuple, previous_event=None): combined_x, _, event = self.buffer_normal.combine( x, handle, @@ -281,7 +280,7 @@ def combine_normal(self, x: torch.Tensor, handle: Tuple, previous_event=None): ) return combined_x, event - def combine_normal_async(self, x: torch.Tensor, handle: Tuple, previous_event=None, async_finish=True): + def combine_normal_async(self, x: torch.Tensor, handle: tuple, previous_event=None, async_finish=True): combined_x, _, event = self.buffer_normal.combine( x, handle, @@ -307,9 +306,9 @@ def get_number_of_tokens_per_expert(self) -> torch.Tensor: def get_permuted_hidden_states_by_experts(self, hidden_states: torch.Tensor, - topk_idx: Optional[torch.Tensor] = None, - topk_weights: Optional[torch.Tensor] = None, - num_experts: Optional[int] = None) -> torch.Tensor: + topk_idx: torch.Tensor | None = None, + topk_weights: torch.Tensor | None = None, + num_experts: int | None = None) -> torch.Tensor: (dispatched_routing_map, topk_weights) = super().indices_to_multihot(self.topk_idx if topk_idx is None else topk_idx, self.topk_weights if topk_weights is None else topk_weights, @@ -328,10 +327,10 @@ def get_permuted_hidden_states_by_experts(self, def get_restored_hidden_states_by_experts( self, hidden_states: torch.Tensor, - reversed_mapping_for_combine: Optional[torch.Tensor] = None, - hidden_shape_before_permute: Optional[torch.Size] = None, - dispatched_routing_map: Optional[torch.Tensor] = None, - topk_weights: Optional[torch.Tensor] = None, + reversed_mapping_for_combine: torch.Tensor | None = None, + hidden_shape_before_permute: torch.Size | None = None, + dispatched_routing_map: torch.Tensor | None = None, + topk_weights: torch.Tensor | None = None, ) -> torch.Tensor: input_dtype = hidden_states.dtype assert (self.topk_weights.dtype == torch.float32), 'DeepEP only supports float32 probs' @@ -381,7 +380,7 @@ def dispatch( topk_idx: torch.Tensor, topk_weights: torch.Tensor, num_experts: int, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: topk_idx = topk_idx.to(torch.int64) expected_m = (hidden_states.shape[0] * self.buffer_low_latency.group_size * topk_idx.shape[1] + num_experts) // num_experts @@ -408,7 +407,7 @@ def dispatch_async( self, hidden_states: torch.Tensor, topk_idx: torch.Tensor, - num_experts: Optional[int] = None, + num_experts: int | None = None, use_fp8: bool = True, async_finish: bool = True, ): @@ -429,7 +428,7 @@ def combine( hidden_states: torch.Tensor, topk_idx: torch.Tensor, topk_weights: torch.Tensor, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, torch.Tensor | None]: combined_hidden_states, event, hook = (self.buffer_low_latency.low_latency_combine( hidden_states, topk_idx, @@ -446,9 +445,9 @@ def combine_async( hidden_states: torch.Tensor, topk_idx: torch.Tensor, topk_weights: torch.Tensor, - handle: Tuple, + handle: tuple, async_finish: bool, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, torch.Tensor | None]: assert topk_idx.dtype == torch.int64 assert topk_weights.dtype == torch.float32 combined_hidden_states, event, hook = self.buffer_low_latency.low_latency_combine( diff --git a/lmdeploy/pytorch/backends/default/awq_modules.py b/lmdeploy/pytorch/backends/default/awq_modules.py index d2253920fa..1837f6d65e 100644 --- a/lmdeploy/pytorch/backends/default/awq_modules.py +++ b/lmdeploy/pytorch/backends/default/awq_modules.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from functools import lru_cache -from typing import Optional import torch @@ -61,9 +60,9 @@ def forward(self, qweight: torch.Tensor, scales: torch.Tensor, qzeros: torch.Tensor, - bias: Optional[torch.Tensor] = None, + bias: torch.Tensor | None = None, all_reduce: bool = False, - group: Optional[torch.distributed.ProcessGroup] = None): + group: torch.distributed.ProcessGroup | None = None): """forward.""" out_shape = x.shape[:-1] + (self.out_features, ) input_dtype = x.dtype diff --git a/lmdeploy/pytorch/backends/default/linear.py b/lmdeploy/pytorch/backends/default/linear.py index f766123fff..7823d26566 100644 --- a/lmdeploy/pytorch/backends/default/linear.py +++ b/lmdeploy/pytorch/backends/default/linear.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import List, Optional import torch import torch.distributed as dist @@ -14,11 +13,11 @@ class DefaultLinearImpl(LinearImpl): def forward(self, x, weight: torch.Tensor, - bias: Optional[torch.Tensor] = None, + bias: torch.Tensor | None = None, all_reduce: bool = False, group: dist.ProcessGroup = None, rank: int = 0, - scatter_size: List[int] = None): + scatter_size: list[int] = None): """forward.""" out = F.linear(x, weight, bias) if all_reduce: diff --git a/lmdeploy/pytorch/backends/default/moe_router.py b/lmdeploy/pytorch/backends/default/moe_router.py index 34d982a7b0..7ff818965a 100644 --- a/lmdeploy/pytorch/backends/default/moe_router.py +++ b/lmdeploy/pytorch/backends/default/moe_router.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. import functools -from typing import Tuple import torch @@ -52,7 +51,7 @@ def __init__( # n_group self.router_n_groups = router_n_groups - def _forward_router_n_groups(self, scores_for_choice: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + def _forward_router_n_groups(self, scores_for_choice: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: assert scores_for_choice.shape[-1] % self.router_n_groups == 0, \ f'{scores_for_choice.shape[-1]} cannot be divided by {self.router_n_groups}' per_group_top_k = self.top_k // self.router_n_groups @@ -65,7 +64,7 @@ def _forward_router_n_groups(self, scores_for_choice: torch.Tensor) -> Tuple[tor return topk_weight, topk_idx def _forward_default(self, scores: torch.Tensor, scores_for_choice: torch.Tensor, - sequence_length: int) -> Tuple[torch.Tensor, torch.Tensor]: + sequence_length: int) -> tuple[torch.Tensor, torch.Tensor]: group_scores = (scores_for_choice.view(sequence_length, self.n_group, -1).topk(2, dim=-1)[0].sum(dim=-1)) # [n, n_group] group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1] # [n, top_k_group] @@ -90,7 +89,7 @@ def renorm(self, topk_weight: torch.Tensor) -> torch.Tensor: topk_weight = topk_weight * self.routed_scaling_factor return topk_weight - def forward(self, logits: torch.Tensor, bias: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + def forward(self, logits: torch.Tensor, bias: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: """Router forward.""" sequence_length = logits.shape[0] diff --git a/lmdeploy/pytorch/backends/default/op_backend.py b/lmdeploy/pytorch/backends/default/op_backend.py index 84badaa9e3..6bfb9e5934 100644 --- a/lmdeploy/pytorch/backends/default/op_backend.py +++ b/lmdeploy/pytorch/backends/default/op_backend.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Tuple import torch @@ -60,7 +59,7 @@ def get_k_block_shape( num_heads: int, head_size: int, dtype: torch.dtype, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: """Get block shape of k.""" return ( block_size, @@ -74,7 +73,7 @@ def get_v_block_shape( num_heads: int, head_size: int, dtype: torch.dtype, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: """Get block shape of v.""" return ( block_size, diff --git a/lmdeploy/pytorch/backends/default/rotary_embedding.py b/lmdeploy/pytorch/backends/default/rotary_embedding.py index e37caa52a7..a20d9ea7dc 100644 --- a/lmdeploy/pytorch/backends/default/rotary_embedding.py +++ b/lmdeploy/pytorch/backends/default/rotary_embedding.py @@ -7,8 +7,15 @@ import torch.nn.functional as F from torch import nn -from ..rotary_embedding import (FopeParameters, Llama3Parameters, LongRoPEScalingParameters, RopeType, - RotaryEmbeddingBuilder, RotaryEmbeddingImpl, YarnParameters) +from ..rotary_embedding import ( + FopeParameters, + Llama3Parameters, + LongRoPEScalingParameters, + RopeType, + RotaryEmbeddingBuilder, + RotaryEmbeddingImpl, + YarnParameters, +) def safe_torch_compile(**compile_kwargs): diff --git a/lmdeploy/pytorch/backends/default/token_dispatcher.py b/lmdeploy/pytorch/backends/default/token_dispatcher.py index f8436f3838..256fe5707c 100644 --- a/lmdeploy/pytorch/backends/default/token_dispatcher.py +++ b/lmdeploy/pytorch/backends/default/token_dispatcher.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Tuple import torch @@ -80,7 +79,7 @@ def preprocess(self, routing_map: torch.Tensor, local_expert_indices) -> torch.T return num_tokens_per_local_expert def dispatch(self, hidden_states: torch.Tensor, topk_ids: torch.Tensor, probs: torch.Tensor, - local_expert_indices) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + local_expert_indices) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: self.hidden_shape = hidden_states.shape self.topk_ids = topk_ids self.routing_map, self.topk_weights = super().indices_to_multihot(topk_ids, probs, self.num_experts) diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py index 484cbd1b72..169da9c150 100644 --- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py @@ -6,7 +6,6 @@ from dataclasses import dataclass from functools import lru_cache from pathlib import Path -from typing import Dict, Tuple import torch import torch.distributed as dist @@ -72,11 +71,11 @@ class DistMeta: class AscendKVQuantMeta: has_set_value: bool = False - quant_meta: Dict = {} + quant_meta: dict = {} @classmethod def set_value(cls, device: str, dtype: torch.dtype, record_file: str, total_layers: int): - with open(record_file, 'r') as file: + with open(record_file) as file: data = file.read() scale_offset_pairs = re.findall(r'scale:\s*([\d\.\-]+)\s*offset:\s*(-?\d+)', data) scale_offset_pairs = [(float(scale), float(offset)) for scale, offset in scale_offset_pairs] @@ -133,7 +132,7 @@ def get_k_block_shape( num_heads: int, head_size: int, dtype: torch.dtype, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: if SocVersion.is_Ascend910(): return (block_size, num_heads, head_size) else: @@ -145,7 +144,7 @@ def get_v_block_shape( num_heads: int, head_size: int, dtype: torch.dtype, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: if SocVersion.is_Ascend910(): return (block_size, num_heads, head_size) else: diff --git a/lmdeploy/pytorch/backends/dlinfer/attention.py b/lmdeploy/pytorch/backends/dlinfer/attention.py index 8566187021..78afe49040 100644 --- a/lmdeploy/pytorch/backends/dlinfer/attention.py +++ b/lmdeploy/pytorch/backends/dlinfer/attention.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. +from collections.abc import Sequence from dataclasses import dataclass -from typing import Dict, Optional, Sequence from torch import Tensor @@ -10,14 +10,14 @@ @dataclass class DlinferAttentionMetadata(AttentionMetadata): - kv_start_indices: Optional[Tensor] = None + kv_start_indices: Tensor | None = None block_size: int = 64 attention_mask: Sequence[Tensor] = tuple() - is_unpaged_prefill: Optional[bool] = None + is_unpaged_prefill: bool | None = None max_q_seq_len: int = 1 max_kv_seq_len: int = 1 - quant_meta: Dict = None - cu_seq_lens_kv: Optional[Tensor] = None + quant_meta: dict = None + cu_seq_lens_kv: Tensor | None = None class DlinferAttentionImpl(AttentionImpl[DlinferAttentionMetadata]): diff --git a/lmdeploy/pytorch/backends/dlinfer/awq_modules.py b/lmdeploy/pytorch/backends/dlinfer/awq_modules.py index 1ec8bf0072..c9dcc381ec 100644 --- a/lmdeploy/pytorch/backends/dlinfer/awq_modules.py +++ b/lmdeploy/pytorch/backends/dlinfer/awq_modules.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional import torch @@ -22,9 +21,9 @@ def forward(self, qweight: torch.Tensor, scales: torch.Tensor, qzeros: torch.Tensor, - bias: Optional[torch.Tensor] = None, + bias: torch.Tensor | None = None, all_reduce: bool = False, - group: Optional[torch.distributed.ProcessGroup] = None): + group: torch.distributed.ProcessGroup | None = None): """forward.""" out = awq_linear(x, qweight, scales, qzeros, bias, all_reduce, self.group_size) return out diff --git a/lmdeploy/pytorch/backends/dlinfer/camb/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/camb/op_backend.py index a4ddc6fc93..18f04de73b 100644 --- a/lmdeploy/pytorch/backends/dlinfer/camb/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/camb/op_backend.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Tuple import torch @@ -26,7 +25,7 @@ def get_k_block_shape( num_heads: int, head_size: int, dtype: torch.dtype, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return ( num_heads, block_size, @@ -39,7 +38,7 @@ def get_v_block_shape( num_heads: int, head_size: int, dtype: torch.dtype, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return ( num_heads, block_size, diff --git a/lmdeploy/pytorch/backends/dlinfer/linear.py b/lmdeploy/pytorch/backends/dlinfer/linear.py index fbe717f5c2..e4e22d7f57 100644 --- a/lmdeploy/pytorch/backends/dlinfer/linear.py +++ b/lmdeploy/pytorch/backends/dlinfer/linear.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. import os -from typing import List, Optional import torch import torch.distributed as dist @@ -13,7 +12,7 @@ class DlinferLinearImpl(LinearImpl): """Dlinfer linear implementation api.""" - def update_weights(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = None): + def update_weights(self, weight: torch.Tensor, bias: torch.Tensor | None = None): """Update weights.""" if os.getenv('DLINFER_LINEAR_USE_NN_LAYOUT', '0') == '1': weight = weight.data.t().contiguous() @@ -22,11 +21,11 @@ def update_weights(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = No def forward(self, x, weight: torch.Tensor, - bias: Optional[torch.Tensor] = None, + bias: torch.Tensor | None = None, all_reduce: bool = False, group: dist.ProcessGroup = None, rank: int = 0, - scatter_size: List[int] = None): + scatter_size: list[int] = None): """forward.""" out = linear(x, weight, bias, False) if all_reduce: diff --git a/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py index 3be4ab6f24..8420b159dc 100644 --- a/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Tuple import torch @@ -26,7 +25,7 @@ def get_k_block_shape( num_heads: int, head_size: int, dtype: torch.dtype, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return (block_size, num_heads, head_size) @staticmethod @@ -35,7 +34,7 @@ def get_v_block_shape( num_heads: int, head_size: int, dtype: torch.dtype, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return (block_size, num_heads, head_size) @classmethod diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py index d9c83031e8..e0e5ccb04c 100644 --- a/lmdeploy/pytorch/backends/dlinfer/moe.py +++ b/lmdeploy/pytorch/backends/dlinfer/moe.py @@ -1,12 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. import os -from typing import Callable, List +from collections.abc import Callable import torch -from lmdeploy.pytorch.kernels.dlinfer import DlinferMoECommType # noqa: F401 -from lmdeploy.pytorch.kernels.dlinfer import DlinferMoeMetadata # noqa: F401 -from lmdeploy.pytorch.kernels.dlinfer import fused_moe, moe_gating_topk_softmax +from lmdeploy.pytorch.kernels.dlinfer import ( + DlinferMoECommType, # noqa: F401 + DlinferMoeMetadata, # noqa: F401 + fused_moe, + moe_gating_topk_softmax, +) from lmdeploy.pytorch.model_inputs import get_step_ctx_manager from ..moe import FusedMoEBuilder, FusedMoEImpl, SoftmaxTopKBuilder, SoftmaxTopKImpl @@ -85,7 +88,7 @@ def forward(self, down_weights: torch.Tensor, gate_up_bias: torch.Tensor = None, down_bias: torch.Tensor = None, - expert_list: List[int] = None, + expert_list: list[int] = None, act_func: Callable = None): """forward.""" assert gate_up_bias is None diff --git a/lmdeploy/pytorch/backends/dlinfer/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/op_backend.py index 16eb604ccd..b01cc12596 100644 --- a/lmdeploy/pytorch/backends/dlinfer/op_backend.py +++ b/lmdeploy/pytorch/backends/dlinfer/op_backend.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Tuple import torch @@ -73,7 +72,7 @@ def get_k_block_shape( num_heads: int, head_size: int, dtype: torch.dtype, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return ( block_size, num_heads, @@ -86,7 +85,7 @@ def get_v_block_shape( num_heads: int, head_size: int, dtype: torch.dtype, - ) -> Tuple[int, ...]: + ) -> tuple[int, ...]: return ( block_size, num_heads, diff --git a/lmdeploy/pytorch/backends/dlinfer/qmodules.py b/lmdeploy/pytorch/backends/dlinfer/qmodules.py index fe52dd5f35..af5594245d 100644 --- a/lmdeploy/pytorch/backends/dlinfer/qmodules.py +++ b/lmdeploy/pytorch/backends/dlinfer/qmodules.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. import os -from typing import Optional import torch import torch.distributed as dist @@ -24,7 +23,7 @@ def __init__(self, self.out_dtype = out_dtype self.quant_dtype = quant_dtype - def update_weights(self, weight: torch.Tensor, scale: torch.Tensor, bias: Optional[torch.Tensor] = None): + def update_weights(self, weight: torch.Tensor, scale: torch.Tensor, bias: torch.Tensor | None = None): """Update weights.""" if os.getenv('DLINFER_LINEAR_USE_NN_LAYOUT', '0') == '1': weight = weight.data.t().contiguous() @@ -35,9 +34,9 @@ def forward(self, x, weight: torch.Tensor, scale: torch.Tensor, - bias: Optional[torch.Tensor] = None, + bias: torch.Tensor | None = None, all_reduce: bool = False, - group: Optional[torch.distributed.ProcessGroup] = None): + group: torch.distributed.ProcessGroup | None = None): """forward.""" if isinstance(x, torch.Tensor): input_quant, input_scale = dynamic_quant(x, self.quant_dtype) diff --git a/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py b/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py index bfad3a89a7..677bbdd86c 100644 --- a/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py +++ b/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py @@ -5,10 +5,20 @@ import torch from torch import nn -from ..default.rotary_embedding import (FopeRotaryEmbeddingImpl, LlamaDynamicNTKScalingRotaryEmbedding, - YarnRotaryEmbeddingImpl) -from ..rotary_embedding import (FopeParameters, Llama3Parameters, LongRoPEScalingParameters, RopeType, - RotaryEmbeddingBuilder, RotaryEmbeddingImpl, YarnParameters) +from ..default.rotary_embedding import ( + FopeRotaryEmbeddingImpl, + LlamaDynamicNTKScalingRotaryEmbedding, + YarnRotaryEmbeddingImpl, +) +from ..rotary_embedding import ( + FopeParameters, + Llama3Parameters, + LongRoPEScalingParameters, + RopeType, + RotaryEmbeddingBuilder, + RotaryEmbeddingImpl, + YarnParameters, +) def _rotary_embedding_fwd(position_ids: torch.Tensor, diff --git a/lmdeploy/pytorch/backends/graph_runner.py b/lmdeploy/pytorch/backends/graph_runner.py index a88872f2bd..72f460ef5b 100644 --- a/lmdeploy/pytorch/backends/graph_runner.py +++ b/lmdeploy/pytorch/backends/graph_runner.py @@ -1,7 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import functools from dataclasses import dataclass -from typing import List import torch @@ -55,7 +54,7 @@ def get_logits(self, hidden_states: torch.Tensor): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], inputs_embeds: torch.Tensor = None, context: StepContext = None, ): @@ -68,7 +67,7 @@ def prepare_inputs_for_generation( def update_model_metas( self, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], inputs_embeds: torch.Tensor = None, context: StepContext = None, ): @@ -100,6 +99,6 @@ def get_meta(self): def update_inputs(self, inputs): return inputs - def get_capture_batch_sizes(self) -> List[int]: + def get_capture_batch_sizes(self) -> list[int]: """Capture batch sizes.""" return _get_capture_batch_size_impl(self.cache_config.max_batches) diff --git a/lmdeploy/pytorch/backends/linear.py b/lmdeploy/pytorch/backends/linear.py index 740b4b7ecc..88d0b150f0 100644 --- a/lmdeploy/pytorch/backends/linear.py +++ b/lmdeploy/pytorch/backends/linear.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABC, abstractmethod -from typing import List, Optional import torch import torch.distributed as dist @@ -9,7 +8,7 @@ class LinearImpl(ABC): """Linear implementation api.""" - def update_weights(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = None): + def update_weights(self, weight: torch.Tensor, bias: torch.Tensor | None = None): """Update weights.""" return weight, bias @@ -17,11 +16,11 @@ def update_weights(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = No def forward(self, x, weight: torch.Tensor, - bias: Optional[torch.Tensor] = None, + bias: torch.Tensor | None = None, all_reduce: bool = False, group: dist.ProcessGroup = None, rank: int = 0, - scatter_size: List[int] = None): + scatter_size: list[int] = None): """forward.""" raise NotImplementedError diff --git a/lmdeploy/pytorch/backends/moe.py b/lmdeploy/pytorch/backends/moe.py index 5b33b97da7..10a3c5e702 100644 --- a/lmdeploy/pytorch/backends/moe.py +++ b/lmdeploy/pytorch/backends/moe.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import functools from abc import ABC, abstractmethod -from typing import Callable, List, Optional +from collections.abc import Callable import torch import torch.distributed as dist @@ -52,7 +52,7 @@ def forward(self, down_weights: torch.Tensor, gate_up_bias: torch.Tensor = None, down_bias: torch.Tensor = None, - expert_list: List[int] = None, + expert_list: list[int] = None, act_func: Callable = None): """forward.""" raise NotImplementedError @@ -97,7 +97,7 @@ def forward(self, gate_up_scale: torch.Tensor, down_weights: torch.Tensor, down_scale: torch.Tensor, - expert_list: List[int] = None): + expert_list: list[int] = None): """forward.""" raise NotImplementedError @@ -120,7 +120,7 @@ class FusedMoEBlockedF8Impl(ABC): """Fused moe blocked f8 implementation.""" def __init__(self): - self.scale_fmt: Optional[str] = None + self.scale_fmt: str | None = None def update_weights(self, gate_up_weights: torch.Tensor, down_weights: torch.Tensor, gate_up_scale: torch.Tensor, down_scale: torch.Tensor): @@ -131,7 +131,7 @@ def ep_expert_list(self, world_size: int, rank: int): """Experts list of current rank.""" raise NotImplementedError('Not Implemented.') - def set_scale_fmt(self, scale_fmt: Optional[str]): + def set_scale_fmt(self, scale_fmt: str | None): """Set scale fmt.""" self.scale_fmt = scale_fmt @@ -147,7 +147,7 @@ def forward(self, down_scale: torch.Tensor, gate_up_bias: torch.Tensor = None, down_bias: torch.Tensor = None, - expert_list: List[int] = None, + expert_list: list[int] = None, act_func: Callable = None): """forward.""" raise NotImplementedError diff --git a/lmdeploy/pytorch/backends/moe_router.py b/lmdeploy/pytorch/backends/moe_router.py index 87cb90a7cc..e523ee9a56 100644 --- a/lmdeploy/pytorch/backends/moe_router.py +++ b/lmdeploy/pytorch/backends/moe_router.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABC, abstractmethod -from typing import Tuple import torch @@ -9,7 +8,7 @@ class RouterNoauxTCImpl(ABC): """Noaux tc implementation api.""" @abstractmethod - def forward(self, logits: torch.Tensor, bias: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + def forward(self, logits: torch.Tensor, bias: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: """forward.""" raise NotImplementedError diff --git a/lmdeploy/pytorch/backends/qmodules.py b/lmdeploy/pytorch/backends/qmodules.py index 7173fb5f34..4b98da7abb 100644 --- a/lmdeploy/pytorch/backends/qmodules.py +++ b/lmdeploy/pytorch/backends/qmodules.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABC, abstractmethod -from typing import Optional import torch @@ -37,7 +36,7 @@ def build(hidden_size: int, eps: float = 1e-6, quant_dtype: torch.dtype = torch. class LinearW8A8Impl(ABC): """Linear w8a8 implementation api.""" - def update_weights(self, weight: torch.Tensor, scale: torch.Tensor, bias: Optional[torch.Tensor] = None): + def update_weights(self, weight: torch.Tensor, scale: torch.Tensor, bias: torch.Tensor | None = None): """Update weights.""" return weight, scale, bias @@ -46,9 +45,9 @@ def forward(self, x, weight: torch.Tensor, scale: torch.Tensor, - bias: Optional[torch.Tensor] = None, + bias: torch.Tensor | None = None, all_reduce: bool = False, - group: Optional[torch.distributed.ProcessGroup] = None): + group: torch.distributed.ProcessGroup | None = None): """forward.""" raise NotImplementedError diff --git a/lmdeploy/pytorch/backends/rotary_embedding.py b/lmdeploy/pytorch/backends/rotary_embedding.py index 7495e39b75..16b9d7c799 100644 --- a/lmdeploy/pytorch/backends/rotary_embedding.py +++ b/lmdeploy/pytorch/backends/rotary_embedding.py @@ -2,7 +2,6 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from enum import Enum, auto -from typing import List import torch @@ -32,8 +31,8 @@ class YarnParameters: @dataclass class LongRoPEScalingParameters: """Long Ropescaling parameters.""" - short_factor: List[int] - long_factor: List[int] + short_factor: list[int] + long_factor: list[int] original_max_position_embeddings: int long_mscale: float = None short_mscale: float = None diff --git a/lmdeploy/pytorch/backends/token_dispatcher.py b/lmdeploy/pytorch/backends/token_dispatcher.py index 9d831f97ba..34a4136f2c 100644 --- a/lmdeploy/pytorch/backends/token_dispatcher.py +++ b/lmdeploy/pytorch/backends/token_dispatcher.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABC, abstractmethod -from typing import Tuple import torch @@ -63,7 +62,7 @@ def indices_to_multihot(self, topk_ids, topk_weight, num_experts): @abstractmethod def dispatch(self, hidden_states: torch.Tensor, probs: torch.Tensor, topk_ids: torch.Tensor, - local_expert_indices) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + local_expert_indices) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """dispatch.""" raise NotImplementedError diff --git a/lmdeploy/pytorch/check_env/base.py b/lmdeploy/pytorch/check_env/base.py index f40dba96bf..497fd2db54 100644 --- a/lmdeploy/pytorch/check_env/base.py +++ b/lmdeploy/pytorch/check_env/base.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from logging import Logger -from typing import List from lmdeploy.utils import can_colorize, get_logger @@ -23,7 +22,7 @@ def __init__(self, logger: Logger = None): logger = get_logger('lmdeploy') self.logger = logger self._is_passed = False - self._required_checker: List[BaseChecker] = list() + self._required_checker: list[BaseChecker] = list() def get_logger(self): """Get logger.""" diff --git a/lmdeploy/pytorch/config.py b/lmdeploy/pytorch/config.py index d80045ab92..93f744fc4b 100644 --- a/lmdeploy/pytorch/config.py +++ b/lmdeploy/pytorch/config.py @@ -1,7 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. import enum +from collections.abc import Callable from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Literal, Optional, Tuple +from typing import Any, Literal import torch @@ -91,7 +92,7 @@ class CacheConfig: quant_policy: Literal[0, 4, 8] = 0 device_type: str = 'cuda' num_state_caches: int = None - states_shapes: List[Tuple] = field(default_factory=list) + states_shapes: list[tuple] = field(default_factory=list) # reserved blocks for dummy inputs, init to 0 for unit test. num_reserved_gpu_blocks: int = 0 @@ -254,7 +255,7 @@ def _override_hf_config(hf_config: Any, key: str, hf_overrides): _overide_hf_config_cfg(hf_config, key, hf_overrides) -def override_hf_config(hf_config: Any, hf_overrides: Dict[str, Any]): +def override_hf_config(hf_config: Any, hf_overrides: dict[str, Any]): """Override HF config.""" for k, v in hf_overrides.items(): _override_hf_config(hf_config, k, v) @@ -302,7 +303,7 @@ class ModelConfig: num_attention_heads: int num_key_value_heads: int bos_token_id: int - eos_token_id: List[int] + eos_token_id: list[int] head_dim: int k_head_dim: int = None v_head_dim: int = None @@ -312,12 +313,12 @@ class ModelConfig: hf_config: Any = None llm_config: Any = None cogvlm_style: bool = False - custom_module_map: Dict[str, setattr] = None + custom_module_map: dict[str, setattr] = None # flash mla use_flash_mla: bool = False use_mla_fp8_cache: bool = False - mla_index_topk: Optional[int] = None + mla_index_topk: int | None = None # dllm model_paradigm: str = 'ar' @@ -326,10 +327,10 @@ class ModelConfig: # Added for deepseekv3.2 nsa index # caches would be added after kv cache - cache_shapes: List[Tuple[List[int], torch.dtype]] = field(default_factory=list) + cache_shapes: list[tuple[list[int], torch.dtype]] = field(default_factory=list) # added for qwen3_next # could used for any SSM model. - states_shapes: List[Tuple[Tuple[int], torch.dtype]] = field(default_factory=list) + states_shapes: list[tuple[tuple[int], torch.dtype]] = field(default_factory=list) # check env for model-device combination check_env_func: Callable = _default_check_env @@ -352,7 +353,7 @@ def from_pretrained( trust_remote_code: bool = True, dtype: str = 'auto', dist_config: DistConfig = None, - hf_overrides: Dict[str, Any] = None, + hf_overrides: dict[str, Any] = None, is_draft_model: bool = False, spec_method: str = None, model_format: str = None, @@ -366,7 +367,7 @@ def from_pretrained( models defined on the Hub in their own modeling files. dtype (str): user specified data type for model weights and activations. Refer to `PyTorchEngineConfig` for details - hf_overrides (Dict[str, Any]): overrides for the HF config. + hf_overrides (dict[str, Any]): overrides for the HF config. """ from transformers import AutoConfig @@ -488,7 +489,7 @@ class MiscConfig: custom_module_map: str = None empty_init: bool = False model_format: str = None - hf_overrides: Dict[str, Any] = None + hf_overrides: dict[str, Any] = None disable_vision_encoder: bool = False logprobs_mode: str = None dllm_config: DLLMConfig = None @@ -571,10 +572,10 @@ class QuantizationConfig: scale_fmt: str = None bits: int = None group_size: int = None - weight_block_size: Tuple[int] = None + weight_block_size: tuple[int] = None activation_scheme: str = None - ignored_layers: List[str] = field(default_factory=list) - hf_quant_config: Dict[str, Any] = field(default_factory=dict) + ignored_layers: list[str] = field(default_factory=list) + hf_quant_config: dict[str, Any] = field(default_factory=dict) @classmethod def from_config(cls, hf_config: Any): diff --git a/lmdeploy/pytorch/configurations/__init__.py b/lmdeploy/pytorch/configurations/__init__.py index 697df755d6..703fe98b3d 100644 --- a/lmdeploy/pytorch/configurations/__init__.py +++ b/lmdeploy/pytorch/configurations/__init__.py @@ -9,7 +9,7 @@ # load all submodule for loader, module_name, is_pkg in pkgutil.walk_packages(__path__): __all__.append(module_name) - _module = importlib.import_module('{}.{}'.format(__name__, module_name)) + _module = importlib.import_module(f'{__name__}.{module_name}') globals()[module_name] = _module __all__ += ['AutoModelConfigBuilder'] diff --git a/lmdeploy/pytorch/devices/device_manager.py b/lmdeploy/pytorch/devices/device_manager.py index 91fd50fbe2..2a0584c3da 100644 --- a/lmdeploy/pytorch/devices/device_manager.py +++ b/lmdeploy/pytorch/devices/device_manager.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from collections.abc import Callable from dataclasses import dataclass -from typing import Callable from lmdeploy.pytorch.utils import CtxMgrBase, singleton diff --git a/lmdeploy/pytorch/disagg/backend/base.py b/lmdeploy/pytorch/disagg/backend/base.py index 8df1d118c0..c10e712f40 100644 --- a/lmdeploy/pytorch/disagg/backend/base.py +++ b/lmdeploy/pytorch/disagg/backend/base.py @@ -1,8 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import abstractmethod -from lmdeploy.pytorch.disagg.conn.protocol import (DistServeInitRequest, DistServeKVTransferEndpointInfo, - MigrationProtocol) +from lmdeploy.pytorch.disagg.conn.protocol import ( + DistServeInitRequest, + DistServeKVTransferEndpointInfo, + MigrationProtocol, +) from lmdeploy.pytorch.disagg.messages import DistServeRegisterMRMessage, MigrationAssignment diff --git a/lmdeploy/pytorch/disagg/backend/dlslime.py b/lmdeploy/pytorch/disagg/backend/dlslime.py index c8a8454d35..34be90c687 100644 --- a/lmdeploy/pytorch/disagg/backend/dlslime.py +++ b/lmdeploy/pytorch/disagg/backend/dlslime.py @@ -2,7 +2,6 @@ import asyncio import json import os -from typing import Dict from dlslime import RDMAEndpoint, available_nic @@ -10,8 +9,11 @@ from lmdeploy.pytorch.disagg.backend.backend import MIGRATION_BACKENDS from lmdeploy.pytorch.disagg.backend.base import MigrationBackendImpl from lmdeploy.pytorch.disagg.config import DistServeEngineConfig, MigrationBackend -from lmdeploy.pytorch.disagg.conn.protocol import (DistServeInitRequest, DistServeKVTransferEndpointInfo, - MigrationProtocol) +from lmdeploy.pytorch.disagg.conn.protocol import ( + DistServeInitRequest, + DistServeKVTransferEndpointInfo, + MigrationProtocol, +) from lmdeploy.pytorch.disagg.messages import DistServeRegisterMRMessage, MigrationAssignment logger = get_logger('lmdeploy') @@ -25,7 +27,7 @@ def __init__(self, init_request: DistServeInitRequest): self.rank = init_request.rank self.local_engine_config: DistServeEngineConfig = (init_request.local_engine_config) self.remote_engine_config: DistServeEngineConfig = (init_request.remote_engine_config) - self.endpoint: Dict[MigrationProtocol, RDMAEndpoint] = {} + self.endpoint: dict[MigrationProtocol, RDMAEndpoint] = {} if init_request.protocol == MigrationProtocol.RDMA: nics = available_nic() device_name = nics[self.rank % len(nics)] @@ -76,7 +78,7 @@ class DLSlimeBackend(MigrationBackendImpl): """DLSlime Transfer Engine.""" def __init__(self): - self.links: Dict[str, DLSlimeMigrationManagement] = {} + self.links: dict[str, DLSlimeMigrationManagement] = {} def p2p_initialize(self, init_request: DistServeInitRequest): self.links[init_request.remote_engine_id] = DLSlimeMigrationManagement(init_request) diff --git a/lmdeploy/pytorch/disagg/backend/mooncake.py b/lmdeploy/pytorch/disagg/backend/mooncake.py index e4ba7fbd5f..e33056b717 100644 --- a/lmdeploy/pytorch/disagg/backend/mooncake.py +++ b/lmdeploy/pytorch/disagg/backend/mooncake.py @@ -4,13 +4,15 @@ import os import socket import subprocess -from typing import Dict from lmdeploy.pytorch.disagg.backend.backend import MIGRATION_BACKENDS from lmdeploy.pytorch.disagg.backend.base import MigrationBackendImpl from lmdeploy.pytorch.disagg.config import MigrationBackend, MooncakeEngineConfig -from lmdeploy.pytorch.disagg.conn.protocol import (DistServeInitRequest, DistServeKVTransferEndpointInfo, - MigrationProtocol) +from lmdeploy.pytorch.disagg.conn.protocol import ( + DistServeInitRequest, + DistServeKVTransferEndpointInfo, + MigrationProtocol, +) from lmdeploy.pytorch.disagg.messages import DistServeRegisterMRMessage, MigrationAssignment from lmdeploy.utils import get_logger @@ -88,8 +90,8 @@ def __init__(self, init_request: DistServeInitRequest): # Get all RDMA information once during initialization self.ibv_devices = get_rdma_nics() - self.local_kv_table: Dict[str, Dict] = {} - self.remote_kv_table: Dict[str, Dict] = {} + self.local_kv_table: dict[str, dict] = {} + self.remote_kv_table: dict[str, dict] = {} self.remote_url: str = '' # Store remote URL for this connection # Initialize the p2p connection @@ -142,7 +144,7 @@ def register_memory_region(self, register_mr_request: DistServeRegisterMRMessage f'addr: {buffer_addr}, length: {buffer_length} for remote_engine_id {self.remote_engine_id}') @property - def endpoint_info(self) -> Dict: + def endpoint_info(self) -> dict: """Get endpoint information for this connection.""" mr_info = {} @@ -237,7 +239,7 @@ class MooncakeBackend(MigrationBackendImpl): """Mooncake backend that manages multiple migration connections.""" def __init__(self): - self.links: Dict[int, MooncakeMigrationManagement] = {} + self.links: dict[int, MooncakeMigrationManagement] = {} def p2p_initialize(self, init_request: DistServeInitRequest): self.links[init_request.remote_engine_id] = MooncakeMigrationManagement(init_request) diff --git a/lmdeploy/pytorch/disagg/config.py b/lmdeploy/pytorch/disagg/config.py index f4dd002231..a02a831bcd 100644 --- a/lmdeploy/pytorch/disagg/config.py +++ b/lmdeploy/pytorch/disagg/config.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. import enum -from typing import Optional from pydantic import BaseModel @@ -98,7 +97,7 @@ class DistServeEngineConfig(BaseModel): tp_size: int ep_size: int dp_size: int - pp_size: Optional[int] + pp_size: int | None # Rank of DP dp_rank: int diff --git a/lmdeploy/pytorch/disagg/conn/engine_conn.py b/lmdeploy/pytorch/disagg/conn/engine_conn.py index 0312df05bd..191d5690a2 100644 --- a/lmdeploy/pytorch/disagg/conn/engine_conn.py +++ b/lmdeploy/pytorch/disagg/conn/engine_conn.py @@ -1,18 +1,24 @@ # Copyright (c) OpenMMLab. All rights reserved. import asyncio import os -from typing import TYPE_CHECKING, Dict, List +from typing import TYPE_CHECKING from urllib.parse import urlparse import zmq import zmq.asyncio from lmdeploy.logger import get_logger -from lmdeploy.pytorch.disagg.conn.protocol import (DistServeCacheFreeRequest, DistServeConnectionRequest, - DistServeConnectionResponse, DistServeConnectionStatus, - DistServeDropConnectionRequest, DistServeEngineEndpointInfo, - DistServeInitRequest, DistServeInitResponse, - DistServeKVTransferEndpointInfo) +from lmdeploy.pytorch.disagg.conn.protocol import ( + DistServeCacheFreeRequest, + DistServeConnectionRequest, + DistServeConnectionResponse, + DistServeConnectionStatus, + DistServeDropConnectionRequest, + DistServeEngineEndpointInfo, + DistServeInitRequest, + DistServeInitResponse, + DistServeKVTransferEndpointInfo, +) from lmdeploy.pytorch.engine.executor.dist_utils import find_available_port if TYPE_CHECKING: @@ -25,9 +31,9 @@ class EngineP2PConnection: def __init__(self, engine: 'Engine'): self.engine: Engine = engine - self.p2p_conn_ctx: Dict[str, zmq.asyncio.Context] = {} - self.p2p_sender: Dict[str, zmq.asyncio.Socket] = {} - self.p2p_receiver: Dict[str, zmq.asyncio.Socket] = {} + self.p2p_conn_ctx: dict[str, zmq.asyncio.Context] = {} + self.p2p_sender: dict[str, zmq.asyncio.Socket] = {} + self.p2p_receiver: dict[str, zmq.asyncio.Socket] = {} self.use_unique_kvtransfer_engine = os.environ.get('LMDEPLOY_USE_UNIQUE_KVTRANSFER_ENGINE', False) @@ -44,7 +50,7 @@ def p2p_initialize(self, init_request: DistServeInitRequest): self.p2p_sender[init_request.remote_engine_id] = sender self.p2p_receiver[init_request.remote_engine_id] = receiver - kvtransfer_endpoint_info: List[DistServeKVTransferEndpointInfo] = self.engine.executor.p2p_initialize( + kvtransfer_endpoint_info: list[DistServeKVTransferEndpointInfo] = self.engine.executor.p2p_initialize( init_request) return DistServeInitResponse(engine_endpoint_info=DistServeEngineEndpointInfo(zmq_address=zmq_address), diff --git a/lmdeploy/pytorch/disagg/conn/protocol.py b/lmdeploy/pytorch/disagg/conn/protocol.py index aa47789497..2f6f054577 100644 --- a/lmdeploy/pytorch/disagg/conn/protocol.py +++ b/lmdeploy/pytorch/disagg/conn/protocol.py @@ -1,11 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. import enum -from typing import List, Optional from pydantic import BaseModel -from lmdeploy.pytorch.disagg.config import (DistServeEngineConfig, DistServeNVLinkConfig, DistServeRDMAConfig, - DistServeTCPConfig) +from lmdeploy.pytorch.disagg.config import ( + DistServeEngineConfig, + DistServeNVLinkConfig, + DistServeRDMAConfig, + DistServeTCPConfig, +) class MigrationProtocol(enum.Enum): @@ -39,11 +42,11 @@ class DistServeInitRequest(BaseModel): protocol: MigrationProtocol - rank: Optional[int] = None + rank: int | None = None - tcp_config: Optional[DistServeTCPConfig] = None - rdma_config: Optional[DistServeRDMAConfig] = None - nvlink_config: Optional[DistServeNVLinkConfig] = None + tcp_config: DistServeTCPConfig | None = None + rdma_config: DistServeRDMAConfig | None = None + nvlink_config: DistServeNVLinkConfig | None = None class DistServeEngineEndpointInfo(BaseModel): @@ -63,14 +66,14 @@ class DistServeInitResponse(BaseModel): # To ensure generality (where endpoint_info can be initialization information # for different media such as RDMA, NVLink, etc.), we use a string (str) to # store this information. - kvtransfer_endpoint_info: List[DistServeKVTransferEndpointInfo] + kvtransfer_endpoint_info: list[DistServeKVTransferEndpointInfo] class DistServeConnectionRequest(BaseModel): protocol: MigrationProtocol remote_engine_id: str remote_engine_endpoint_info: DistServeEngineEndpointInfo - remote_kvtransfer_endpoint_info: List[DistServeKVTransferEndpointInfo] + remote_kvtransfer_endpoint_info: list[DistServeKVTransferEndpointInfo] class DistServeConnectionResponse(BaseModel): @@ -83,7 +86,7 @@ class MigrationRequest(BaseModel): remote_engine_id: str remote_session_id: int remote_token_id: int - remote_block_ids: List[int] + remote_block_ids: list[int] is_dummy_prefill: bool = False diff --git a/lmdeploy/pytorch/disagg/conn/proxy_conn.py b/lmdeploy/pytorch/disagg/conn/proxy_conn.py index a07d281248..5ab9c2ff06 100644 --- a/lmdeploy/pytorch/disagg/conn/proxy_conn.py +++ b/lmdeploy/pytorch/disagg/conn/proxy_conn.py @@ -3,16 +3,20 @@ import enum import os from collections import defaultdict -from typing import Dict, Set, Tuple import aiohttp import requests from lmdeploy.logger import get_logger from lmdeploy.pytorch.disagg.config import DistServeEngineConfig, EngineRole -from lmdeploy.pytorch.disagg.conn.protocol import (DistServeCacheFreeRequest, DistServeConnectionRequest, - DistServeConnectionResponse, DistServeDropConnectionRequest, - DistServeInitRequest, DistServeInitResponse) +from lmdeploy.pytorch.disagg.conn.protocol import ( + DistServeCacheFreeRequest, + DistServeConnectionRequest, + DistServeConnectionResponse, + DistServeDropConnectionRequest, + DistServeInitRequest, + DistServeInitResponse, +) from lmdeploy.pytorch.disagg.messages import PDConnectionMessage logger = get_logger('lmdeploy') @@ -65,19 +69,19 @@ class PDConnectionPool: def __init__(self): # all prefill and decode instances # TODO (JimyMa): Maybe encoding instances - self.prefill_endpoints: Set[str] = set() - self.decode_endpoints: Set[str] = set() + self.prefill_endpoints: set[str] = set() + self.decode_endpoints: set[str] = set() # Links of PD Connection. - self.pool: Dict[Tuple[str, str], PDConnectionState] = {} + self.pool: dict[tuple[str, str], PDConnectionState] = {} # put migrating session to `self.migration_session_shelf` for increasing fault tolerance # if a session is finished, then pop it from `self.migration_session_shelf` # if a decode instance is disconnected, then gc all blocks of these sessions in prefill instance. - self.migration_session_shelf: Dict[str, Set[int]] = defaultdict(set) + self.migration_session_shelf: dict[str, set[int]] = defaultdict(set) # conn_perform handler queue - self.waiting_conn: asyncio.Queue[Tuple[PDConnectionMessage, asyncio.Event]] = (asyncio.Queue()) + self.waiting_conn: asyncio.Queue[tuple[PDConnectionMessage, asyncio.Event]] = (asyncio.Queue()) # conn Registry Lock self.conn_lock = asyncio.Lock() @@ -112,10 +116,10 @@ def dereg_instance(self, endpoint: str): # TODO(JimyMa): handle side-effect by kvcache migration self.decode_endpoints.remove(endpoint) - def shelf_prefill_session(self, conn_key: Tuple[str, str], session_id: int): + def shelf_prefill_session(self, conn_key: tuple[str, str], session_id: int): self.migration_session_shelf[conn_key].add(session_id) - def unshelf_prefill_session(self, conn_key: Tuple[str, str], session_id: int): + def unshelf_prefill_session(self, conn_key: tuple[str, str], session_id: int): self.migration_session_shelf[conn_key].remove(session_id) async def connect(self, conn_req: PDConnectionMessage): @@ -264,11 +268,11 @@ def is_connected(self, p_url: str, d_url: str): return False return link.status == PDConnectionStatus.Connected - def drop(self, pd_key: Tuple[str, str]): + def drop(self, pd_key: tuple[str, str]): left = pd_key[0] right = pd_key[1] - def cache_free(server_endpoint, cache_free_request: DistServeCacheFreeRequest) -> Dict: + def cache_free(server_endpoint, cache_free_request: DistServeCacheFreeRequest) -> dict: try: requests.post(get_server_api(server_endpoint, 'distserve/free_cache'), json=cache_free_request.model_dump(mode='json')) diff --git a/lmdeploy/pytorch/disagg/messages.py b/lmdeploy/pytorch/disagg/messages.py index cc29c67b8e..e4c7c0cf36 100644 --- a/lmdeploy/pytorch/disagg/messages.py +++ b/lmdeploy/pytorch/disagg/messages.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import List, Optional, Tuple from pydantic import BaseModel @@ -11,7 +10,7 @@ class MigrationExecutionBatch(BaseModel): """Input of the Migration.""" protocol: MigrationProtocol - requests: List[Tuple[str, List[Tuple[int, int]]]] = [] + requests: list[tuple[str, list[tuple[int, int]]]] = [] class AssignmentInstruct(BaseModel): @@ -26,16 +25,16 @@ class MigrationAssignment(BaseModel): """Migration Assignment.""" protocol: MigrationProtocol remote_engine_id: str - batch: List[AssignmentInstruct] + batch: list[AssignmentInstruct] class PDConnectionMessage(BaseModel): p_url: str d_url: str protocol: MigrationProtocol = MigrationProtocol.RDMA - tcp_config: Optional[DistServeTCPConfig] = None - rdma_config: Optional[DistServeRDMAConfig] = None - nvlink_config: Optional[DistServeNVLinkConfig] = None + tcp_config: DistServeTCPConfig | None = None + rdma_config: DistServeRDMAConfig | None = None + nvlink_config: DistServeNVLinkConfig | None = None class DistServeRegisterMRMessage(BaseModel): diff --git a/lmdeploy/pytorch/distributed.py b/lmdeploy/pytorch/distributed.py index ccdaa62060..7203c7883e 100644 --- a/lmdeploy/pytorch/distributed.py +++ b/lmdeploy/pytorch/distributed.py @@ -1,7 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from dataclasses import dataclass from datetime import timedelta -from typing import List, Optional import torch from torch import distributed as dist @@ -18,8 +17,8 @@ class DistGroup: rank: int = 0 cpu_group: dist.ProcessGroup = None gpu_group: dist.ProcessGroup = None - cpu_groups: List[dist.ProcessGroup] = None - gpu_groups: List[dist.ProcessGroup] = None + cpu_groups: list[dist.ProcessGroup] = None + gpu_groups: list[dist.ProcessGroup] = None gpu_gather_group: dist.ProcessGroup = None def close(self): @@ -197,7 +196,7 @@ class DistContext: cpu_group: dist.ProcessGroup = None ep_gpu_group: dist.ProcessGroup = None - ep_gpu_groups: List[dist.ProcessGroup] = None + ep_gpu_groups: list[dist.ProcessGroup] = None dist_config: DistConfig = None @classmethod @@ -303,7 +302,7 @@ def get_world_rank(): return world_size, rank -def get_tp_world_rank(layer_type: Optional[str] = None): +def get_tp_world_rank(layer_type: str | None = None): ctx = get_dist_manager().current_context() if layer_type is None: return ctx.dist_config.tp, ctx.tp_group.rank @@ -416,8 +415,8 @@ def reduce_scatter(output, input_list, op=ReduceOp.SUM, group='tp', async_op=Fal def gather_by_tp_sizes(x: torch.Tensor, - tp_sizes: List[int], - group: Optional[dist.ProcessGroup] = None, + tp_sizes: list[int], + group: dist.ProcessGroup | None = None, async_op: bool = False): """Gather input.""" assert all(size >= 0 for size in tp_sizes), f'Invalid tp sizes: {tp_sizes}' @@ -430,7 +429,7 @@ def gather_by_tp_sizes(x: torch.Tensor, return new_x -def reduce_scatter_by_tp_sizes(out: torch.Tensor, rank: int, tp_sizes: List[int], group: dist.ProcessGroup): +def reduce_scatter_by_tp_sizes(out: torch.Tensor, rank: int, tp_sizes: list[int], group: dist.ProcessGroup): """Reduce scatter.""" attn_tp = get_dist_manager().current_config().attn_tp outs = list(out.split(tp_sizes, -2)) diff --git a/lmdeploy/pytorch/engine/base.py b/lmdeploy/pytorch/engine/base.py index cfe34327ba..590a1ec82a 100644 --- a/lmdeploy/pytorch/engine/base.py +++ b/lmdeploy/pytorch/engine/base.py @@ -1,6 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. -from lmdeploy.pytorch.disagg.conn.protocol import (DistServeConnectionRequest, DistServeDropConnectionRequest, - DistServeInitRequest) +from lmdeploy.pytorch.disagg.conn.protocol import ( + DistServeConnectionRequest, + DistServeDropConnectionRequest, + DistServeInitRequest, +) class EngineBase: diff --git a/lmdeploy/pytorch/engine/cache_engine.py b/lmdeploy/pytorch/engine/cache_engine.py index 302465192f..475343e7d8 100644 --- a/lmdeploy/pytorch/engine/cache_engine.py +++ b/lmdeploy/pytorch/engine/cache_engine.py @@ -2,8 +2,9 @@ # modify from: https://github.com/vllm-project/vllm import json import math +from collections.abc import Sequence from dataclasses import dataclass -from typing import Dict, List, Literal, Optional, Sequence, Tuple +from typing import Literal import torch @@ -11,13 +12,17 @@ from lmdeploy.pytorch.disagg.backend.backend import MIGRATION_BACKENDS from lmdeploy.pytorch.disagg.backend.base import MigrationBackendImpl from lmdeploy.pytorch.disagg.conn.protocol import DistServeInitRequest, DistServeKVTransferEndpointInfo -from lmdeploy.pytorch.disagg.messages import (AssignmentInstruct, DistServeRegisterMRMessage, MigrationAssignment, - MigrationExecutionBatch) +from lmdeploy.pytorch.disagg.messages import ( + AssignmentInstruct, + DistServeRegisterMRMessage, + MigrationAssignment, + MigrationExecutionBatch, +) from lmdeploy.utils import get_logger from ..config import CacheConfig, ModelConfig -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = tuple[torch.Tensor, torch.Tensor] logger = get_logger('lmdeploy') @@ -30,7 +35,7 @@ def round_up(x: int, alignment: int) -> int: @dataclass class CacheDesc: """Cache description.""" - shape: List[int] + shape: list[int] dtype: torch.dtype alignment: int = 256 @@ -98,7 +103,7 @@ def __init__( self.local_gpu_cache = self.allocate_gpu_cache() self.local_cpu_cache = self.allocate_cpu_cache() - self.migration_backend_impl: Optional[MigrationBackendImpl] = None + self.migration_backend_impl: MigrationBackendImpl | None = None # Initialize the stream for caching operations. self.cache_stream = cache_stream or torch.cuda.Stream() @@ -238,7 +243,7 @@ def get_quant_cache_descs(cls, k_cache_desc: CacheDesc, v_cache_desc: CacheDesc, return [key_scale_zero_desc, val_scale_zero_desc] @classmethod - def get_custom_cache_descs(cls, model_config: ModelConfig, cache_config: CacheConfig) -> List[CacheDesc]: + def get_custom_cache_descs(cls, model_config: ModelConfig, cache_config: CacheConfig) -> list[CacheDesc]: """Get custom cache descs.""" if len(model_config.cache_shapes) == 0: return [] @@ -310,7 +315,7 @@ def allocate_cpu_cache(self): return self.local_cpu_cache @staticmethod - def get_custom_cache_shape_impl(num_layers: int, num_blocks: int, block_size: int, shape: List[int]): + def get_custom_cache_shape_impl(num_layers: int, num_blocks: int, block_size: int, shape: list[int]): """Get single block shape.""" return (num_layers, num_blocks, block_size, *shape) @@ -335,13 +340,13 @@ def allocate_custom_cache(self, device: str): return custom_caches @torch.inference_mode() - def _swap(self, src: List[torch.Tensor], dst: List[torch.Tensor], src_to_dst: Dict[int, int]): + def _swap(self, src: list[torch.Tensor], dst: list[torch.Tensor], src_to_dst: dict[int, int]): """Move caches from src memory to dst memory. Args: - src (List[KVCache]): Source cache. - dst (List[KVCache]): Destination cache. - src_to_dst (Dict[int, int]): Map between src and dst. + src (list[KVCache]): Source cache. + dst (list[KVCache]): Destination cache. + src_to_dst (dict[int, int]): Map between src and dst. """ BLOCKS_PER_COPY = 2 num_copy = len(src_to_dst) @@ -357,19 +362,19 @@ def _swap(self, src: List[torch.Tensor], dst: List[torch.Tensor], src_to_dst: Di dcache.index_copy_(1, didx, sdata.to(dcache.device)) self.events.record(stream=self.cache_stream) - def swap_in(self, src_to_dst: Dict[int, int]) -> None: + def swap_in(self, src_to_dst: dict[int, int]) -> None: """Move cache from Host to Device. Args: - src_to_dst (Dict[int, int]): Map between src and dst. + src_to_dst (dict[int, int]): Map between src and dst. """ self._swap([self.full_cpu_cache], [self.full_gpu_cache], src_to_dst) - def swap_out(self, src_to_dst: Dict[int, int]) -> None: + def swap_out(self, src_to_dst: dict[int, int]) -> None: """Move cache from Device to Host. Args: - src_to_dst (Dict[int, int]): Map between src and dst. + src_to_dst (dict[int, int]): Map between src and dst. """ self._swap([self.full_gpu_cache], [self.full_cpu_cache], src_to_dst) @@ -417,7 +422,7 @@ def p2p_initialize(self, migration_init_request: DistServeInitRequest) -> DistSe migration_init_request.remote_engine_id, migration_init_request.protocol))) - def p2p_connect(self, remote_engine_id: str, migration_conn_request: List[DistServeKVTransferEndpointInfo]): + def p2p_connect(self, remote_engine_id: str, migration_conn_request: list[DistServeKVTransferEndpointInfo]): self.migration_backend_impl.p2p_connect(remote_engine_id, migration_conn_request[self.tp_rank]) async def migrate(self, migration_execution_inputs: MigrationExecutionBatch): @@ -434,7 +439,7 @@ def get_assignment_batch(mr_key, block_ids, assignment_len, layer_stride, remote for block_id in block_ids ] - assignment_batch: List[Tuple[str, int, int, int]] = [] # mr_key, target, source, offset + assignment_batch: list[tuple[str, int, int, int]] = [] # mr_key, target, source, offset for migration_exe_req in migration_execution_inputs.requests: remote_engine_id = migration_exe_req[0] blocks_to_migration = migration_exe_req[1] @@ -466,7 +471,7 @@ def __init__(self, cache_config: CacheConfig): device='cuda') @staticmethod - def allocate_caches(num_caches: int, state_shapes: List[Tuple[Tuple[int], torch.dtype]], device: torch.device): + def allocate_caches(num_caches: int, state_shapes: list[tuple[tuple[int], torch.dtype]], device: torch.device): """Allocate cache implement.""" if len(state_shapes) == 0 or num_caches == 0: @@ -492,11 +497,11 @@ def allocate_caches(num_caches: int, state_shapes: List[Tuple[Tuple[int], torch. return mem_pool, caches @staticmethod - def get_cache_state_size(state_shapes: List[Tuple[Tuple[int], torch.dtype]]) -> int: + def get_cache_state_size(state_shapes: list[tuple[tuple[int], torch.dtype]]) -> int: """Get the required cache size of the state cache. Args: - state_shapes (List[Tuple[Tuple[int], torch.dtype]]): The shapes and dtypes of the states. + state_shapes (list[tuple[tuple[int], torch.dtype]]): The shapes and dtypes of the states. Return: int: Required memory size in bytes. diff --git a/lmdeploy/pytorch/engine/config_builder.py b/lmdeploy/pytorch/engine/config_builder.py index b97ec46d32..7c7ab6c3d0 100644 --- a/lmdeploy/pytorch/engine/config_builder.py +++ b/lmdeploy/pytorch/engine/config_builder.py @@ -3,8 +3,14 @@ import os from lmdeploy.messages import PytorchEngineConfig, SpeculativeConfig -from lmdeploy.pytorch.config import (BackendConfig, CacheConfig, DistConfig, MiscConfig, SchedulerConfig, - SpecDecodeConfig) +from lmdeploy.pytorch.config import ( + BackendConfig, + CacheConfig, + DistConfig, + MiscConfig, + SchedulerConfig, + SpecDecodeConfig, +) from lmdeploy.utils import get_logger, get_max_batch_size, get_model diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index 71f061f5ef..b7d3bd7f84 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -3,7 +3,7 @@ import gc import os from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Union +from typing import Any import numpy as np import torch @@ -11,8 +11,11 @@ from lmdeploy.messages import PytorchEngineConfig, RequestMetrics, ResponseType, SpeculativeConfig from lmdeploy.pytorch.disagg.config import EngineRole from lmdeploy.pytorch.disagg.conn.engine_conn import EngineP2PConnection -from lmdeploy.pytorch.disagg.conn.protocol import (DistServeConnectionRequest, DistServeDropConnectionRequest, - DistServeInitRequest) +from lmdeploy.pytorch.disagg.conn.protocol import ( + DistServeConnectionRequest, + DistServeDropConnectionRequest, + DistServeInitRequest, +) from lmdeploy.utils import get_logger, get_model from ..adapter.adapter import AdapterManager @@ -28,7 +31,7 @@ logger = get_logger('lmdeploy') -SeqList = List[SchedulerSequence] +SeqList = list[SchedulerSequence] @dataclass @@ -37,7 +40,7 @@ class InferOutput: session_id: int resp: Response - token_ids: Union[np.ndarray, List[int]] + token_ids: np.ndarray | list[int] meta: Any = None finish: bool = False logits: torch.Tensor = None @@ -45,7 +48,7 @@ class InferOutput: # send cache blocks back for migration in Disaggregated LLM Serving # when Prefill Engine is Done. - cache_block_ids: List[int] = None + cache_block_ids: list[int] = None # for logging req_metrics: RequestMetrics = None @@ -229,7 +232,7 @@ def from_pretrained(cls, speculative_config=speculative_config, ) - def _download_adapters(self, adapters: Dict[str, str], engine_config: PytorchEngineConfig): + def _download_adapters(self, adapters: dict[str, str], engine_config: PytorchEngineConfig): """Download adapters.""" download_dir = engine_config.download_dir revision = engine_config.revision @@ -274,7 +277,7 @@ def _get_max_session_len(self): session_len = min(max_tokens, session_len) return session_len - def _on_add_session(self, reqs: List[Request], **kwargs): + def _on_add_session(self, reqs: list[Request], **kwargs): """On add session callback.""" for req in reqs: session_id = req.data['session_id'] @@ -286,7 +289,7 @@ def _on_add_session(self, reqs: List[Request], **kwargs): if resp: self._response(req.resp, resp_type) - def _on_stop_session(self, reqs: List[Request], **kwargs): + def _on_stop_session(self, reqs: list[Request], **kwargs): """On stop session callback.""" for req in reqs: session_id = req.data['session_id'] @@ -305,7 +308,7 @@ def _on_stop_session(self, reqs: List[Request], **kwargs): if resp: self._response(req.resp, resp_type) - def _on_end_session(self, reqs: List[Request], **kwargs): + def _on_end_session(self, reqs: list[Request], **kwargs): """On end session callback.""" for req in reqs: session_id = req.data['session_id'] @@ -321,7 +324,7 @@ def _on_end_session(self, reqs: List[Request], **kwargs): if resp: self._response(req.resp, resp_type) - def _on_add_message(self, reqs: List[Request], **kwargs): + def _on_add_message(self, reqs: list[Request], **kwargs): """On add message callback.""" valid_reqs = [] for req in reqs: @@ -359,7 +362,7 @@ def _on_add_message(self, reqs: List[Request], **kwargs): if len(valid_reqs) > 0: self._add_message(valid_reqs) - def _add_message(self, reqs: List[Request]): + def _add_message(self, reqs: list[Request]): def __update_max_new_tokens(msg): """Update max new tokens.""" @@ -440,7 +443,7 @@ def sleep(self, level: int = 1): """Sleep.""" self.executor.sleep(level) - def wakeup(self, tags: Optional[List[str]] = None): + def wakeup(self, tags: list[str] | None = None): """Wakeup.""" self.executor.wakeup(tags) diff --git a/lmdeploy/pytorch/engine/engine_instance.py b/lmdeploy/pytorch/engine/engine_instance.py index c85975425a..c2bbc03420 100644 --- a/lmdeploy/pytorch/engine/engine_instance.py +++ b/lmdeploy/pytorch/engine/engine_instance.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, List +from typing import Any from lmdeploy.messages import EngineOutput, GenerationConfig from lmdeploy.utils import get_logger @@ -11,7 +11,7 @@ logger = get_logger('lmdeploy') -InputMultiModalType = List[Dict[str, Any]] +InputMultiModalType = list[dict[str, Any]] def _check_resp(resp: Response, state: ResponseType, warning_msg: str = None): @@ -125,7 +125,7 @@ def _try_add_session(self, session_id: int): async def async_stream_infer(self, session_id: int, - input_ids: List[int], + input_ids: list[int], gen_config: GenerationConfig = None, multimodal: InputMultiModalType = None, adapter_name: str = None, @@ -134,13 +134,13 @@ async def async_stream_infer(self, Args: session_id (int): The session id. - input_ids (List[int]): The input token ids. + input_ids (list[int]): The input token ids. gen_config (GenerationConfig): The sampling parameters. adapter_name (str): The lora adapter name. Yields: int: Error flags. 0 if success. - List[int]: The streaming output tokens. + list[int]: The streaming output tokens. int: The number of the output tokens. """ if len(input_ids) > self.max_input_len: @@ -210,7 +210,7 @@ async def async_stream_infer(self, async def async_infer(self, session_id: int, - input_ids: List[int] = None, + input_ids: list[int] = None, multimodal: InputMultiModalType = None, gen_config: GenerationConfig = None, **kwargs): @@ -218,12 +218,12 @@ async def async_infer(self, Args: session_id (int): The session id. - input_ids (List[int]): The input token ids. + input_ids (list[int]): The input token ids. gen_config (GenerationConfig): The sampling parameters. Returns: int: Error flags. 0 if success. - List[int]: The streaming output tokens. + list[int]: The streaming output tokens. int: The number of the output tokens. """ async for outputs in self.async_stream_infer(session_id, @@ -239,7 +239,7 @@ async def async_infer(self, def stream_infer(self, session_id: int, - input_ids: List[int], + input_ids: list[int], multimodal: InputMultiModalType = None, gen_config: GenerationConfig = None, adapter_name: str = None, @@ -248,13 +248,13 @@ def stream_infer(self, Args: session_id (int): The session id. - input_ids (List[int]): The input token ids. + input_ids (list[int]): The input token ids. gen_config (GenerationConfig): The sampling parameters. adapter_name (str): The lora adapter name. Yields: int: Error flags. 0 if success. - List[int]: The streaming output tokens. + list[int]: The streaming output tokens. int: The number of the output tokens. """ @@ -276,7 +276,7 @@ def __call_async(): def infer(self, session_id: int, - input_ids: List[int] = None, + input_ids: list[int] = None, multimodal: InputMultiModalType = None, gen_config: GenerationConfig = None, **kwargs): @@ -284,12 +284,12 @@ def infer(self, Args: session_id (int): The session id. - input_ids (List[int]): The input token ids. + input_ids (list[int]): The input token ids. gen_config (GenerationConfig): The sampling parameters. Returns: int: Error flags. 0 if success. - List[int]: The streaming output tokens. + list[int]: The streaming output tokens. int: The number of the output tokens. """ return self.req_sender.run_until_complete( diff --git a/lmdeploy/pytorch/engine/engine_loop.py b/lmdeploy/pytorch/engine/engine_loop.py index d0b6a5e2d6..72956611a4 100644 --- a/lmdeploy/pytorch/engine/engine_loop.py +++ b/lmdeploy/pytorch/engine/engine_loop.py @@ -3,7 +3,7 @@ import logging import time from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple +from typing import TYPE_CHECKING, Any, Optional import numpy as np import torch @@ -83,7 +83,7 @@ class EngineLoopConfig: This config is added for Dependency Injection """ role: EngineRole - num_speculative_tokens: Optional[int] = None + num_speculative_tokens: int | None = None enable_metrics: bool = False enable_transfer_obj_ref: bool = False @@ -123,7 +123,7 @@ def __init__(self, self.engine_conn = engine_conn # tasks and control events - self.tasks: Set[asyncio.Task] = set() + self.tasks: set[asyncio.Task] = set() self.stop_event = asyncio.Event() self.resp_queue = asyncio.Queue() self.forward_event = CounterEvent() @@ -141,7 +141,7 @@ async def preprocess_loop(self): self.has_runable_event.set() @staticmethod - def _log_resps(outputs: List[InferOutput]): + def _log_resps(outputs: list[InferOutput]): """Log resps.""" if logger.level <= logging.DEBUG: session_ids = [out.session_id for out in outputs] @@ -166,7 +166,7 @@ def _send_resp(self, out: InferOutput): logprobs=logprobs)) @staticmethod - def _update_logprobs(step_outputs: List[InferOutput]): + def _update_logprobs(step_outputs: list[InferOutput]): for out in step_outputs: cur_logprobs = out.logprobs if cur_logprobs is None: @@ -183,7 +183,7 @@ def _update_logprobs(step_outputs: List[InferOutput]): logprobs = out.resp.data['logprobs'] logprobs.append(cur_logprobs) - def _send_resps(self, step_outputs: List[InferOutput]): + def _send_resps(self, step_outputs: list[InferOutput]): """Send response callback.""" self._log_resps(step_outputs) self._update_logprobs(step_outputs) @@ -218,7 +218,7 @@ def _make_infer_outputs( ): """Make infer output.""" - def __get_logit(msg, logits: torch.Tensor, seq_length: List[int], idx: int): + def __get_logit(msg, logits: torch.Tensor, seq_length: list[int], idx: int): logit = logits.split(seq_length)[idx] if len(msg.all_logits) > 0: # for chunked long context @@ -253,7 +253,7 @@ def __get_logit(msg, logits: torch.Tensor, seq_length: List[int], idx: int): delta=delta) # generate output - outputs: Dict[int, InferOutput] = dict() + outputs: dict[int, InferOutput] = dict() for idx, msg in enumerate(running): if not is_run[idx]: continue @@ -310,7 +310,7 @@ async def _main_loop_try_send_next_inputs(self): async def _main_loop_get_outputs( self, running: 'SeqList', - forward_inputs: Dict[str, Any], + forward_inputs: dict[str, Any], ): """Get outputs and prefetch.""" model_inputs = forward_inputs['inputs'] @@ -363,7 +363,7 @@ async def __no_running_warning(): has_runable_event.set() def update_running_migration(self, running: 'SeqList', next_token_ids: np.ndarray, stopped: torch.Tensor, - model_metas: List[Dict[str, Any]]): + model_metas: list[dict[str, Any]]): """Update scheduler.""" if model_metas is None: model_metas = [None] * len(running) @@ -386,7 +386,7 @@ async def _migration_loop_migrate(self, migration_ready: 'SeqList'): if msg.migration_request.is_dummy_prefill: continue - migration_execution_requests: List[Tuple[int, List[Tuple[int, int]]]] = [] + migration_execution_requests: list[tuple[int, list[tuple[int, int]]]] = [] migration_request = msg.migration_request prefill_block_ids = migration_request.remote_block_ids decode_block_ids = list(self.scheduler.block_manager.get_block_table(msg=msg)) @@ -409,7 +409,7 @@ async def _migration_loop_migrate(self, migration_ready: 'SeqList'): async def _migration_loop_get_outputs(self, migration_ready: 'SeqList'): """Migration loop get outputs.""" - outputs: Dict[int, InferOutput] = dict() + outputs: dict[int, InferOutput] = dict() for _, msg in enumerate(migration_ready): session_id = msg.session_id msg.resp.type = ResponseType.SUCCESS diff --git a/lmdeploy/pytorch/engine/executor/__init__.py b/lmdeploy/pytorch/engine/executor/__init__.py index 96c624efb1..a2f0e7e91d 100644 --- a/lmdeploy/pytorch/engine/executor/__init__.py +++ b/lmdeploy/pytorch/engine/executor/__init__.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from logging import Logger -from typing import Dict from lmdeploy.pytorch import envs from lmdeploy.pytorch.config import BackendConfig, CacheConfig, DistConfig, MiscConfig, ModelConfig, SpecDecodeConfig @@ -59,7 +58,7 @@ def build_executor( backend_config: BackendConfig, dist_config: DistConfig, misc_config: MiscConfig, - adapters: Dict[str, str] = None, + adapters: dict[str, str] = None, device_type: str = 'cuda', distributed_executor_backend: str = None, dtype: str = 'auto', diff --git a/lmdeploy/pytorch/engine/executor/base.py b/lmdeploy/pytorch/engine/executor/base.py index b5d560e5a1..77873c2ac5 100644 --- a/lmdeploy/pytorch/engine/executor/base.py +++ b/lmdeploy/pytorch/engine/executor/base.py @@ -2,7 +2,7 @@ # Inspired by vLLM: https://github.com/vllm-project/vllm import asyncio import contextlib -from typing import Any, Dict, List, Optional +from typing import Any from lmdeploy.pytorch.config import BackendConfig, CacheConfig, DistConfig, MiscConfig, ModelConfig, SpecDecodeConfig from lmdeploy.pytorch.disagg.conn.protocol import DistServeInitRequest, DistServeKVTransferEndpointInfo @@ -23,7 +23,7 @@ def __init__(self, backend_config: BackendConfig, dist_config: DistConfig, misc_config: MiscConfig, - adapters: Dict[str, str] = None, + adapters: dict[str, str] = None, specdecode_config: SpecDecodeConfig = None, device_type: str = 'cuda'): """Initialize Executor.""" @@ -78,7 +78,7 @@ async def sleep(self, level: int = 1): """Sleep.""" raise NotImplementedError('Not Implemented.') - def wakeup(self, tags: Optional[List[str]] = None): + def wakeup(self, tags: list[str] | None = None): """Wakeup.""" raise NotImplementedError('Not Implemented.') @@ -120,7 +120,7 @@ def p2p_initialize(self, remote_engine_config: DistServeInitRequest): """Init rdma link.""" raise NotImplementedError('Not implemented') - def p2p_connect(self, conn_request: List[DistServeKVTransferEndpointInfo]): + def p2p_connect(self, conn_request: list[DistServeKVTransferEndpointInfo]): """rdma_connect.""" raise NotImplementedError('Not Implemented') diff --git a/lmdeploy/pytorch/engine/executor/base_worker.py b/lmdeploy/pytorch/engine/executor/base_worker.py index 40ace7defc..d78ab9867f 100644 --- a/lmdeploy/pytorch/engine/executor/base_worker.py +++ b/lmdeploy/pytorch/engine/executor/base_worker.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import asyncio import gc -from typing import Any, Dict, List, Optional +from typing import Any from lmdeploy.pytorch.backends.selector import get_backend from lmdeploy.pytorch.config import BackendConfig, CacheConfig, DistConfig, MiscConfig, ModelConfig, SpecDecodeConfig @@ -28,7 +28,7 @@ def __init__( model_config: ModelConfig, dist_config: DistConfig, misc_config: MiscConfig, - adapters: Dict[str, str] = None, + adapters: dict[str, str] = None, device_type: str = 'cuda', log_level: int = 30, specdecode_config: SpecDecodeConfig = None, @@ -66,7 +66,7 @@ def init_process_group(self, rank: int, master_addr: str = None, master_port: st ccl_backend = get_backend(self.device_type).ccl_backend() self.dist_ctx = DistContext.build(self.rank, self.dist_config, ccl_backend) - def pack_output(self, output: Dict): + def pack_output(self, output: dict): """Pack output.""" return output @@ -123,7 +123,7 @@ async def sleep(self, level: int = 1): """Sleep.""" await self.model_agent.sleep(level) - def wakeup(self, tags: Optional[List[str]] = None): + def wakeup(self, tags: list[str] | None = None): """Wakeup.""" self.model_agent.wakeup(tags) @@ -175,7 +175,7 @@ def release(self): def p2p_initialize(self, init_request: DistServeInitRequest): return self.model_agent.cache_engine.p2p_initialize(init_request) - def p2p_connect(self, remote_engine_id: str, conn_request: List[DistServeKVTransferEndpointInfo]): + def p2p_connect(self, remote_engine_id: str, conn_request: list[DistServeKVTransferEndpointInfo]): return self.model_agent.cache_engine.p2p_connect(remote_engine_id, conn_request) async def migrate(self, inputs: MigrationExecutionBatch): diff --git a/lmdeploy/pytorch/engine/executor/mp_executor.py b/lmdeploy/pytorch/engine/executor/mp_executor.py index e53d39ef30..9f457eec3c 100644 --- a/lmdeploy/pytorch/engine/executor/mp_executor.py +++ b/lmdeploy/pytorch/engine/executor/mp_executor.py @@ -8,7 +8,7 @@ import struct from contextlib import asynccontextmanager, contextmanager from multiprocessing.context import SpawnContext -from typing import Any, Dict, List, Tuple +from typing import Any import torch import torch.distributed as dist @@ -224,7 +224,7 @@ def __init__(self, backend_config: BackendConfig, dist_config: DistConfig, misc_config: MiscConfig, - adapters: Dict[str, str] = None, + adapters: dict[str, str] = None, specdecode_config: SpecDecodeConfig = None, device_type: str = 'cuda'): """Initialize Executor.""" @@ -247,8 +247,8 @@ def __init__(self, self.comm_buf_name = self.comm_buf.name() logger.info('Creating processes.') - self.procs: List[ExecutorProc] = [] - self.ret_bufs: List[SharedBuffer] = [] + self.procs: list[ExecutorProc] = [] + self.ret_bufs: list[SharedBuffer] = [] for proc_id in range(self.world_size): proc = ExecutorProc(proc_id=proc_id, mp_ctx=mp_ctx) @@ -285,8 +285,8 @@ def signal_handler(signum, frame): def collective_rpc(self, method: str, - args: Tuple[Any] = None, - kwargs: Dict[str, Any] = None, + args: tuple[Any] = None, + kwargs: dict[str, Any] = None, receiver_mask: int = 0xff, return_mask: int = 0xff): """Collective rpc.""" @@ -314,8 +314,8 @@ def collective_rpc(self, async def collective_rpc_async(self, method: str, - args: Tuple[Any] = None, - kwargs: Dict[str, Any] = None, + args: tuple[Any] = None, + kwargs: dict[str, Any] = None, receiver_mask: int = 0xff, return_mask: int = 0xff): """Collective rpc.""" @@ -433,7 +433,7 @@ def __init__( dist_config: DistConfig, misc_config: MiscConfig, specdecode_config: SpecDecodeConfig = None, - adapters: Dict[str, str] = None, + adapters: dict[str, str] = None, device_type: str = 'cuda', log_level: int = 30, ): @@ -496,7 +496,7 @@ def _main_loop( dist_config: DistConfig, misc_config: MiscConfig, specdecode_config: SpecDecodeConfig = None, - adapters: Dict[str, str] = None, + adapters: dict[str, str] = None, device_type: str = 'cuda', log_level: int = 30, ): @@ -554,7 +554,7 @@ def handle_sigterm(signum, frame): dist.destroy_process_group() @staticmethod - async def _task_wrapper(func, args: List, kwargs: Dict, need_return: bool, ret_buf: SharedBuffer): + async def _task_wrapper(func, args: list, kwargs: dict, need_return: bool, ret_buf: SharedBuffer): ret = await func(*args, **kwargs) if need_return: await ret_buf.send_async(ret) diff --git a/lmdeploy/pytorch/engine/executor/ray_executor.py b/lmdeploy/pytorch/engine/executor/ray_executor.py index 52eb463e15..96bd489fd1 100644 --- a/lmdeploy/pytorch/engine/executor/ray_executor.py +++ b/lmdeploy/pytorch/engine/executor/ray_executor.py @@ -3,7 +3,7 @@ import contextlib import json import os -from typing import Any, Dict, List, Optional, Tuple +from typing import Any import ray import ray.exceptions @@ -50,12 +50,12 @@ def get_ascend_device_rank_mapping(master_addr): rank_table_file = _envs.ascend_rank_table_file if not rank_table_file: raise ValueError('ASCEND_RANK_TABLE_FILE_PATH is not set') - with open(rank_table_file, 'r') as f: + with open(rank_table_file) as f: rank_table = json.load(f) try: assert master_addr == rank_table['server_list'][0]['server_id'], 'Master address does not match rank table' - rank_mapping: Dict[int, int] = {} - worker_ip_by_rank: Dict[int, str] = {} + rank_mapping: dict[int, int] = {} + worker_ip_by_rank: dict[int, str] = {} for server in rank_table['server_list']: node_ip = server['server_id'] for idx, device in enumerate(server['device']): @@ -82,7 +82,7 @@ def get_ascend_device_rank_mapping(master_addr): return rank_mapping, worker_ips, envs -def _update_env_cuda_alloc_conf(env_vars: Dict): +def _update_env_cuda_alloc_conf(env_vars: dict): """Update runtime env for CUDA alloc conf.""" cuda_alloc_conf = os.getenv('PYTORCH_CUDA_ALLOC_CONF', None) if cuda_alloc_conf is None: @@ -105,17 +105,17 @@ def _update_env_cuda_alloc_conf(env_vars: Dict): env_vars['PYTORCH_CUDA_ALLOC_CONF'] = cuda_alloc_conf -def _update_runtime_envs(runtime_env: Dict): +def _update_runtime_envs(runtime_env: dict): """Update runtime envs.""" new_envs = _envs.get_all_envs() - env_vars: Dict = runtime_env.get('env_vars', {}) + env_vars: dict = runtime_env.get('env_vars', {}) env_vars.update(new_envs) _update_env_cuda_alloc_conf(env_vars) runtime_env['env_vars'] = env_vars return runtime_env -def _update_runtime_env_nsys(runtime_env: Dict): +def _update_runtime_env_nsys(runtime_env: dict): """Update runtime env for nsys.""" nsight_env = { 't': 'cuda,cudnn,cublas,nvtx', @@ -163,7 +163,7 @@ def __init__( model_config: ModelConfig, dist_config: DistConfig, misc_config: MiscConfig, - adapters: Dict[str, str] = None, + adapters: dict[str, str] = None, device_type: str = 'cuda', dtype: str = 'auto', log_level: int = 30, @@ -191,7 +191,7 @@ def set_device(self, local_rank): """Set worker local rank.""" torch.cuda.set_device(local_rank) - def set_env(self, envs: Dict[str, str]): + def set_env(self, envs: dict[str, str]): for key, value in envs.items(): os.environ[key] = value @@ -211,7 +211,7 @@ def warmup_dist(self): tmp = torch.empty((1, ), device='cuda') all_reduce(tmp, group=group) - def pack_output(self, output: Dict): + def pack_output(self, output: dict): """Pack output.""" return output.to_numpy() @@ -239,7 +239,7 @@ def __init__( backend_config: BackendConfig, dist_config: DistConfig, misc_config: MiscConfig, - adapters: Dict[str, str] = None, + adapters: dict[str, str] = None, device_type: str = 'cuda', dtype: str = 'auto', specdecode_config: SpecDecodeConfig = None, @@ -311,8 +311,8 @@ def __init__( def collective_rpc(self, method: str, - args: Tuple[Any] = None, - kwargs: Dict[str, Any] = None, + args: tuple[Any] = None, + kwargs: dict[str, Any] = None, timeout: float = None): """Collective rpc.""" if args is None: @@ -357,7 +357,7 @@ def sleep(self, level: int = 1): """Sleep.""" self.collective_rpc('sleep', (level, )) - def wakeup(self, tags: Optional[List[str]] = None): + def wakeup(self, tags: list[str] | None = None): """Wakeup.""" if tags is None or 'kv_cache' in tags: self.update_configs() @@ -514,11 +514,11 @@ def remote_log(self, msg: str): handle = ray.get(handle_ref) ray.get(self.workers[0].remote_log_end.remote(handle)) - def _sort_workers(self, driver_ip: str, workers: List[RayWorkerWrapper]): + def _sort_workers(self, driver_ip: str, workers: list[RayWorkerWrapper]): """Sort workers by ip.""" worker_ips = ray.get([worker.get_node_ip.remote() for worker in workers]) - ip_counts: Dict[str, int] = {} + ip_counts: dict[str, int] = {} for ip in worker_ips: ip_counts[ip] = ip_counts.get(ip, 0) + 1 @@ -544,7 +544,7 @@ def sort_by_driver_then_worker_ip(item): workers = [item[0] for item in sorted_worker_ip_map] return workers - def _sort_workers_by_ip(self, ips, workers: List[RayWorkerWrapper]): + def _sort_workers_by_ip(self, ips, workers: list[RayWorkerWrapper]): worker_ips = ray.get([worker.get_node_ip.remote() for worker in workers]) if len(ips) != len(workers): @@ -661,7 +661,7 @@ def _init_ascend_distributed_environment(self, driver_ip): def p2p_initialize(self, init_request: DistServeInitRequest): return self.collective_rpc('p2p_initialize', (init_request, )) - def p2p_connect(self, remote_engine_id: str, conn_request: List[DistServeKVTransferEndpointInfo]): + def p2p_connect(self, remote_engine_id: str, conn_request: list[DistServeKVTransferEndpointInfo]): """Rdma connect.""" return self.collective_rpc('p2p_connect', ( remote_engine_id, diff --git a/lmdeploy/pytorch/engine/executor/uni_executor.py b/lmdeploy/pytorch/engine/executor/uni_executor.py index 423ea144a1..34c7412ee6 100644 --- a/lmdeploy/pytorch/engine/executor/uni_executor.py +++ b/lmdeploy/pytorch/engine/executor/uni_executor.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. import asyncio -from typing import Dict, List from lmdeploy.pytorch.config import BackendConfig, CacheConfig, DistConfig, MiscConfig, ModelConfig, SpecDecodeConfig from lmdeploy.pytorch.devices import DeviceContext @@ -24,7 +23,7 @@ def __init__( cache_config: CacheConfig, backend_config: BackendConfig, misc_config: MiscConfig, - adapters: Dict[str, str] = None, + adapters: dict[str, str] = None, device_type: str = 'cuda', specdecode_config: SpecDecodeConfig = None, ): @@ -122,7 +121,7 @@ def p2p_initialize(self, init_request: DistServeInitRequest): """ return [self.model_agent.cache_engine.p2p_initialize(init_request)] - def p2p_connect(self, remote_engine_id: str, conn_request: List[DistServeKVTransferEndpointInfo]): + def p2p_connect(self, remote_engine_id: str, conn_request: list[DistServeKVTransferEndpointInfo]): """rdma_connect.""" self.model_agent.cache_engine.p2p_connect(remote_engine_id, conn_request) diff --git a/lmdeploy/pytorch/engine/guided_process.py b/lmdeploy/pytorch/engine/guided_process.py index c1bf7c920f..506ebc74a9 100644 --- a/lmdeploy/pytorch/engine/guided_process.py +++ b/lmdeploy/pytorch/engine/guided_process.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import json import logging -from typing import Any, Dict, List, Optional, Tuple +from typing import Any import torch import xgrammar as xgr @@ -13,7 +13,7 @@ class GuidedDecodingManager: processors = {} - def __init__(self, tokenizer: PreTrainedTokenizerBase, vocab_size: Optional[int]): + def __init__(self, tokenizer: PreTrainedTokenizerBase, vocab_size: int | None): if vocab_size is None: vocab_size = tokenizer.vocab_size @@ -21,15 +21,15 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase, vocab_size: Optional[int] self.compiler = xgr.GrammarCompiler(tokenizer_info) self.vocab_size = vocab_size - def get_processors(self, session_ctx: List[Dict[str, Any]], - response_formats: Tuple[Dict]) -> Dict[int, xgr.GrammarMatcher]: + def get_processors(self, session_ctx: list[dict[str, Any]], + response_formats: tuple[dict]) -> dict[int, xgr.GrammarMatcher]: processors = {} for i, _format in enumerate(response_formats): - if isinstance(_format, Dict) and _format.get('type', 'text') != 'text': + if isinstance(_format, dict) and _format.get('type', 'text') != 'text': schema_type = _format['type'] if schema_type == 'json_schema': schema = _format['json_schema'] - if isinstance(schema, Dict): + if isinstance(schema, dict): for key in ['json_schema', 'schema']: if key in schema: schema = json.dumps(schema[key], ensure_ascii=False) diff --git a/lmdeploy/pytorch/engine/input_process.py b/lmdeploy/pytorch/engine/input_process.py index 38e4c24e8b..7c54319802 100644 --- a/lmdeploy/pytorch/engine/input_process.py +++ b/lmdeploy/pytorch/engine/input_process.py @@ -1,21 +1,21 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Any, Dict, List, Optional +from typing import Any from lmdeploy.pytorch.multimodal.data_type import MultiModalInputs -TypeModelMetas = Dict[str, Any] +TypeModelMetas = dict[str, Any] -InputMultiModalType = List[Dict[str, Any]] +InputMultiModalType = list[dict[str, Any]] @dataclass class PreprocessInputResult: """Results of preprocess input.""" - input_ids: List[int] - input_multimodals: Optional[MultiModalInputs] = None - model_metas: Optional[TypeModelMetas] = None + input_ids: list[int] + input_multimodals: MultiModalInputs | None = None + model_metas: TypeModelMetas | None = None class BaseModelInputProcessor(ABC): @@ -23,7 +23,7 @@ class BaseModelInputProcessor(ABC): @abstractmethod def preprocess_input(self, - input_ids: List[int], + input_ids: list[int], input_mms: InputMultiModalType = None, **kwargs) -> PreprocessInputResult: """Preprocess input.""" @@ -34,7 +34,7 @@ class DefaultModelInputProcessor(BaseModelInputProcessor): """Default model input processor.""" def preprocess_input(self, - input_ids: List[int], + input_ids: list[int], input_mms: MultiModalInputs = None, **kwargs) -> PreprocessInputResult: """Preprocess input.""" diff --git a/lmdeploy/pytorch/engine/inputs_maker.py b/lmdeploy/pytorch/engine/inputs_maker.py index 506a372250..13cdc799f4 100644 --- a/lmdeploy/pytorch/engine/inputs_maker.py +++ b/lmdeploy/pytorch/engine/inputs_maker.py @@ -2,7 +2,7 @@ import logging from collections import defaultdict from dataclasses import dataclass -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING, Optional import numpy as np import torch @@ -131,7 +131,7 @@ def next_chunk_size(self): start = seq.num_history_ids end = start + llm_chunk_size - out_multimodals: 'MultiModalInputs' = defaultdict(list) + out_multimodals: MultiModalInputs = defaultdict(list) for modal_type, mm in self.multimodal_iter(): assert mm.start >= start, 'multimodal data should be sorted by start' if mm.start >= end: @@ -158,7 +158,7 @@ def is_last_chunk(self): def clear(self): """Clear.""" - self.seq: 'SchedulerSequence' = None + self.seq: SchedulerSequence = None self.multimodals: MultiModalInputs = defaultdict(list) self.next_step: int = 0 self.max_prefill_num: int = self.max_prefill_token_num @@ -219,8 +219,8 @@ def __init__( # running seqs # mark the seqs that have been sent to executor - self.running_seqs: List['SchedulerSequence'] = [] - self.to_evict_seqs: List['SchedulerSequence'] = [] + self.running_seqs: list[SchedulerSequence] = [] + self.to_evict_seqs: list[SchedulerSequence] = [] # long context chunker self.long_context_chunker = LongContextChunker(config.max_prefill_token_num) @@ -453,8 +453,8 @@ def create_model_inputs_delta(self): valid_mask = np.array(valid_mask) indices_cpu = np.arange(0, batch_size)[valid_mask] - valid_seqs: List['SchedulerSequence'] = [self.running_seqs[i] for i in indices_cpu] - invalid_seqs: List['SchedulerSequence'] = [self.running_seqs[i] for i in range(batch_size) if not valid_mask[i]] + valid_seqs: list[SchedulerSequence] = [self.running_seqs[i] for i in indices_cpu] + invalid_seqs: list[SchedulerSequence] = [self.running_seqs[i] for i in range(batch_size) if not valid_mask[i]] if len(valid_seqs) == 0: return None, valid_seqs, invalid_seqs @@ -498,8 +498,8 @@ def create_model_inputs_delta_valid_only(self): valid_mask = np.array(valid_mask, dtype=bool) indices_cpu = np.arange(0, batch_size)[valid_mask] - valid_seqs: List['SchedulerSequence'] = [self.running_seqs[i] for i in indices_cpu] - invalid_seqs: List['SchedulerSequence'] = [self.running_seqs[i] for i in range(batch_size) if not valid_mask[i]] + valid_seqs: list[SchedulerSequence] = [self.running_seqs[i] for i in indices_cpu] + invalid_seqs: list[SchedulerSequence] = [self.running_seqs[i] for i in range(batch_size) if not valid_mask[i]] num_decode_tokens = self.engine_strategy.get_num_decode_tokens() max_q_seqlen = num_decode_tokens @@ -523,7 +523,7 @@ def create_model_inputs_delta_valid_only(self): return output, valid_seqs, invalid_seqs - def update_running_seqs(self, running: 'SeqList', inputs: Optional[ModelInputs]): + def update_running_seqs(self, running: 'SeqList', inputs: ModelInputs | None): """Update running seqs.""" if self.config.role == EngineRole.Prefill: # p node will not update running seqs diff --git a/lmdeploy/pytorch/engine/model_agent/__init__.py b/lmdeploy/pytorch/engine/model_agent/__init__.py index 7cbf3fb33d..083e6a1fe4 100644 --- a/lmdeploy/pytorch/engine/model_agent/__init__.py +++ b/lmdeploy/pytorch/engine/model_agent/__init__.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict from lmdeploy.pytorch.config import BackendConfig, CacheConfig, MiscConfig, ModelConfig, SpecDecodeConfig from lmdeploy.pytorch.devices import DeviceContext, get_device_manager @@ -16,7 +15,7 @@ def build_model_agent( misc_config: MiscConfig, dist_ctx: DistContext = None, device_ctx: DeviceContext = None, - adapters: Dict[str, str] = None, + adapters: dict[str, str] = None, specdecode_config: SpecDecodeConfig = None, ): """Create model agent. @@ -26,7 +25,7 @@ def build_model_agent( cache_config (CacheConfig): config of kv cache backend_config (BackendConfig): config of backend devices trust_remote_code (bool): To use the remote modeling code or not - adapters (Dict): lora adapters + adapters (dict): lora adapters tp (int): the number of devices to be used in tensor parallelism dtype (str): the data type of model weights and activations custom_module_map (str): customized nn module map diff --git a/lmdeploy/pytorch/engine/model_agent/agent.py b/lmdeploy/pytorch/engine/model_agent/agent.py index f9c2919962..eb31ff379c 100644 --- a/lmdeploy/pytorch/engine/model_agent/agent.py +++ b/lmdeploy/pytorch/engine/model_agent/agent.py @@ -5,7 +5,7 @@ from dataclasses import dataclass, field, fields from multiprocessing.reduction import ForkingPickler from os import getenv -from typing import Any, Dict, List, Optional +from typing import Any import numpy as np import pybase64 @@ -75,13 +75,13 @@ def to_tensor(self): class BatchedOutputs: next_token_ids: torch.Tensor stopped: torch.Tensor - stop_pos: Optional[torch.Tensor] = None - logits: Optional[torch.Tensor] = None - model_metas: List[Dict[str, Any]] = None - logprobs: Optional[BatchedLogProbs] = None + stop_pos: torch.Tensor | None = None + logits: torch.Tensor | None = None + model_metas: list[dict[str, Any]] = None + logprobs: BatchedLogProbs | None = None new_token_timestamp: int = 0 - extra_outputs: Optional[ExtraOutputs] = None - all_routed_experts: Optional[torch.Tensor] = None + extra_outputs: ExtraOutputs | None = None + all_routed_experts: torch.Tensor | None = None def to_cpu(self): """To cpu.""" @@ -176,7 +176,7 @@ def model_forward( context=context, ) output = model(**input_dict) - if not isinstance(output, Dict): + if not isinstance(output, dict): output = dict(hidden_states=output) # InternVL-3.5-Flash will change the seqlen, model_metas during forward if getattr(context, 'is_model_meta_updated', False): @@ -220,7 +220,7 @@ async def async_wait(self, timeout: float = 0.001): return self.all_vals -SwapMap = Dict[int, int] +SwapMap = dict[int, int] @dataclass @@ -330,7 +330,7 @@ def __init__( misc_config: MiscConfig, dist_ctx: DistContext, device_ctx: DeviceContext, - adapters: Dict[str, str] = None, + adapters: dict[str, str] = None, specdecode_config: SpecDecodeConfig = None, ): @@ -413,7 +413,7 @@ def __init__( self.step_inputs = StepInputs() # long context - self._prev_chunk_output: Dict = None + self._prev_chunk_output: dict = None @contextmanager def all_context(self): @@ -742,8 +742,8 @@ async def _async_step( self, inputs: ModelInputs, delta: ModelInputsDelta = None, - swap_in_map: Dict = None, - swap_out_map: Dict = None, + swap_in_map: dict = None, + swap_out_map: dict = None, sampling_inputs: SamplingInputs = None, stopping_criteria: StoppingCriteria = None, return_logits: bool = False, @@ -1130,7 +1130,7 @@ async def async_forward(self, inputs: ModelInputs): """Model forward. Args: - inputs (Dict): The input data comes from _make_inputs. + inputs (dict): The input data comes from _make_inputs. swap_in_map (SwapMap): Cache maps to swap in. swap_out_map (SwapMap): Cache maps to swap out. """ @@ -1173,7 +1173,7 @@ def _construct(item): model = self.patched_model.get_model() weights = ForkingPickler.loads(pybase64.b64decode(serialized_data)) if request.load_format == 'flattened_bucket': - metadata: List[FlattenedTensorMetadata] = weights['metadata'] + metadata: list[FlattenedTensorMetadata] = weights['metadata'] if metadata: flattened_tensor: torch.Tensor = _construct(weights['flattened_tensor']) bucket = FlattenedTensorBucket(flattened_tensor=flattened_tensor, metadata=metadata) @@ -1210,7 +1210,7 @@ async def sleep(self, level: int = 1): self.state.to_sleep.clear() @torch.inference_mode() - def wakeup(self, tags: Optional[List[str]] = None): + def wakeup(self, tags: list[str] | None = None): """Wakeup.""" if tags is None: tags = ['weights', 'kv_cache'] diff --git a/lmdeploy/pytorch/engine/mp_engine/base.py b/lmdeploy/pytorch/engine/mp_engine/base.py index 660c65cf92..a5c16dd967 100644 --- a/lmdeploy/pytorch/engine/mp_engine/base.py +++ b/lmdeploy/pytorch/engine/mp_engine/base.py @@ -2,11 +2,14 @@ import asyncio from collections import defaultdict from dataclasses import dataclass, field -from typing import Any, List, Optional +from typing import Any from lmdeploy.messages import ResponseType -from lmdeploy.pytorch.disagg.conn.protocol import (DistServeConnectionRequest, DistServeDropConnectionRequest, - DistServeInitRequest) +from lmdeploy.pytorch.disagg.conn.protocol import ( + DistServeConnectionRequest, + DistServeDropConnectionRequest, + DistServeInitRequest, +) from lmdeploy.utils import get_logger from ..base import EngineBase, EngineInstanceBase @@ -54,7 +57,7 @@ def sleep(self, level: int): """sleep.""" return self._collective_rpc('sleep', level) - def wakeup(self, tags: Optional[List[str]] = None): + def wakeup(self, tags: list[str] | None = None): """Wakeup.""" return self._collective_rpc('wakeup', tags) diff --git a/lmdeploy/pytorch/engine/mp_engine/base_worker.py b/lmdeploy/pytorch/engine/mp_engine/base_worker.py index 58bffa825b..0e0fa0fa82 100644 --- a/lmdeploy/pytorch/engine/mp_engine/base_worker.py +++ b/lmdeploy/pytorch/engine/mp_engine/base_worker.py @@ -1,11 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. import asyncio from contextlib import asynccontextmanager -from typing import TYPE_CHECKING, Any, List, Optional +from typing import TYPE_CHECKING, Any from lmdeploy.messages import EngineOutput -from lmdeploy.pytorch.disagg.conn.protocol import (DistServeConnectionRequest, DistServeDropConnectionRequest, - DistServeInitRequest) +from lmdeploy.pytorch.disagg.conn.protocol import ( + DistServeConnectionRequest, + DistServeDropConnectionRequest, + DistServeInitRequest, +) from lmdeploy.utils import get_logger logger = get_logger('lmdeploy') @@ -101,7 +104,7 @@ def sleep(self, level: int = 1): """sleep.""" return self.engine.sleep(level) - def wakeup(self, tags: Optional[List[str]] = None): + def wakeup(self, tags: list[str] | None = None): """Wakeup.""" return self.engine.wakeup(tags) diff --git a/lmdeploy/pytorch/engine/mp_engine/ray_engine.py b/lmdeploy/pytorch/engine/mp_engine/ray_engine.py index 8d8d19008d..3c14d3fd0c 100644 --- a/lmdeploy/pytorch/engine/mp_engine/ray_engine.py +++ b/lmdeploy/pytorch/engine/mp_engine/ray_engine.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. import asyncio -from typing import Dict import ray from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -84,10 +83,10 @@ async def get_stream_task_result(self, stream_id: int): return result, stopped -def _update_runtime_envs(runtime_env: Dict): +def _update_runtime_envs(runtime_env: dict): """Update runtime envs.""" new_envs = _envs.get_all_envs() - env_vars: Dict = runtime_env.get('env_vars', {}) + env_vars: dict = runtime_env.get('env_vars', {}) env_vars.update(new_envs) runtime_env['env_vars'] = env_vars return runtime_env diff --git a/lmdeploy/pytorch/engine/mp_engine/zmq_rpc.py b/lmdeploy/pytorch/engine/mp_engine/zmq_rpc.py index c958235e84..5d43f5ccf5 100644 --- a/lmdeploy/pytorch/engine/mp_engine/zmq_rpc.py +++ b/lmdeploy/pytorch/engine/mp_engine/zmq_rpc.py @@ -2,7 +2,7 @@ import asyncio import inspect import pickle -from typing import Callable, Dict +from collections.abc import Callable from uuid import uuid4 import zmq @@ -39,7 +39,7 @@ def __init__(self): self.context = zmq.Context() self.socket = self.context.socket(zmq.ROUTER) self.port = self.socket.bind_to_random_port(address) - self.methods: Dict[str, Callable] = {} + self.methods: dict[str, Callable] = {} self.running = False # streaming @@ -74,7 +74,7 @@ def send_multipart(self, client_id: bytes, data: bytes): except zmq.ZMQError as e: logger.error(f'Failed to send message to client[{client_id}]: {e}') - def call_method_default(self, client_id, method: Callable, request: Dict): + def call_method_default(self, client_id, method: Callable, request: dict): request_id = request.get('request_id') args = request.get('args', []) kwargs = request.get('kwargs', {}) @@ -85,7 +85,7 @@ def call_method_default(self, client_id, method: Callable, request: Dict): response = dict(success=False, request_id=request_id, error=str(e)) self.send_multipart(client_id, response) - async def _method_async_task(self, client_id, request_id, method: Callable, args: tuple, kwargs: Dict): + async def _method_async_task(self, client_id, request_id, method: Callable, args: tuple, kwargs: dict): """Call method in a task.""" try: result = await method(*args, **kwargs) @@ -95,7 +95,7 @@ async def _method_async_task(self, client_id, request_id, method: Callable, args self.send_multipart(client_id, response) async def _method_async_streaming_task(self, stream_id: int, request_id: int, client_id: int, method: Callable, - args: tuple, kwargs: Dict): + args: tuple, kwargs: dict): """Call method in a task for streaming.""" def __send_resp(): @@ -141,7 +141,7 @@ async def get_stream_output(self, stream_id: int): raise stream_out['error'] return result, stopped - async def call_method_async(self, client_id, method: Callable, request: Dict): + async def call_method_async(self, client_id, method: Callable, request: dict): """Call method async.""" request_id = request.get('request_id') method_name = request.get('method') @@ -237,7 +237,7 @@ def __init__(self, port: int = 5555): self._listen_task = None self.running = False - def _set_reply_default(self, request_id: int, reply: Dict): + def _set_reply_default(self, request_id: int, reply: dict): """Default reply handler for sync socket.""" logger.debug(f'recv reply request_id: {request_id}') future: asyncio.Future = self.pending.pop(request_id) @@ -249,7 +249,7 @@ def _set_reply_default(self, request_id: int, reply: Dict): except Exception as e: logger.debug(f'Set future failed with exception: {e}') - def _set_reply(self, reply: Dict): + def _set_reply(self, reply: dict): request_id = reply['request_id'] self._set_reply_default(request_id, reply) diff --git a/lmdeploy/pytorch/engine/request.py b/lmdeploy/pytorch/engine/request.py index 1a8dd2dd9c..86e37fa28c 100644 --- a/lmdeploy/pytorch/engine/request.py +++ b/lmdeploy/pytorch/engine/request.py @@ -2,8 +2,9 @@ import asyncio import enum import logging +from collections.abc import Awaitable, Callable, Coroutine from dataclasses import dataclass, field -from typing import Any, Awaitable, Callable, Coroutine, Dict, List +from typing import Any from lmdeploy.messages import RequestMetrics, ResponseType from lmdeploy.utils import get_logger @@ -45,7 +46,7 @@ class Request: resp: Response = None -ReqList = List[Request] +ReqList = list[Request] def _run_until_complete(future: Awaitable): @@ -69,7 +70,7 @@ class RequestSender: """ sender_id: int manager: 'RequestManager' - resp_dict: Dict[int, List[Response]] = field(default_factory=dict) + resp_dict: dict[int, list[Response]] = field(default_factory=dict) @classmethod def new(cls, sender_id: int, manager: 'RequestManager'): @@ -99,7 +100,7 @@ def _req_put(self, reqs: Any): """Async rq_que put.""" self.req_que.put_nowait(reqs) - def _gather_request(self, req_types: List[RequestType], data: List[Any]): + def _gather_request(self, req_types: list[RequestType], data: list[Any]): """Gather requests.""" if self.manager._loop_task is None: self.manager.create_loop_task() @@ -119,7 +120,7 @@ def _gather_request(self, req_types: List[RequestType], data: List[Any]): reqs.append(req) return resps, reqs - def batched_send_async(self, req_types: List[RequestType], data: List[Any]): + def batched_send_async(self, req_types: list[RequestType], data: list[Any]): """Batched send request asynchronize.""" resps, reqs = self._gather_request(req_types, data) self._req_put(reqs) @@ -166,9 +167,9 @@ class RequestManager: """Request manager.""" def __init__(self): - self.senders: Dict[int, RequestSender] = dict() - self.callbacks: Dict[RequestType, Callable] = dict() - self.request_priority: List[RequestType] = [ + self.senders: dict[int, RequestSender] = dict() + self.callbacks: dict[RequestType, Callable] = dict() + self.request_priority: list[RequestType] = [ RequestType.STOP_ENGINE, RequestType.ADD_SESSION, RequestType.STOP_SESSION, RequestType.END_SESSION, RequestType.ADD_MESSAGE ] @@ -293,7 +294,7 @@ def has_requests(self): return False return not self.requests.empty() - async def get_all_requests(self) -> Dict[RequestType, List[Request]]: + async def get_all_requests(self) -> dict[RequestType, list[Request]]: """Get all requests in current queue.""" num_reqs = self.requests.qsize() reqs: ReqList = [] @@ -315,7 +316,7 @@ def __proc_reqs(elem): __proc_reqs(elem) # gather requests - reqs_by_type: Dict[RequestType, List[Request]] = dict((t, []) for t in RequestType) + reqs_by_type: dict[RequestType, list[Request]] = dict((t, []) for t in RequestType) for req in reqs: reqs_by_type[req.type].append(req) return reqs_by_type @@ -324,7 +325,7 @@ def bind_func(self, req_type: RequestType, callback: Callable): """Bind handler for given request type.""" self.callbacks[req_type] = callback - def set_request_priority(self, priority: List[RequestType]): + def set_request_priority(self, priority: list[RequestType]): """Set the priority of request type.""" self.request_priority = priority diff --git a/lmdeploy/pytorch/envs.py b/lmdeploy/pytorch/envs.py index 0e44e19ac1..e38892f845 100644 --- a/lmdeploy/pytorch/envs.py +++ b/lmdeploy/pytorch/envs.py @@ -1,15 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. import contextlib import os -from typing import Union def env_to_bool( env_var: str, default: bool = False, *, - true_values: Union[set, list] = {'true', '1', 'yes', 'on'}, - false_values: Union[set, list] = {'false', '0', 'no', 'off'}, + true_values: set | list = {'true', '1', 'yes', 'on'}, + false_values: set | list = {'false', '0', 'no', 'off'}, ): """Env to bool.""" value = os.getenv(env_var) @@ -80,7 +79,7 @@ def set_envs(): def _patched_get_env( env_var: str, - default: Union[str, None] = None, + default: str | None = None, ): """Patched get_env.""" if env_var in os.environ: diff --git a/lmdeploy/pytorch/kernels/__init__.py b/lmdeploy/pytorch/kernels/__init__.py index 28897648ed..ae4a278777 100644 --- a/lmdeploy/pytorch/kernels/__init__.py +++ b/lmdeploy/pytorch/kernels/__init__.py @@ -1,7 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. -from .w8a8_triton_kernels import (matmul_kernel_dynamic_quant, per_channel_quant, per_token_quant_int8, - rms_norm_dynamic_quant) +from .w8a8_triton_kernels import ( + matmul_kernel_dynamic_quant, + per_channel_quant, + per_token_quant_int8, + rms_norm_dynamic_quant, +) __all__ = [ 'matmul_kernel_dynamic_quant', diff --git a/lmdeploy/pytorch/kernels/cuda/apply_rotary_pos_emb.py b/lmdeploy/pytorch/kernels/cuda/apply_rotary_pos_emb.py index ee0765257f..6ed063ef60 100644 --- a/lmdeploy/pytorch/kernels/cuda/apply_rotary_pos_emb.py +++ b/lmdeploy/pytorch/kernels/cuda/apply_rotary_pos_emb.py @@ -129,7 +129,7 @@ def apply_rotary_pos_emb(q: Tensor, k_embed (Tensor): output k, can be same as k Returns: - Tuple[Tensor, Tensor]: Embedded query and key. + tuple[Tensor, Tensor]: Embedded query and key. """ if cos.device != q.device: cos = cos.to(device=q.device) diff --git a/lmdeploy/pytorch/kernels/cuda/blocked_fp8_fused_moe.py b/lmdeploy/pytorch/kernels/cuda/blocked_fp8_fused_moe.py index ad5804a4ac..b504234ba5 100644 --- a/lmdeploy/pytorch/kernels/cuda/blocked_fp8_fused_moe.py +++ b/lmdeploy/pytorch/kernels/cuda/blocked_fp8_fused_moe.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. # modify from: https://github.com/vllm-project/vllm -from typing import Callable +from collections.abc import Callable import torch import triton diff --git a/lmdeploy/pytorch/kernels/cuda/blocked_gemm_fp8.py b/lmdeploy/pytorch/kernels/cuda/blocked_gemm_fp8.py index 1c826f097e..e6717b5ef4 100644 --- a/lmdeploy/pytorch/kernels/cuda/blocked_gemm_fp8.py +++ b/lmdeploy/pytorch/kernels/cuda/blocked_gemm_fp8.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional import torch import triton @@ -102,7 +101,7 @@ def _quant_fp8_kernel( s_ptr += m_id_stride * stride_sm -def _quant_fp8_launcher(A: Tensor, group_size: int, out: Tensor, scales: Tensor, scale_fmt: Optional[str] = None): +def _quant_fp8_launcher(A: Tensor, group_size: int, out: Tensor, scales: Tensor, scale_fmt: str | None = None): """Quant online.""" assert scale_fmt in (None, 'ue8m0') round_scale = 1 if scale_fmt == 'ue8m0' else 0 @@ -160,7 +159,7 @@ def quant_fp8(A: Tensor, group_size: int, dtype: torch.dtype = torch.float8_e4m3fn, trans_scale: bool = False, - scale_fmt: Optional[str] = None): + scale_fmt: str | None = None): """Quant fp8.""" assert A.dim() == 2 M, K = A.shape @@ -177,7 +176,7 @@ def quant_fp8(A: Tensor, def quant_fp8_tma(A: Tensor, group_size: int, dtype: torch.dtype = torch.float8_e4m3fn, - scale_fmt: Optional[str] = None): + scale_fmt: str | None = None): """Quant fp8 tma.""" from lmdeploy.pytorch.third_party.deep_gemm import ceil_div, get_m_alignment_for_contiguous_layout assert A.dim() == 2 diff --git a/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py b/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py index d03d3ddaf7..2f56a23f45 100644 --- a/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py +++ b/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Literal, Optional +from typing import Literal import torch import triton @@ -399,9 +399,9 @@ def _fill_kv_cache_quant_kernel( def fill_kv_cache(k_states: Tensor, - v_states: Optional[Tensor], + v_states: Tensor | None, k_caches: Tensor, - v_caches: Optional[Tensor], + v_caches: Tensor | None, q_start_loc: Tensor, q_seq_length: Tensor, kv_seq_length: Tensor, @@ -690,32 +690,32 @@ def _fill_kv_cache_blocked_fp8_kernel( def fill_kv_cache_blocked_fp8(k_states: Tensor, - v_states: Optional[Tensor], + v_states: Tensor | None, k_caches: Tensor, - v_caches: Optional[Tensor], + v_caches: Tensor | None, ks_caches: Tensor, - vs_caches: Optional[Tensor], + vs_caches: Tensor | None, cu_seqlen_q: Tensor, kv_seqlens: Tensor, max_q_seqlen: int, block_offsets: Tensor, group_size: int = 128, kv_layout: str = 'bshd', - scale_fmt: Optional[str] = None): + scale_fmt: str | None = None): """Fill key/value state to cache for paged attention with fp8 quantization. Args: k_states (Tensor): Key states of shape (seq_length, num_heads, head_dim). - v_states (Optional[Tensor]): Value states of shape + v_states (Tensor | None): Value states of shape (seq_length, num_heads, head_dim_v). If None, no value states are processed. k_caches (Tensor): 4D k cache, shape depends on ``kv_layout``. - v_caches (Optional[Tensor]): 4D v cache, shape depends on + v_caches (Tensor | None): 4D v cache, shape depends on ``kv_layout``. If None, no value caches are processed. ks_caches (Tensor): 4D k scale cache, shape depends on ``kv_layout``. - vs_caches (Optional[Tensor]): 4D v scale cache, shape depends on + vs_caches (Tensor | None): 4D v scale cache, shape depends on ``kv_layout``. If None, no value scale caches are processed. cu_seqlen_q (Tensor): Cumulative sequence lengths of queries, shape (batch_size + 1, ). diff --git a/lmdeploy/pytorch/kernels/cuda/flashattention.py b/lmdeploy/pytorch/kernels/cuda/flashattention.py index a01d151f9c..765c80aa1b 100644 --- a/lmdeploy/pytorch/kernels/cuda/flashattention.py +++ b/lmdeploy/pytorch/kernels/cuda/flashattention.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import math -from typing import Sequence +from collections.abc import Sequence import torch import triton @@ -89,7 +89,7 @@ def _prefill_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, q1, k1_ptrs, loop_start qk = tl.where( qk_mask, qk, - float(-1e30), + (-1e30), ) m_i_new = tl.maximum(m_i, tl.max(qk, 1)) qk -= m_i_new[:, None] @@ -101,7 +101,7 @@ def _prefill_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, q1, k1_ptrs, loop_start qk = tl.where( qk_mask, qk, - float(-1e30), + (-1e30), ) m_i_new = tl.maximum(m_i, tl.max(qk, 1)) qk -= m_i_new[:, None] diff --git a/lmdeploy/pytorch/kernels/cuda/fused_moe.py b/lmdeploy/pytorch/kernels/cuda/fused_moe.py index fc19fc6a97..d99c398071 100644 --- a/lmdeploy/pytorch/kernels/cuda/fused_moe.py +++ b/lmdeploy/pytorch/kernels/cuda/fused_moe.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. # modify from: https://github.com/vllm-project/vllm -from typing import Callable +from collections.abc import Callable import torch import triton @@ -353,7 +353,7 @@ def get_start_end(exp_cum: torch.Tensor, exp_topk: torch.Tensor, topk: int): exp_start = start_end[0, :] exp_end = start_end[1, :] - out = exp_cum.new_empty((num_tokens * topk)) + out = exp_cum.new_empty(num_tokens * topk) num_warps = 1 diff --git a/lmdeploy/pytorch/kernels/cuda/fused_moe_ep.py b/lmdeploy/pytorch/kernels/cuda/fused_moe_ep.py index 6c0c792906..37cb8dcc13 100644 --- a/lmdeploy/pytorch/kernels/cuda/fused_moe_ep.py +++ b/lmdeploy/pytorch/kernels/cuda/fused_moe_ep.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. # modify from dlblas: https://github.com/DeepLink-org/DLBlas -from typing import List, Optional import torch import triton @@ -221,7 +220,7 @@ def fused_moe_v3( topk_weights, w13_weight: torch.Tensor, w2_weight: torch.Tensor, - num_recv_tokens_per_expert: Optional[List[int]], + num_recv_tokens_per_expert: list[int] | None, ): if num_recv_tokens_per_expert is None: return hidden_states diff --git a/lmdeploy/pytorch/kernels/cuda/gated_delta_rule.py b/lmdeploy/pytorch/kernels/cuda/gated_delta_rule.py index 9dc75bd61b..f3f41877b1 100644 --- a/lmdeploy/pytorch/kernels/cuda/gated_delta_rule.py +++ b/lmdeploy/pytorch/kernels/cuda/gated_delta_rule.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Sequence +from collections.abc import Sequence import tilelang import tilelang.language as T diff --git a/lmdeploy/pytorch/kernels/cuda/pagedattention.py b/lmdeploy/pytorch/kernels/cuda/pagedattention.py index aac072ab86..96a38425af 100644 --- a/lmdeploy/pytorch/kernels/cuda/pagedattention.py +++ b/lmdeploy/pytorch/kernels/cuda/pagedattention.py @@ -1,7 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. # modify from: https://github.com/ModelTC/lightllm import math -from typing import Literal, Sequence +from collections.abc import Sequence +from typing import Literal import torch import triton diff --git a/lmdeploy/pytorch/kernels/cuda/rms_norm.py b/lmdeploy/pytorch/kernels/cuda/rms_norm.py index 81b05138df..5238bab261 100644 --- a/lmdeploy/pytorch/kernels/cuda/rms_norm.py +++ b/lmdeploy/pytorch/kernels/cuda/rms_norm.py @@ -191,8 +191,8 @@ def test_rms_norm(bsz, ctx_len, feat_len, dtype): torch_cost = (t1 - t0) / N_REPEATS * 1000 triton_cost = (t2 - t1) / N_REPEATS * 1000 - print('input {} weight {} dtype {}\n torch {:.3f} triton {:.3f} (ms)\n'.format( - input.shape, weight.shape, dtype, torch_cost, triton_cost)) + print(f'input {input.shape} weight {weight.shape} dtype {dtype}\n' \ + f' torch {torch_cost:.3f} triton {triton_cost:.3f} (ms)\n') test_rms_norm(1, 8128, 5120, torch.float16) test_rms_norm(1, 8128, 5120, torch.float32) diff --git a/lmdeploy/pytorch/kernels/dispatcher.py b/lmdeploy/pytorch/kernels/dispatcher.py index fcf85c913f..77b785c66b 100644 --- a/lmdeploy/pytorch/kernels/dispatcher.py +++ b/lmdeploy/pytorch/kernels/dispatcher.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import importlib import inspect -from typing import Callable +from collections.abc import Callable from lmdeploy.utils import get_logger diff --git a/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py b/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py index 4e5fa4a3e8..79ec75dde5 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py +++ b/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional, Tuple import dlinfer.ops as ext_ops from torch import Tensor @@ -10,9 +9,9 @@ def apply_rotary_pos_emb( key_states: Tensor, cos: Tensor, sin: Tensor, - q_embed: Optional[Tensor], - k_embed: Optional[Tensor], -) -> Tuple[Tensor, Tensor]: + q_embed: Tensor | None, + k_embed: Tensor | None, +) -> tuple[Tensor, Tensor]: query_states_embed, key_states_embed = \ ext_ops.apply_rotary_pos_emb(query_states, key_states, diff --git a/lmdeploy/pytorch/kernels/dlinfer/awq_kernels.py b/lmdeploy/pytorch/kernels/dlinfer/awq_kernels.py index 473e0404c9..c027b64a1e 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/awq_kernels.py +++ b/lmdeploy/pytorch/kernels/dlinfer/awq_kernels.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional import dlinfer.ops as ext_ops from torch import Tensor @@ -9,7 +8,7 @@ def awq_linear(x: Tensor, qweight: Tensor, scales: Tensor, qzeros: Tensor, - bias: Optional[Tensor] = None, + bias: Tensor | None = None, all_reduce: bool = False, group_size: int = 0): return ext_ops.weight_quant_matmul(x.squeeze(0), diff --git a/lmdeploy/pytorch/kernels/dlinfer/fill_kv_cache.py b/lmdeploy/pytorch/kernels/dlinfer/fill_kv_cache.py index ec1cdeac94..9602886e9b 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/fill_kv_cache.py +++ b/lmdeploy/pytorch/kernels/dlinfer/fill_kv_cache.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional, Sequence +from collections.abc import Sequence import dlinfer.ops as ext_ops from torch import Tensor @@ -11,8 +11,8 @@ def fill_kv_cache( key_caches: Tensor, value_caches: Tensor, kv_start_indices: Tensor, - k_scales_zeros: Sequence[Optional[Tensor]], - v_scales_zeros: Sequence[Optional[Tensor]], + k_scales_zeros: Sequence[Tensor | None], + v_scales_zeros: Sequence[Tensor | None], quant_bits: int = 0, ): """Fill key/value state to cache for paged attention.""" diff --git a/lmdeploy/pytorch/kernels/dlinfer/linear.py b/lmdeploy/pytorch/kernels/dlinfer/linear.py index 686c4a8d39..962b221c78 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/linear.py +++ b/lmdeploy/pytorch/kernels/dlinfer/linear.py @@ -1,9 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional import dlinfer.ops as ext_ops from torch import Tensor -def linear(x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, all_reduce: bool = False, group: str = ''): +def linear(x: Tensor, weight: Tensor, bias: Tensor | None = None, all_reduce: bool = False, group: str = ''): return ext_ops.linear(x, weight, bias=bias, all_reduce=all_reduce, group=group) diff --git a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py index cc1a324bf4..bb8e2705eb 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py +++ b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Tuple import dlinfer.ops as ext_ops from dlinfer.utils.type_annotation import MoeMetadata as DlinferMoeMetadata @@ -7,6 +6,6 @@ def moe_gating_topk_softmax(router_logits: Tensor, topk: int, - moe_metadata: DlinferMoeMetadata) -> Tuple[Tensor, Tensor]: + moe_metadata: DlinferMoeMetadata) -> tuple[Tensor, Tensor]: routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk, moe_metadata) return routing_weights, selected_experts diff --git a/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py b/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py index 8996508aff..6916d8a082 100644 --- a/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py +++ b/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional, Sequence +from collections.abc import Sequence import dlinfer.ops as ext_ops from torch import Tensor @@ -23,12 +23,12 @@ def prefill_attention( num_q_heads: int, num_kv_heads: int, head_size_v: int, - attn_mask: Sequence[Optional[Tensor]], - softmax_scale: Optional[float], - is_unpaged_prefill: Optional[bool], - kv_scales: Optional[Tensor], - kv_zeros: Optional[Tensor], - quant_bits: Optional[int], + attn_mask: Sequence[Tensor | None], + softmax_scale: float | None, + is_unpaged_prefill: bool | None, + kv_scales: Tensor | None, + kv_zeros: Tensor | None, + quant_bits: int | None, ) -> Tensor: if is_unpaged_prefill: return ext_ops.prefill_attention( @@ -86,10 +86,10 @@ def paged_token_attention( num_q_heads, num_kv_heads, head_size_v, - softmax_scale: Optional[float], - kv_scales: Optional[Tensor], - kv_zeros: Optional[Tensor], - quant_bits: Optional[int], + softmax_scale: float | None, + kv_scales: Tensor | None, + kv_zeros: Tensor | None, + quant_bits: int | None, ): return ext_ops.paged_decode_attention( q, @@ -129,12 +129,12 @@ def paged_attention_fwd( num_heads: int, num_kv_heads: int, v_head_size: int, - attn_mask: Sequence[Optional[Tensor]] = (), - softmax_scale: Optional[float] = None, - is_unpaged_prefill: Optional[bool] = None, - kv_scales: Optional[Tensor] = None, - kv_zeros: Optional[Tensor] = None, - quant_bits: Optional[int] = 0, + attn_mask: Sequence[Tensor | None] = (), + softmax_scale: float | None = None, + is_unpaged_prefill: bool | None = None, + kv_scales: Tensor | None = None, + kv_zeros: Tensor | None = None, + quant_bits: int | None = 0, ): if not is_decoding: return prefill_attention( diff --git a/lmdeploy/pytorch/messages.py b/lmdeploy/pytorch/messages.py index 588f458bb6..a9661f960c 100644 --- a/lmdeploy/pytorch/messages.py +++ b/lmdeploy/pytorch/messages.py @@ -2,7 +2,7 @@ import enum from collections import defaultdict from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Dict, List +from typing import TYPE_CHECKING, Any import numpy as np import torch @@ -24,8 +24,8 @@ logger = get_logger('lmdeploy') # vlm input type from pipeline -InputEmbeddingType = List[np.ndarray] -InputEmbeddingRangeType = List[List[int]] +InputEmbeddingType = list[np.ndarray] +InputEmbeddingRangeType = list[list[int]] @dataclass @@ -52,12 +52,12 @@ class SamplingParam: repetition_penalty: float = 1.0 ignore_eos: bool = False random_seed: int = None - stop_words: List[int] = field(default_factory=list) - bad_words: List[int] = field(default_factory=list) + stop_words: list[int] = field(default_factory=list) + bad_words: list[int] = field(default_factory=list) max_new_tokens: int = 512 min_new_tokens: int = 0 response_format: None | str = None - logits_processors: None | List[LogitsProcessor] = None + logits_processors: None | list[LogitsProcessor] = None out_logits: bool = False out_last_hidden_states: bool = False num_logprobs: int = -1 @@ -173,7 +173,7 @@ class MessageStatus(enum.Enum): MIGRATION_DONE = enum.auto() -SeqMap = Dict[int, 'SchedulerSequence'] +SeqMap = dict[int, 'SchedulerSequence'] @dataclass @@ -189,7 +189,7 @@ class SequenceManager: def __init__(self, seq_meta: SequenceMeta) -> None: self._seq_map: SeqMap = dict() - self._status_seq_map: Dict[MessageStatus, SeqMap] = defaultdict(dict) + self._status_seq_map: dict[MessageStatus, SeqMap] = defaultdict(dict) self.seq_meta = seq_meta self._seq_count = 0 @@ -267,7 +267,7 @@ def add_sequence(self, sampling_param: SamplingParam = None, adapter_name: str = None, multimodals: MultiModalInputs = None, - input_embeddings: List[InputEmbeddings] = None, + input_embeddings: list[InputEmbeddings] = None, migration_request: None | MigrationRequest = None, resp_cache: bool = False, preserve_cache: bool = False) -> 'SchedulerSequence': @@ -325,12 +325,12 @@ def _round_up(x, n): class HistoryEmbeddings: """History embeddings.""" - def __init__(self, embeddings: List[InputEmbeddings] = None): - self._embeddings: List[InputEmbeddings] = [] + def __init__(self, embeddings: list[InputEmbeddings] = None): + self._embeddings: list[InputEmbeddings] = [] if embeddings is not None: self._embeddings.extend(embeddings) - def append(self, embeddings: List[InputEmbeddings]): + def append(self, embeddings: list[InputEmbeddings]): self._embeddings.extend(embeddings) def clone(self): @@ -607,7 +607,7 @@ class SchedulerSequence: output_start_pos: int = 0 meta: Any = None num_ignored_history: int = 0 - model_meta: Dict[str, Any] = None + model_meta: dict[str, Any] = None # For Disaggregation migration_request: None | MigrationRequest = None @@ -615,7 +615,7 @@ class SchedulerSequence: preserve_cache: bool = False # For logging - engine_events: List[EngineEvent] = field(default_factory=list) + engine_events: list[EngineEvent] = field(default_factory=list) # for router replay all_routed_experts: HistoryRouterExperts = field(default_factory=HistoryRouterExperts) @@ -662,7 +662,7 @@ def token_ids(self) -> np.ndarray: return self.history_cache[start:end] @property - def input_embeddings(self) -> List[InputEmbeddings]: + def input_embeddings(self) -> list[InputEmbeddings]: """Get current embeddings.""" start = self.history_image_num end = start + self._num_images @@ -786,7 +786,7 @@ def record_event( ) -> None: self.engine_events.append(EngineEvent.new_event(event_type, timestamp)) - def _update_embeddings(self, embeddings: List[InputEmbeddings]): + def _update_embeddings(self, embeddings: list[InputEmbeddings]): """Update input embeddings.""" self._num_history_images += self._num_images if embeddings is None: @@ -806,8 +806,8 @@ def _update_multimodals(self, multimodals: MultiModalInputs): def update_token_ids(self, token_ids: Tensor, multimodals: MultiModalInputs = None, - embeddings: List[InputEmbeddings] = None, - model_meta: Dict[str, Any] = None, + embeddings: list[InputEmbeddings] = None, + model_meta: dict[str, Any] = None, mode: UpdateTokenMode = UpdateTokenMode.INPUTS, **kwargs): """Update token ids, old token ids will be added to history.""" diff --git a/lmdeploy/pytorch/model_inputs.py b/lmdeploy/pytorch/model_inputs.py index 538b9c6f3a..2ddd0605c2 100644 --- a/lmdeploy/pytorch/model_inputs.py +++ b/lmdeploy/pytorch/model_inputs.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from dataclasses import dataclass, field, fields -from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional +from typing import TYPE_CHECKING, Any, Literal import numpy as np import torch @@ -20,11 +20,11 @@ @dataclass class DPMeta: - tp_sizes: List[int] = None - moe_tp_sizes: List[int] = None + tp_sizes: list[int] = None + moe_tp_sizes: list[int] = None @staticmethod - def _gather_tp_sizes(tp: int, seqlen: int, num_tokens: List[int], dist_ctx: dist.DistContext, layer_type: str): + def _gather_tp_sizes(tp: int, seqlen: int, num_tokens: list[int], dist_ctx: dist.DistContext, layer_type: str): """Gather tp size.""" attn_tp = dist_ctx.dist_config.attn_tp if tp > 1 and tp != attn_tp: @@ -38,7 +38,7 @@ def _gather_tp_sizes(tp: int, seqlen: int, num_tokens: List[int], dist_ctx: dist return tp_sizes @classmethod - def build(cls, seqlen: int, num_tokens: List[int]): + def build(cls, seqlen: int, num_tokens: list[int]): """Get dp meta.""" dist_ctx = dist.get_dist_manager().current_context() dist_config = dist_ctx.dist_config @@ -63,10 +63,10 @@ def sync_tp_size(self, tp_size: int): class VisionModelInputs: """Vision model inputs.""" history_lengths: torch.LongTensor = None - input_embeddings: List[List[torch.Tensor]] = None - input_embedding_ranges: List[torch.LongTensor] = None + input_embeddings: list[list[torch.Tensor]] = None + input_embedding_ranges: list[torch.LongTensor] = None input_embedding_indexing: torch.BoolTensor = None - input_multimodals: List[MultiModalTensor] = None + input_multimodals: list[MultiModalTensor] = None def to_device(self, device: str, non_blocking: bool = False): """To device.""" @@ -125,7 +125,7 @@ def get_inputs(self, history_lengths: torch.Tensor, seq_lengths: torch.Tensor): class ModelInputsDelta: """Delta of ModelInputs.""" # valid indices - indices: Optional[torch.Tensor] + indices: torch.Tensor | None # new block offsets block_offsets: torch.Tensor # cpu copy of indices @@ -135,7 +135,7 @@ class ModelInputsDelta: sum_kv_seqlen: int is_decoding: bool = True # sliding window - num_ignored_history: Optional[torch.Tensor] = None + num_ignored_history: torch.Tensor | None = None @property def seq_length(self): @@ -184,7 +184,7 @@ class ModelInputs: sum_kv_seqlen: int local_adapter_ids: torch.Tensor = None vision_inputs: VisionModelInputs = None - model_metas: List[Dict[str, Any]] = None + model_metas: list[dict[str, Any]] = None dp_meta: 'DPMeta' = None enable_microbatch: bool = False is_dummy: bool = False @@ -222,7 +222,7 @@ def to_device(self, device: str, non_blocking: bool = False): return ModelInputs(**out_dict) - def build_dp_meta(self, num_tokens: List[int]): + def build_dp_meta(self, num_tokens: list[int]): """Build dp meta.""" self.dp_meta = DPMeta.build(self.input_ids.numel(), num_tokens) @@ -248,28 +248,28 @@ class StepContext: q_seqlens: torch.LongTensor kv_seqlens: torch.IntTensor q_start_loc: torch.LongTensor - kv_caches: List + kv_caches: list is_decoding: bool sum_kv_seqlen: int max_kv_seqlen: int = None local_adapter_ids: torch.LongTensor = None input_embeddings: torch.Tensor = None input_embedding_indexing: torch.Tensor = None - input_multimodals: List[MultiModalTensor] = None + input_multimodals: list[MultiModalTensor] = None vision_inputs: VisionModelInputs = None attn_metadata: Any = None kv_quant_policy: Literal[0, 4, 8] = 0 - model_metas: List[Dict[str, Any]] = None + model_metas: list[dict[str, Any]] = None dp_meta: DPMeta = None enable_microbatch: bool = False # for draft model target_hidden_states: torch.Tensor = None # states for ssm - state_caches: List = None + state_caches: list = None state_offsets: torch.LongTensor = None - _outputs: Dict = field(default_factory=dict) + _outputs: dict = field(default_factory=dict) @classmethod def new( @@ -277,8 +277,8 @@ def new( inputs: ModelInputs, model_config: ModelConfig, cache_config: CacheConfig, - kv_caches: List = None, - state_caches: List = None, + kv_caches: list = None, + state_caches: list = None, kv_quant_policy: Literal[0, 4, 8] = 0, ): """Build step context. @@ -408,8 +408,8 @@ def build_context( inputs: ModelInputs, model_config: ModelConfig, cache_config: CacheConfig, - kv_caches: List = None, - state_caches: List = None, + kv_caches: list = None, + state_caches: list = None, kv_quant_policy: Literal[0, 4, 8] = 0, ): """Build context.""" diff --git a/lmdeploy/pytorch/models/baichuan.py b/lmdeploy/pytorch/models/baichuan.py index 76aa659385..5ef0aedf59 100644 --- a/lmdeploy/pytorch/models/baichuan.py +++ b/lmdeploy/pytorch/models/baichuan.py @@ -1,14 +1,20 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding -from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj, - build_rowwise_linear) +from lmdeploy.pytorch.nn.linear import ( + build_down_linear, + build_gateup_linear, + build_o_proj, + build_qkv_proj, + build_rowwise_linear, +) from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .utils.cudagraph import CudaGraphMixin @@ -67,8 +73,8 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -174,9 +180,9 @@ def __init__(self, config: Any, layer_idx: int, dtype: torch.dtype = None, devic def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): """forward.""" @@ -243,10 +249,10 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """forward.""" @@ -316,7 +322,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -341,8 +347,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -368,7 +374,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/chatglm2.py b/lmdeploy/pytorch/models/chatglm2.py index 56e3169bb7..680a7001e6 100644 --- a/lmdeploy/pytorch/models/chatglm2.py +++ b/lmdeploy/pytorch/models/chatglm2.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -10,10 +11,23 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor -from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding, - build_rotary_params) -from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_down_linear, build_gateup_linear, build_o_proj, - build_qkv_proj, build_rowwise_linear) +from lmdeploy.pytorch.nn import ( + ApplyRotaryEmb, + Attention, + RMSNorm, + RopeType, + SiluAndMul, + build_rotary_embedding, + build_rotary_params, +) +from lmdeploy.pytorch.nn.linear import ( + build_colwise_linear, + build_down_linear, + build_gateup_linear, + build_o_proj, + build_qkv_proj, + build_rowwise_linear, +) from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .utils.cudagraph import CudaGraphMixin @@ -87,8 +101,8 @@ def _fill_rope(states: torch.Tensor, rope: torch.Tensor): def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -211,9 +225,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): @@ -264,8 +278,8 @@ def _get_layer(self, layer_number: int): def forward( self, hidden_states: torch.LongTensor, - rotary_pos_emb: List[torch.Tensor], - past_key_values: Optional[List[torch.FloatTensor]], + rotary_pos_emb: list[torch.Tensor], + past_key_values: list[torch.FloatTensor] | None, attn_metadata: Any, ): """forward.""" @@ -573,12 +587,12 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, images: torch.Tensor = None, image_mask: torch.Tensor = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """forward.""" @@ -633,7 +647,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, images: torch.Tensor = None, image_mask: torch.Tensor = None, @@ -662,8 +676,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -714,8 +728,8 @@ def _get_model_metas(self, context: StepContext): return [dict(num_img_tokens=0) if meta is None else meta for meta in model_metas] def update_model_metas(self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None): """Update model meta.""" model_metas = self._get_model_metas(context) @@ -790,7 +804,7 @@ def update_model_metas(self, return new_model_metas - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm @@ -850,8 +864,8 @@ def __init__(self, config: PretrainedConfig, dtype) -> None: self.vision_token_num = self.num_patches // 4 def preprocess_input(self, - input_ids: List[int], - input_multimodals: List[Dict[str, Any]] = None, + input_ids: list[int], + input_multimodals: list[dict[str, Any]] = None, **kwargs) -> PreprocessInputResult: """Prepare multimodal input.""" if input_multimodals is None or len(input_multimodals) == 0: diff --git a/lmdeploy/pytorch/models/cogvlm.py b/lmdeploy/pytorch/models/cogvlm.py index ad8adc9739..f697d791f9 100644 --- a/lmdeploy/pytorch/models/cogvlm.py +++ b/lmdeploy/pytorch/models/cogvlm.py @@ -1,7 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. from argparse import Namespace -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch import torch.nn.functional as F @@ -14,8 +15,12 @@ from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding -from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_merged_colwise_linear, build_qkv_proj, - build_rowwise_linear) +from lmdeploy.pytorch.nn.linear import ( + build_colwise_linear, + build_merged_colwise_linear, + build_qkv_proj, + build_rowwise_linear, +) from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .utils.cudagraph import CudaGraphMixin @@ -92,8 +97,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, lang_ids: torch.LongTensor = None, vision_ids: torch.LongTensor = None, @@ -262,9 +267,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, lang_ids: torch.LongTensor = None, vision_ids: torch.LongTensor = None, @@ -574,11 +579,11 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, images: torch.Tensor = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, lang_ids: torch.LongTensor = None, vision_ids: torch.LongTensor = None, ): @@ -661,7 +666,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, images: torch.Tensor = None, inputs_embeds: torch.Tensor = None, @@ -692,8 +697,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -746,7 +751,7 @@ def prepare_inputs_for_generation( vision_ids=vis_ids, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ @@ -798,8 +803,8 @@ def _get_model_metas(self, context: StepContext): return [dict(num_img_tokens=0) if meta is None else meta for meta in model_metas] def update_model_metas(self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None): """Update model meta.""" model_metas = self._get_model_metas(context) @@ -887,7 +892,7 @@ def __init__(self, config: PretrainedConfig, dtype) -> None: # cogvlm2 self.vision_token_num = 2 + (image_size // patch_size // 2)**2 - def preprocess_input(self, input_ids: List[int], input_multimodals=None, **kwargs) -> PreprocessInputResult: + def preprocess_input(self, input_ids: list[int], input_multimodals=None, **kwargs) -> PreprocessInputResult: """Prepare multimodal input.""" if input_multimodals is None or len(input_multimodals) == 0: return input_ids, input_multimodals diff --git a/lmdeploy/pytorch/models/deepseek.py b/lmdeploy/pytorch/models/deepseek.py index 44239b9c98..78b69acadc 100644 --- a/lmdeploy/pytorch/models/deepseek.py +++ b/lmdeploy/pytorch/models/deepseek.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -62,8 +63,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -260,9 +261,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): @@ -317,10 +318,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """Rewrite of LlamaModel.forward.""" @@ -393,7 +394,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -418,8 +419,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -445,8 +446,8 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter], - expert_params_mapping: List): + def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter], + expert_params_mapping: list): """Load weight experts.""" for (param_name, weight_name, expert_id, shard_id) in expert_params_mapping: if weight_name not in name: @@ -459,7 +460,7 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di param = params_dict[name] load_weight(param, loaded_weight) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/deepseek_mtp.py b/lmdeploy/pytorch/models/deepseek_mtp.py index 1c315f4778..32190d8788 100644 --- a/lmdeploy/pytorch/models/deepseek_mtp.py +++ b/lmdeploy/pytorch/models/deepseek_mtp.py @@ -1,16 +1,29 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn from transformers.configuration_utils import PretrainedConfig from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding, - build_rotary_params) -from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_down_linear, build_gateup_linear, build_o_proj, - build_rowwise_linear) +from lmdeploy.pytorch.nn import ( + ApplyRotaryEmb, + Attention, + RMSNorm, + RopeType, + SiluAndMul, + build_rotary_embedding, + build_rotary_params, +) +from lmdeploy.pytorch.nn.linear import ( + build_colwise_linear, + build_down_linear, + build_gateup_linear, + build_o_proj, + build_rowwise_linear, +) from lmdeploy.pytorch.nn.moe import build_fused_moe from lmdeploy.pytorch.nn.rotary_embedding import get_rope_parameters, get_rope_theta from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight @@ -161,8 +174,8 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -413,8 +426,8 @@ def forward( input_ids: torch.Tensor, position_ids: torch.Tensor, previous_hidden_states: torch.Tensor, - past_key_value: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_value: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, attn_metadata: Any = None, spec_step_index: int = 0, ) -> torch.Tensor: @@ -477,8 +490,8 @@ def forward( input_ids: torch.Tensor, position_ids: torch.Tensor, previous_hidden_states: torch.Tensor, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, attn_metadata: Any = None, spec_step_idx: int = 0, ) -> torch.Tensor: @@ -541,9 +554,9 @@ def forward( input_ids: torch.Tensor, position_ids: torch.Tensor, target_hidden_states: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, - inputs_embeds: Optional[torch.Tensor] = None, + inputs_embeds: torch.Tensor | None = None, spec_step_idx: int = 0, ) -> torch.Tensor: hidden_states = self.model(input_ids, @@ -582,8 +595,8 @@ def fill_buffers_cudagraph(self, graph_meta: CudaGraphMeta, input_ids: torch.Ten def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -600,8 +613,8 @@ def prepare_inputs_for_generation( target_hidden_states=target_hidden_states, ) - def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter], - expert_params_mapping: List): + def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter], + expert_params_mapping: list): """Load weight experts.""" for (param_name, weight_name, expert_id, shard_id) in expert_params_mapping: if weight_name not in name: @@ -614,8 +627,8 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di param = params_dict[name] load_weight(param, loaded_weight) - def _load_weight_attention(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter], - update_pe_mapping: List): + def _load_weight_attention(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter], + update_pe_mapping: list): """Load weight attention.""" device = next(iter(params_dict.values())).device @@ -707,7 +720,7 @@ def __load_kcvc_blocked_fp8(name: str, loaded_weight: torch.Tensor): param = params_dict[name] load_weight(param, loaded_weight) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" def __skip_nextn(name, nextn_keys): diff --git a/lmdeploy/pytorch/models/deepseek_v2.py b/lmdeploy/pytorch/models/deepseek_v2.py index bab833f2f7..087d744bf9 100644 --- a/lmdeploy/pytorch/models/deepseek_v2.py +++ b/lmdeploy/pytorch/models/deepseek_v2.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. import math +from collections.abc import Iterable from copy import deepcopy from enum import Enum, auto from os import getenv -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Any import torch import torch.nn.functional as F @@ -13,11 +14,24 @@ import lmdeploy.pytorch.distributed as dist from lmdeploy.pytorch.distributed import get_dist_manager, get_ep_world_rank, get_tp_world_rank from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager, get_step_ctx_manager -from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, ParallelEmbedding, RMSNorm, RopeType, SiluAndMul, - build_rotary_embedding, build_rotary_params) +from lmdeploy.pytorch.nn import ( + ApplyRotaryEmb, + Attention, + ParallelEmbedding, + RMSNorm, + RopeType, + SiluAndMul, + build_rotary_embedding, + build_rotary_params, +) from lmdeploy.pytorch.nn.eplb import EPLBDispatchInfo, EPLBManager -from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_down_linear, build_gateup_linear, build_o_proj, - build_rowwise_linear) +from lmdeploy.pytorch.nn.linear import ( + build_colwise_linear, + build_down_linear, + build_gateup_linear, + build_o_proj, + build_rowwise_linear, +) from lmdeploy.pytorch.nn.moe import MoeType, SoftmaxTopK, build_fused_moe from lmdeploy.pytorch.nn.rotary_embedding import get_rope_parameters, get_rope_theta from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight @@ -511,8 +525,8 @@ def _qkv_proj(self, hidden_states: torch.Tensor, num_heads: int): def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -836,11 +850,11 @@ def __init__(self, config: Any, layer_idx: int, dtype: torch.dtype = None, devic def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: if residual is None: residual = hidden_states @@ -866,9 +880,9 @@ def forward( def forward_yield( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, tag: Any = None, ): @@ -987,10 +1001,10 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """forward.""" if inputs_embeds is None: @@ -1018,10 +1032,10 @@ def forward( def forward_microbatch( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """forward_microbatch.""" assert self.config.moe_layer_freq == 1 @@ -1071,9 +1085,9 @@ def forward_microbatch( def forward_yieldlayers(self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_values: Optional[List[torch.FloatTensor]] = None, - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_values: list[torch.FloatTensor] | None = None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, start_idx: int = -1, end_idx: int = -1, @@ -1120,7 +1134,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -1153,8 +1167,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -1170,8 +1184,8 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter], - expert_params_mapping: List): + def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter], + expert_params_mapping: list): """Load weight experts.""" for (param_name, weight_name, expert_id, shard_id) in expert_params_mapping: if weight_name not in name: @@ -1184,8 +1198,8 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di param = params_dict[name] load_weight(param, loaded_weight) - def _load_weight_attention(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter], - update_pe_mapping: List): + def _load_weight_attention(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter], + update_pe_mapping: list): """Load weight attention.""" device = next(iter(params_dict.values())).device @@ -1277,7 +1291,7 @@ def __load_kcvc_blocked_fp8(name: str, loaded_weight: torch.Tensor): param = params_dict[name] load_weight(param, loaded_weight) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" def __skip_nextn(name, nextn_keys): diff --git a/lmdeploy/pytorch/models/deepseek_v32.py b/lmdeploy/pytorch/models/deepseek_v32.py index 19ee10d420..6954e47fca 100644 --- a/lmdeploy/pytorch/models/deepseek_v32.py +++ b/lmdeploy/pytorch/models/deepseek_v32.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Sequence, Tuple +from collections.abc import Sequence +from typing import Any import torch import torch.nn.functional as F @@ -7,15 +8,29 @@ from lmdeploy.pytorch.distributed import get_dist_manager, get_ep_world_rank from lmdeploy.pytorch.model_inputs import StepContextManager -from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, build_rotary_embedding, - build_rotary_params) +from lmdeploy.pytorch.nn import ( + ApplyRotaryEmb, + Attention, + RMSNorm, + RopeType, + build_rotary_embedding, + build_rotary_params, +) from lmdeploy.pytorch.nn.eplb import EPLBManager from lmdeploy.pytorch.nn.linear import build_colwise_linear, build_o_proj, build_rowwise_linear from lmdeploy.pytorch.nn.nsa import IndexerTopKFP8 from lmdeploy.pytorch.nn.rotary_embedding import get_rope_parameters, get_rope_theta -from .deepseek_v2 import (DeepseekV2Attention, DeepseekV2BMM, DeepseekV2DecoderLayer, DeepseekV2ForCausalLM, - DeepseekV2MLP, DeepseekV2Model, DeepseekV2MoE, yarn_get_mscale) +from .deepseek_v2 import ( + DeepseekV2Attention, + DeepseekV2BMM, + DeepseekV2DecoderLayer, + DeepseekV2ForCausalLM, + DeepseekV2MLP, + DeepseekV2Model, + DeepseekV2MoE, + yarn_get_mscale, +) def rotate_activation(x: torch.Tensor) -> torch.Tensor: @@ -88,7 +103,7 @@ def forward(self, x: torch.Tensor, qr: torch.Tensor, freqs_cis: torch.Tensor, - index_cache: Tuple[torch.Tensor, torch.Tensor], + index_cache: tuple[torch.Tensor, torch.Tensor], attn_metadata: Any = None): q = self.wq_b(qr) q = q.unflatten(-1, (-1, self.head_dim)) @@ -270,7 +285,7 @@ def _qkv_proj(self, hidden_states: torch.Tensor, num_heads: int): def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], past_key_value: Sequence[torch.Tensor] = None, attn_metadata: Any = None, ): diff --git a/lmdeploy/pytorch/models/deepseek_vl2.py b/lmdeploy/pytorch/models/deepseek_vl2.py index 290b9a4fc0..69cb35fd7a 100644 --- a/lmdeploy/pytorch/models/deepseek_vl2.py +++ b/lmdeploy/pytorch/models/deepseek_vl2.py @@ -1,7 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/main/deepseek_vl2/models/modeling_deepseek_vl_v2.py -from typing import Any, Dict, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch import torch.nn as nn @@ -172,9 +173,9 @@ def _init_vision_module( def prepare_inputs_embeds(self, input_ids: torch.LongTensor, - images: Optional[torch.FloatTensor] = None, - images_seq_mask: Optional[torch.LongTensor] = None, - images_spatial_crop: Optional[torch.LongTensor] = None, + images: torch.FloatTensor | None = None, + images_seq_mask: torch.LongTensor | None = None, + images_spatial_crop: torch.LongTensor | None = None, **ignore_kwargs): """ @@ -306,7 +307,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, pixel_values: torch.Tensor = None, image_mask: torch.Tensor = None, @@ -340,7 +341,7 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], inputs_embeds: torch.Tensor = None, context: StepContext = None, ): @@ -382,7 +383,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" lang_prefix = 'language.' @@ -423,8 +424,8 @@ def __init__(self, config: PretrainedConfig, dtype) -> None: self.patch_size = vision_config.patch_size def preprocess_input(self, - input_ids: List[int], - input_multimodals: List[Dict[str, Any]] = None, + input_ids: list[int], + input_multimodals: list[dict[str, Any]] = None, **kwargs) -> PreprocessInputResult: """Prepare multimodal input.""" if input_multimodals is None or len(input_multimodals) == 0: diff --git a/lmdeploy/pytorch/models/gemma.py b/lmdeploy/pytorch/models/gemma.py index 4010dab882..ebf1804dee 100644 --- a/lmdeploy/pytorch/models/gemma.py +++ b/lmdeploy/pytorch/models/gemma.py @@ -1,7 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. import math -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch import torch.nn.functional as F @@ -9,10 +10,22 @@ from transformers.configuration_utils import PretrainedConfig from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager -from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, GeluAndMul, RMSNorm, RopeType, build_rotary_embedding, - build_rotary_embedding_from_config) -from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj, - build_rowwise_linear) +from lmdeploy.pytorch.nn import ( + ApplyRotaryEmb, + Attention, + GeluAndMul, + RMSNorm, + RopeType, + build_rotary_embedding, + build_rotary_embedding_from_config, +) +from lmdeploy.pytorch.nn.linear import ( + build_down_linear, + build_gateup_linear, + build_o_proj, + build_qkv_proj, + build_rowwise_linear, +) from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .utils.cudagraph import CudaGraphMixin @@ -96,9 +109,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - rotary_pos_emb_local: Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + rotary_pos_emb_local: tuple[torch.FloatTensor, torch.FloatTensor] | None = None, + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, global_attn_masks: torch.Tensor = None, local_attn_masks: torch.Tensor = None, @@ -290,10 +303,10 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - rotary_pos_emb_local: Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] = None, - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + rotary_pos_emb_local: tuple[torch.FloatTensor, torch.FloatTensor] | None = None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, global_attn_masks: torch.Tensor = None, local_attn_masks: torch.Tensor = None, @@ -340,7 +353,7 @@ def __init__(self, embedding_dim: int, padding_idx: int, dtype=torch.dtype, - embed_scale: Optional[float] = 1.0): + embed_scale: float | None = 1.0): super().__init__(num_embeddings, embedding_dim, padding_idx, dtype=dtype) self.embed_scale = embed_scale @@ -428,10 +441,10 @@ def build_rope_emb(self, config: PretrainedConfig): def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, global_attn_masks: torch.Tensor = None, local_attn_masks: torch.Tensor = None, ): @@ -517,7 +530,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, global_attn_masks: torch.Tensor = None, @@ -551,8 +564,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -582,7 +595,7 @@ def update_weights(self): """Update weights.""" self.lm_head.weight = self.model.embed_tokens.weight - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/gemma3_vl.py b/lmdeploy/pytorch/models/gemma3_vl.py index 8f4ea8e972..065628cdb6 100644 --- a/lmdeploy/pytorch/models/gemma3_vl.py +++ b/lmdeploy/pytorch/models/gemma3_vl.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, Iterable, List, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -92,8 +93,8 @@ def __init__(self, config: PretrainedConfig, dtype) -> None: self.vision_token_num = self.num_patches // 4 def preprocess_input(self, - input_ids: List[int], - input_multimodals: List[Dict[str, Any]] = None, + input_ids: list[int], + input_multimodals: list[dict[str, Any]] = None, **kwargs) -> PreprocessInputResult: """Prepare multimodal input.""" if input_multimodals is None or len(input_multimodals) == 0: @@ -163,7 +164,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, pixel_values: torch.FloatTensor = None, image_mask: torch.Tensor = None, @@ -302,7 +303,7 @@ def prepare_inputs_for_generation( def tie_weights(self): return self.language_model.tie_weights() - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" stacked_params_mapping = [ # (param_name, shard_name, shard_id) diff --git a/lmdeploy/pytorch/models/glm4.py b/lmdeploy/pytorch/models/glm4.py index fe1193e76b..91644822c8 100644 --- a/lmdeploy/pytorch/models/glm4.py +++ b/lmdeploy/pytorch/models/glm4.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -7,8 +8,13 @@ from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config -from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj, - build_rowwise_linear) +from lmdeploy.pytorch.nn.linear import ( + build_down_linear, + build_gateup_linear, + build_o_proj, + build_qkv_proj, + build_rowwise_linear, +) from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .utils.cudagraph import CudaGraphMixin @@ -71,8 +77,8 @@ def _fill_rope(states: torch.Tensor, rope: torch.Tensor): def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -199,9 +205,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): if residual is None: @@ -261,10 +267,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """Rewrite of LlamaModel.forward.""" @@ -321,7 +327,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -351,8 +357,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -378,7 +384,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/glm4_1v.py b/lmdeploy/pytorch/models/glm4_1v.py index 9b89164bef..6a7ea8795d 100644 --- a/lmdeploy/pytorch/models/glm4_1v.py +++ b/lmdeploy/pytorch/models/glm4_1v.py @@ -2,7 +2,8 @@ # adapted from: # https://github.com/huggingface/transformers/blob/main/src/transformers/models/glm4v/modeling_glm4v.py -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple +from collections.abc import Callable, Iterable +from typing import Any import torch import torch.nn.functional as F @@ -21,7 +22,7 @@ from .utils.model import DeployModelMixin, vlm_model -def _apply_mrope_selection(hidden_states: torch.Tensor, mrope_position_ids: torch.Tensor, mrope_section: List[int], +def _apply_mrope_selection(hidden_states: torch.Tensor, mrope_position_ids: torch.Tensor, mrope_section: list[int], position_ids: torch.Tensor, rotary_emb_func: Callable): _mrope_position_ids = torch.zeros(3, position_ids.shape[-1], dtype=position_ids.dtype, device=position_ids.device) _mrope_position_ids[:, :mrope_position_ids.shape[-1]] = mrope_position_ids @@ -71,10 +72,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, mrope_position_ids: torch.LongTensor = None, ): """Rewrite of LlamaModel.forward.""" @@ -361,7 +362,7 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: is_tp=True) def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor]) -> torch.Tensor: + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor]) -> torch.Tensor: seq_length = hidden_states.shape[0] # qkv proj qkv_states = self.qkv(hidden_states) @@ -400,7 +401,7 @@ def forward(self, hidden_states, cu_seqlens, rotary_pos_emb, - residual: Optional[torch.Tensor] = None) -> torch.Tensor: + residual: torch.Tensor | None = None) -> torch.Tensor: if residual is None: residual = hidden_states hidden_states = self.norm1(hidden_states) @@ -478,7 +479,7 @@ def rot_pos_emb(self, grid_thw): return rotary_pos_emb, pos_ids def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor, - grid_thw: torch.Tensor, image_type_ids: List[torch.Tensor]) -> torch.Tensor: + grid_thw: torch.Tensor, image_type_ids: list[torch.Tensor]) -> torch.Tensor: """forward.""" hidden_states = self.patch_embed(hidden_states) hidden_states = self.post_conv_layernorm(hidden_states) @@ -551,14 +552,14 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, mrope_position_ids: torch.Tensor = None, pixel_values: torch.Tensor = None, vis_cu_seqlens: torch.Tensor = None, vis_pos_emb: torch.Tensor = None, - image_type_ids: List[torch.Tensor] = None, + image_type_ids: list[torch.Tensor] = None, grid_thw: torch.Tensor = None, image_mask: torch.Tensor = None, **kwargs, @@ -602,8 +603,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -672,7 +673,7 @@ def rename_weight(cls, name: str) -> str: return name[len('model.'):] return name - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ @@ -826,8 +827,8 @@ def _update_model_meta_prefilling(self, context: StepContext): return new_model_metas def update_model_metas(self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None): """Update model meta.""" if context.is_decoding: @@ -847,8 +848,8 @@ def __init__(self, config: PretrainedConfig) -> None: self.config = config def preprocess_input(self, - input_ids: List[int], - input_multimodals: List[Dict[str, Any]] = None, + input_ids: list[int], + input_multimodals: list[dict[str, Any]] = None, **kwargs) -> PreprocessInputResult: """Prepare multimodal input.""" if input_multimodals is None or len(input_multimodals) == 0: diff --git a/lmdeploy/pytorch/models/glm4_moe.py b/lmdeploy/pytorch/models/glm4_moe.py index 1c6b92ad05..671a4e6e9a 100644 --- a/lmdeploy/pytorch/models/glm4_moe.py +++ b/lmdeploy/pytorch/models/glm4_moe.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -78,8 +79,8 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -298,9 +299,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): if residual is None: @@ -358,10 +359,10 @@ def _build_rotary_embedding(self, config: PretrainedConfig): def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """Rewrite of LlamaModel.forward.""" @@ -438,7 +439,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -463,8 +464,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -490,8 +491,8 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter], - expert_params_mapping: List): + def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter], + expert_params_mapping: list): """Load weight experts.""" # load fused weights if any([k in name for k in ['fused_w1w3', 'fused_w2']]): @@ -508,7 +509,7 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di param = params_dict[name] load_weight(param, loaded_weight) - def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter]): + def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter]): """Load weight of fused expert weights.""" num_experts = self.config.num_experts fused_gateup_name = 'fused_w1w3' @@ -533,7 +534,7 @@ def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, par w2 = loaded_weight.narrow(dim=0, start=chunk_size * expert_id, length=chunk_size) load_weight(param, w2, expert_id=expert_id, shard_id='down') - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/glm4moe_mtp.py b/lmdeploy/pytorch/models/glm4moe_mtp.py index 743920e11e..4e5ec8c818 100644 --- a/lmdeploy/pytorch/models/glm4moe_mtp.py +++ b/lmdeploy/pytorch/models/glm4moe_mtp.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Iterable +from collections.abc import Iterable import torch from torch import nn diff --git a/lmdeploy/pytorch/models/gpt_oss.py b/lmdeploy/pytorch/models/gpt_oss.py index 0caf956675..51f83af244 100644 --- a/lmdeploy/pytorch/models/gpt_oss.py +++ b/lmdeploy/pytorch/models/gpt_oss.py @@ -1,7 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. import functools -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple +from collections.abc import Callable, Iterable +from typing import Any import torch import torch.nn.functional as F @@ -104,8 +105,8 @@ def weight_loader_sinks(cls, param: nn.Parameter, loaded_weight: torch.Tensor): def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of forward.""" @@ -160,7 +161,7 @@ def _impl(self, gateup: torch.Tensor) -> torch.Tensor: return (up + 1) * glu @staticmethod - @functools.lru_cache(maxsize=None) + @functools.cache def build(limit: float, alpha: float): return GateupAct(limit, alpha) @@ -306,9 +307,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, all_routed_experts: torch.Tensor = None, ): @@ -363,10 +364,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, all_routed_experts: torch.Tensor = None, ): """Rewrite of forward.""" @@ -437,7 +438,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -472,8 +473,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -499,7 +500,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def _load_weight_experts_gate_up(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, + def _load_weight_experts_gate_up(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter]): """Load weight of experts gate up.""" num_experts = self.config.num_local_experts @@ -517,7 +518,7 @@ def _load_weight_experts_gate_up(self, name: str, loaded_weight: torch.Tensor, p load_weight(param, w1, expert_id=expert_id, shard_id='gate') load_weight(param, w3, expert_id=expert_id, shard_id='up') - def _load_weight_experts_down(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter]): + def _load_weight_experts_down(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter]): """Load weight of experts down.""" num_experts = self.config.num_local_experts @@ -532,7 +533,7 @@ def _load_weight_experts_down(self, name: str, loaded_weight: torch.Tensor, para w2 = loaded_weight[expert_id] load_weight(param, w2, expert_id=expert_id, shard_id='down') - def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter]): + def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter]): """Load weight of fused expert weights.""" if 'gate_up' in name: self._load_weight_experts_gate_up(name, loaded_weight, params_dict) @@ -540,7 +541,7 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di elif 'down' in name: self._load_weight_experts_down(name, loaded_weight, params_dict) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/internlm.py b/lmdeploy/pytorch/models/internlm.py index dcd0e40a76..ad53d6970b 100644 --- a/lmdeploy/pytorch/models/internlm.py +++ b/lmdeploy/pytorch/models/internlm.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -8,8 +9,13 @@ from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding -from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj, - build_rowwise_linear) +from lmdeploy.pytorch.nn.linear import ( + build_down_linear, + build_gateup_linear, + build_o_proj, + build_qkv_proj, + build_rowwise_linear, +) from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .utils.cudagraph import CudaGraphMixin @@ -60,8 +66,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -171,9 +177,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): @@ -248,10 +254,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """Rewrite of LlamaModel.forward.""" @@ -324,7 +330,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -349,8 +355,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -376,7 +382,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/internlm2.py b/lmdeploy/pytorch/models/internlm2.py index 661ce2ddbc..acf54e40ef 100644 --- a/lmdeploy/pytorch/models/internlm2.py +++ b/lmdeploy/pytorch/models/internlm2.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -61,8 +62,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of InternLM2Attention.forward.""" @@ -172,9 +173,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): @@ -230,10 +231,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """Rewrite of forward.""" @@ -298,7 +299,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -319,8 +320,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -346,7 +347,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_lora_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], adapter_id: int): + def load_lora_weights(self, weights: Iterable[tuple[str, torch.Tensor]], adapter_id: int): """Load lora weights.""" from lmdeploy.pytorch.adapter.adapter import load_lora_weights @@ -370,7 +371,7 @@ def _rearange_wqkv(weights): weights_iter = _rearange_wqkv(weights) load_lora_weights(self, weights_iter, adapter_id) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/internlm2_reward.py b/lmdeploy/pytorch/models/internlm2_reward.py index 886c884d6a..a393b57135 100644 --- a/lmdeploy/pytorch/models/internlm2_reward.py +++ b/lmdeploy/pytorch/models/internlm2_reward.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -41,7 +42,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -66,8 +67,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -89,7 +90,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_lora_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], adapter_id: int): + def load_lora_weights(self, weights: Iterable[tuple[str, torch.Tensor]], adapter_id: int): """Load lora weights.""" from lmdeploy.pytorch.adapter.adapter import load_lora_weights @@ -113,7 +114,7 @@ def _rearange_wqkv(weights): weights_iter = _rearange_wqkv(weights) load_lora_weights(self, weights_iter, adapter_id) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/internlm2_ve.py b/lmdeploy/pytorch/models/internlm2_ve.py index 3e5ace9016..1d084eb4ab 100644 --- a/lmdeploy/pytorch/models/internlm2_ve.py +++ b/lmdeploy/pytorch/models/internlm2_ve.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -54,12 +55,12 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, - vision_embedding_indexing: Optional[torch.Tensor] = None, - text_embedding_indexing: Optional[torch.Tensor] = None, + vision_embedding_indexing: torch.Tensor | None = None, + text_embedding_indexing: torch.Tensor | None = None, ): if residual is None: @@ -141,12 +142,12 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - vision_embedding_indexing: Optional[torch.Tensor] = None, - text_embedding_indexing: Optional[torch.Tensor] = None, + inputs_embeds: torch.FloatTensor | None = None, + vision_embedding_indexing: torch.Tensor | None = None, + text_embedding_indexing: torch.Tensor | None = None, ): """Rewrite of forward.""" @@ -216,11 +217,11 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, - vision_embedding_indexing: Optional[torch.Tensor] = None, - text_embedding_indexing: Optional[torch.Tensor] = None, + vision_embedding_indexing: torch.Tensor | None = None, + text_embedding_indexing: torch.Tensor | None = None, **kwargs, ): """Model forward, return logits.""" @@ -259,8 +260,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -286,7 +287,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/internlm3.py b/lmdeploy/pytorch/models/internlm3.py index 3005232c06..0f1af8f769 100644 --- a/lmdeploy/pytorch/models/internlm3.py +++ b/lmdeploy/pytorch/models/internlm3.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -62,8 +63,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of InternLM3Attention.forward.""" @@ -174,9 +175,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): @@ -233,10 +234,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """Rewrite of InternLM3Model.forward.""" @@ -305,7 +306,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -326,8 +327,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -353,7 +354,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/interns1_pro.py b/lmdeploy/pytorch/models/interns1_pro.py index 51ed9deaf6..a986a49887 100644 --- a/lmdeploy/pytorch/models/interns1_pro.py +++ b/lmdeploy/pytorch/models/interns1_pro.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -80,7 +81,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, pixel_values: torch.Tensor = None, @@ -150,8 +151,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -239,8 +240,8 @@ def rename_weight(cls, name: str) -> str: return name[len('model.'):] return name - def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter], - expert_params_mapping: List): + def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter], + expert_params_mapping: list): """Load weight experts.""" for (param_name, weight_name, expert_id, shard_id) in expert_params_mapping: @@ -255,8 +256,8 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di load_weight(param, loaded_weight) # modify from vllm qwen3vlmoe fused expert loading - def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter], - fused_expert_params_mapping: List): + def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter], + fused_expert_params_mapping: list): """Load weight of fused expert weights.""" num_experts = self.config.text_config.num_experts @@ -279,7 +280,7 @@ def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, par for expert_id in range(num_experts): load_weight(param, w2[expert_id], expert_id=expert_id, shard_id='down') - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ @@ -366,8 +367,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype) -> None: self.dtype = dtype def preprocess_input(self, - input_ids: List[int], - input_multimodals: List[Dict[str, Any]] = None, + input_ids: list[int], + input_multimodals: list[dict[str, Any]] = None, **kwargs) -> PreprocessInputResult: """Prepare multimodal input.""" if input_multimodals is None or len(input_multimodals) == 0: diff --git a/lmdeploy/pytorch/models/interns1_pro_ts.py b/lmdeploy/pytorch/models/interns1_pro_ts.py index 48ba00fcef..1bc4ecd910 100644 --- a/lmdeploy/pytorch/models/interns1_pro_ts.py +++ b/lmdeploy/pytorch/models/interns1_pro_ts.py @@ -1,7 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import math -from typing import Optional, Tuple, Union import torch from torch import nn @@ -224,7 +223,7 @@ def forward_encoder(self, x): # conv1 # treat each channel as an independent sample and feed it into conv1 x = x.reshape(num_patch * C, 1, patch_len) - x = nn.functional.relu((self.conv(x))) # [B*C, D1, L] + x = nn.functional.relu(self.conv(x)) # [B*C, D1, L] x = x.permute(2, 0, 1) # [L, B*C, D1] x = self.pos_encoder(x) # [L, B*C, D1] @@ -272,11 +271,11 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, - time_series_signals: Optional[torch.FloatTensor] = None, - ts_lens: Optional[torch.Tensor] = None, - sr: Optional[torch.Tensor] = None, - time_series_embeds: Optional[torch.FloatTensor] = None, - ) -> Union[Tuple]: + time_series_signals: torch.FloatTensor | None = None, + ts_lens: torch.Tensor | None = None, + sr: torch.Tensor | None = None, + time_series_embeds: torch.FloatTensor | None = None, + ) -> tuple: if time_series_signals is None and time_series_embeds is None: raise ValueError('You have to specify time_series_signals or time_series_embeds') diff --git a/lmdeploy/pytorch/models/internvl.py b/lmdeploy/pytorch/models/internvl.py index 5b6c261dd2..5f4103fe92 100644 --- a/lmdeploy/pytorch/models/internvl.py +++ b/lmdeploy/pytorch/models/internvl.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch import torch.nn.functional as F @@ -277,7 +278,7 @@ def post_rms_norm(self, q: torch.Tensor, k: torch.Tensor, variance: torch.Tensor eps = self.config.layer_norm_eps return post_rms_norm(q, k, self.q_norm.weight, self.k_norm.weight, variance, eps, self.embed_dim, dtype) - def qkv_norm(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + def qkv_norm(self, q: torch.Tensor, k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: import lmdeploy.pytorch.distributed as dist q_shape = q.shape k_shape = k.shape @@ -432,7 +433,7 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, - pixel_values: Optional[torch.FloatTensor] = None, + pixel_values: torch.FloatTensor | None = None, ): """forward.""" assert pixel_values.dim() == 4 @@ -713,7 +714,7 @@ def extract_and_compress(self, pixel_values: torch.Tensor, input_ids: torch.Tens return vit_embeds, new_lang_embeds, new_input_ids, new_image_mask, new_seq_lengths - def update_forward_inputs(self, input_ids: torch.Tensor, new_seqlens: List[int], + def update_forward_inputs(self, input_ids: torch.Tensor, new_seqlens: list[int], context: StepContext) -> StepContext: """Update the forward inputs, position_ids and attention metadata.""" from lmdeploy.pytorch.model_inputs import ModelInputs @@ -758,7 +759,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, pixel_values: torch.Tensor = None, image_mask: torch.Tensor = None, @@ -808,7 +809,7 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], inputs_embeds: torch.Tensor = None, context: StepContext = None, ): @@ -921,7 +922,7 @@ def prepare_inputs_for_generation( image_token_id=image_token_id, context=context) - def load_lora_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], adapter_id: int): + def load_lora_weights(self, weights: Iterable[tuple[str, torch.Tensor]], adapter_id: int): """Load lora weights.""" if hasattr(self.language_model, 'load_lora_weights'): @@ -931,7 +932,7 @@ def load_lora_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], adapter return load_lora_weights(weights, adapter_id) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" lang_prefix = 'language_model.' @@ -976,8 +977,8 @@ def __init__(self, config: PretrainedConfig, dtype) -> None: self.vision_token_num = self.num_patches // 4 def preprocess_input(self, - input_ids: List[int], - input_multimodals: List[Dict[str, Any]] = None, + input_ids: list[int], + input_multimodals: list[dict[str, Any]] = None, **kwargs) -> PreprocessInputResult: """Prepare multimodal input.""" if input_multimodals is None or len(input_multimodals) == 0: diff --git a/lmdeploy/pytorch/models/internvl3_hf.py b/lmdeploy/pytorch/models/internvl3_hf.py index 7cd4cd940c..9a13d3c227 100644 --- a/lmdeploy/pytorch/models/internvl3_hf.py +++ b/lmdeploy/pytorch/models/internvl3_hf.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from collections.abc import Iterable +from typing import Any import torch import torch.nn.functional as F @@ -218,7 +219,7 @@ def post_rms_norm(self, q: torch.Tensor, k: torch.Tensor, variance: torch.Tensor eps = self.config.layer_norm_eps return post_rms_norm(q, k, self.q_norm.weight, self.k_norm.weight, variance, eps, self.embed_dim, dtype) - def qkv_norm(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + def qkv_norm(self, q: torch.Tensor, k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: import lmdeploy.pytorch.distributed as dist q_shape = q.shape k_shape = k.shape @@ -387,7 +388,7 @@ def get_input_embeddings(self): def forward( self, - pixel_values: Optional[torch.FloatTensor] = None, + pixel_values: torch.FloatTensor | None = None, ): """forward.""" assert pixel_values.dim() == 4 @@ -493,7 +494,7 @@ def get_input_embeddings(self): def get_image_features( self, pixel_values: torch.FloatTensor, - vision_feature_layer: Union[int, List[int]], + vision_feature_layer: int | list[int], vision_feature_select_strategy: str, **kwargs, ): @@ -503,7 +504,7 @@ def get_image_features( Args: pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) The tensors corresponding to the input images. - vision_feature_layer (`int` or `List[int]`): + vision_feature_layer (`int` or `list[int]`): Layer index or list of layer indices to extract features from. Returns: vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`. @@ -574,7 +575,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, pixel_values: torch.Tensor = None, image_mask: torch.Tensor = None, @@ -610,7 +611,7 @@ def forward( def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], inputs_embeds: torch.Tensor = None, context: StepContext = None, ): @@ -653,7 +654,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_lora_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], adapter_id: int): + def load_lora_weights(self, weights: Iterable[tuple[str, torch.Tensor]], adapter_id: int): """Load lora weights.""" if hasattr(self.model.language_model, 'load_lora_weights'): @@ -674,7 +675,7 @@ def rename_weight(cls, name: str) -> str: return name[len('model.'):] return name - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" lang_prefix = 'language_model.' @@ -720,8 +721,8 @@ def __init__(self, config: PretrainedConfig, dtype) -> None: self.dtype = dtype def preprocess_input(self, - input_ids: List[int], - input_multimodals: List[Dict[str, Any]] = None, + input_ids: list[int], + input_multimodals: list[dict[str, Any]] = None, **kwargs) -> PreprocessInputResult: """Prepare multimodal input.""" if input_multimodals is None or len(input_multimodals) == 0: diff --git a/lmdeploy/pytorch/models/internvl_patch.py b/lmdeploy/pytorch/models/internvl_patch.py index 5f25c0dd85..1a53bc68ce 100644 --- a/lmdeploy/pytorch/models/internvl_patch.py +++ b/lmdeploy/pytorch/models/internvl_patch.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional import torch import torch.nn.functional as F @@ -65,7 +64,7 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, - pixel_values: Optional[torch.FloatTensor] = None, + pixel_values: torch.FloatTensor | None = None, ): if len(pixel_values.shape) != 4: raise ValueError(f'wrong pixel_values size: {pixel_values.shape}') diff --git a/lmdeploy/pytorch/models/llama.py b/lmdeploy/pytorch/models/llama.py index 6ab07b9c11..c7fd6354ac 100644 --- a/lmdeploy/pytorch/models/llama.py +++ b/lmdeploy/pytorch/models/llama.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -8,8 +9,13 @@ from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config -from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj, - build_rowwise_linear) +from lmdeploy.pytorch.nn.linear import ( + build_down_linear, + build_gateup_linear, + build_o_proj, + build_qkv_proj, + build_rowwise_linear, +) from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .utils.cudagraph import CudaGraphMixin @@ -63,8 +69,8 @@ def __init__(self, config: LlamaConfig, dtype: torch.dtype = None, device: torch def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -176,9 +182,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): @@ -223,7 +229,7 @@ def __init__(self, config: LlamaConfig, dtype: torch.dtype = None, device: torch LlamaDecoderLayer(config, layer_idx, dtype=dtype, device=device) for layer_idx in range(config.num_hidden_layers) ]) - self.aux_hidden_state_layers: Tuple[int] = getattr(config, 'aux_hidden_state_layers', tuple()) + self.aux_hidden_state_layers: tuple[int] = getattr(config, 'aux_hidden_state_layers', tuple()) # build norm self.norm = RMSNorm(config.hidden_size, config.rms_norm_eps, dtype=dtype, device=device) @@ -233,10 +239,10 @@ def __init__(self, config: LlamaConfig, dtype: torch.dtype = None, device: torch def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """Rewrite of LlamaModel.forward.""" @@ -317,7 +323,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -346,7 +352,7 @@ def get_input_embeddings(self): """Get input embeddings.""" return self.model.get_input_embeddings() - def get_outputs_cudagraph(self, output_buffers: Dict[str, torch.Tensor], input_ids: torch.Tensor, **kwargs): + def get_outputs_cudagraph(self, output_buffers: dict[str, torch.Tensor], input_ids: torch.Tensor, **kwargs): """Get outputs from buffers.""" num_tokens = input_ids.size(-1) outputs = dict() @@ -357,8 +363,8 @@ def get_outputs_cudagraph(self, output_buffers: Dict[str, torch.Tensor], input_i def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -384,7 +390,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/llama4.py b/lmdeploy/pytorch/models/llama4.py index 4b3c2196bc..5ffc42c8e9 100644 --- a/lmdeploy/pytorch/models/llama4.py +++ b/lmdeploy/pytorch/models/llama4.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -10,8 +11,12 @@ from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config -from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_merged_colwise_linear, build_qkv_proj, - build_rowwise_linear) +from lmdeploy.pytorch.nn.linear import ( + build_colwise_linear, + build_merged_colwise_linear, + build_qkv_proj, + build_rowwise_linear, +) from lmdeploy.pytorch.nn.moe import build_fused_moe from lmdeploy.pytorch.nn.rotary_embedding import get_rope_theta from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight @@ -77,8 +82,8 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """forward.""" @@ -271,9 +276,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): """forward.""" @@ -331,7 +336,7 @@ def forward( self, inputs_embeds: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, **kwargs, ): @@ -382,7 +387,7 @@ def forward( self, inputs_embeds: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, **kwargs, ): @@ -482,7 +487,7 @@ def vision_apply_rotary_emb( query: torch.Tensor, key: torch.Tensor, freqs_ci: torch.Tensor, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: query_ = torch.view_as_complex(query.float().reshape(*query.shape[:-1], -1, 2)) key_ = torch.view_as_complex(key.float().reshape(*key.shape[:-1], -1, 2)) freqs_ci = reshape_for_broadcast(freqs_ci=freqs_ci, query=query_) # freqs_ci[:,:,None,:] @@ -849,7 +854,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, pixel_values: torch.FloatTensor = None, image_mask: torch.Tensor = None, @@ -882,8 +887,8 @@ def get_logits(self, hidden_states: torch.Tensor): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -918,7 +923,7 @@ def prepare_inputs_for_generation( image_mask=image_mask, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" def _load_experts_bf16(name, loaded_weight): @@ -1016,8 +1021,8 @@ def __init__(self, config: Llama4Config, dtype) -> None: self.vision_config = config.vision_config def preprocess_input(self, - input_ids: List[int], - input_multimodals: List[Dict[str, Any]] = None, + input_ids: list[int], + input_multimodals: list[dict[str, Any]] = None, **kwargs) -> PreprocessInputResult: """Prepare multimodal input.""" diff --git a/lmdeploy/pytorch/models/llama_eagle.py b/lmdeploy/pytorch/models/llama_eagle.py index d581e1ff3d..02ddd91d10 100644 --- a/lmdeploy/pytorch/models/llama_eagle.py +++ b/lmdeploy/pytorch/models/llama_eagle.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -62,11 +63,11 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - previous_hidden_states: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, + previous_hidden_states: torch.FloatTensor | None = None, ): """Rewrite of LlamaModel.forward.""" # token embedding @@ -126,7 +127,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, target_hidden_states: torch.Tensor = None, @@ -145,8 +146,8 @@ def forward( def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -205,7 +206,7 @@ def get_input_embeddings(self): """Get input embeddings.""" return self.model.get_input_embeddings() - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" stacked_params_mapping = [ # (param_name, shard_name, shard_id) diff --git a/lmdeploy/pytorch/models/llama_eagle3.py b/lmdeploy/pytorch/models/llama_eagle3.py index b37a2337bf..eb683c52ac 100644 --- a/lmdeploy/pytorch/models/llama_eagle3.py +++ b/lmdeploy/pytorch/models/llama_eagle3.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -51,8 +52,8 @@ def forward( self, embeds: torch.Tensor, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, attn_metadata: Any = None, ): @@ -109,11 +110,11 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - previous_hidden_states: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, + previous_hidden_states: torch.FloatTensor | None = None, ): """Rewrite of LlamaModel.forward.""" # token embedding @@ -189,7 +190,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, target_hidden_states: torch.Tensor = None, @@ -208,8 +209,8 @@ def forward( def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -262,7 +263,7 @@ def fill_buffers_cudagraph(self, graph_meta: CudaGraphMeta, **kwargs): return new_inputs - def get_outputs_cudagraph(self, output_buffers: Dict[str, torch.Tensor], input_ids: torch.Tensor, **kwargs): + def get_outputs_cudagraph(self, output_buffers: dict[str, torch.Tensor], input_ids: torch.Tensor, **kwargs): """Get outputs from buffers.""" num_tokens = input_ids.size(-1) outputs = dict() @@ -274,7 +275,7 @@ def get_input_embeddings(self): """Get input embeddings.""" return self.model.get_input_embeddings() - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" stacked_params_mapping = [ # (param_name, shard_name, shard_id) diff --git a/lmdeploy/pytorch/models/llava.py b/lmdeploy/pytorch/models/llava.py index e87242df4c..cb14bc443d 100644 --- a/lmdeploy/pytorch/models/llava.py +++ b/lmdeploy/pytorch/models/llava.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch import torch.nn.functional as F @@ -174,8 +175,8 @@ def __init__(self, config, dtype: torch.dtype = None, device: torch.device = Non def forward( self, hidden_states, - attention_mask: Optional[torch.Tensor] = None, - causal_attention_mask: Optional[torch.Tensor] = None, + attention_mask: torch.Tensor | None = None, + causal_attention_mask: torch.Tensor | None = None, ): """forward.""" # qkv proj @@ -287,8 +288,8 @@ def __init__(self, config, dtype: torch.dtype = None, device: torch.device = Non def forward( self, inputs_embeds, - attention_mask: Optional[torch.Tensor] = None, - causal_attention_mask: Optional[torch.Tensor] = None, + attention_mask: torch.Tensor | None = None, + causal_attention_mask: torch.Tensor | None = None, vision_feature_layer: int = -1, ): """forward.""" @@ -414,7 +415,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, pixel_values: torch.Tensor = None, image_mask: torch.Tensor = None, @@ -449,7 +450,7 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], inputs_embeds: torch.Tensor = None, context: StepContext = None, ): @@ -492,7 +493,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" stacked_params_mapping = [ @@ -539,8 +540,8 @@ def __init__(self, config: PretrainedConfig, dtype) -> None: self.dtype = dtype def preprocess_input(self, - input_ids: List[int], - input_multimodals: List[Dict[str, Any]] = None, + input_ids: list[int], + input_multimodals: list[dict[str, Any]] = None, **kwargs) -> PreprocessInputResult: """Prepare multimodal input.""" if input_multimodals is None or len(input_multimodals) == 0: @@ -721,7 +722,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, pixel_values: torch.Tensor = None, image_sizes: torch.Tensor = None, @@ -762,7 +763,7 @@ def get_input_processor(self) -> BaseModelInputProcessor: def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], inputs_embeds: torch.Tensor = None, context: StepContext = None, ): @@ -817,8 +818,8 @@ def __init__(self, config: PretrainedConfig, dtype) -> None: self.dtype = dtype def preprocess_input(self, - input_ids: List[int], - input_multimodals: List[Dict[str, Any]] = None, + input_ids: list[int], + input_multimodals: list[dict[str, Any]] = None, **kwargs) -> PreprocessInputResult: """Prepare multimodal input.""" if input_multimodals is None or len(input_multimodals) == 0: diff --git a/lmdeploy/pytorch/models/minicpm3.py b/lmdeploy/pytorch/models/minicpm3.py index 6513a3ff46..9dec51f076 100644 --- a/lmdeploy/pytorch/models/minicpm3.py +++ b/lmdeploy/pytorch/models/minicpm3.py @@ -1,7 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. import math -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -11,8 +12,12 @@ from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.nn import Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding from lmdeploy.pytorch.nn.linear import build_colwise_linear, build_merged_colwise_linear, build_rowwise_linear -from lmdeploy.pytorch.nn.rotary_embedding import (ApplyRotaryEmb, LongRoPEScalingParameters, get_rope_parameters, - get_rope_theta) +from lmdeploy.pytorch.nn.rotary_embedding import ( + ApplyRotaryEmb, + LongRoPEScalingParameters, + get_rope_parameters, + get_rope_theta, +) from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .utils.cudagraph import CudaGraphMixin @@ -107,8 +112,8 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -243,8 +248,8 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, attn_metadata: Any = None, ): @@ -328,10 +333,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """Rewrite of LlamaModel.forward.""" @@ -396,7 +401,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -424,8 +429,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -451,7 +456,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/minicpmv26.py b/lmdeploy/pytorch/models/minicpmv26.py index fa049bbb98..f72549f7dc 100644 --- a/lmdeploy/pytorch/models/minicpmv26.py +++ b/lmdeploy/pytorch/models/minicpmv26.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -60,8 +61,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -171,9 +172,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): @@ -228,10 +229,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """Rewrite of LlamaModel.forward.""" @@ -299,7 +300,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -330,8 +331,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -357,7 +358,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/mistral.py b/lmdeploy/pytorch/models/mistral.py index 4c8304400f..e3baee0417 100644 --- a/lmdeploy/pytorch/models/mistral.py +++ b/lmdeploy/pytorch/models/mistral.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -8,8 +9,13 @@ from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config -from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj, - build_rowwise_linear) +from lmdeploy.pytorch.nn.linear import ( + build_down_linear, + build_gateup_linear, + build_o_proj, + build_qkv_proj, + build_rowwise_linear, +) from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .utils.cudagraph import CudaGraphMixin @@ -64,8 +70,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -175,9 +181,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): @@ -232,10 +238,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """Rewrite of LlamaModel.forward.""" @@ -308,7 +314,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -333,8 +339,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -360,7 +366,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/mixtral.py b/lmdeploy/pytorch/models/mixtral.py index 434a31f234..f770877f91 100644 --- a/lmdeploy/pytorch/models/mixtral.py +++ b/lmdeploy/pytorch/models/mixtral.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -62,8 +63,8 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -178,9 +179,9 @@ def __init__(self, config: Any, layer_idx: int, dtype: torch.dtype = None, devic def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): @@ -231,10 +232,10 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """forward.""" if inputs_embeds is None: @@ -288,7 +289,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -312,8 +313,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -329,7 +330,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/patch.py b/lmdeploy/pytorch/models/patch.py index 0842904129..92d29e1d13 100644 --- a/lmdeploy/pytorch/models/patch.py +++ b/lmdeploy/pytorch/models/patch.py @@ -6,7 +6,7 @@ import os.path as osp import re import sys -from typing import Any, Dict +from typing import Any import torch from transformers.configuration_utils import PretrainedConfig @@ -21,7 +21,7 @@ logger = get_logger('lmdeploy') -def _get_rewrite_qualname(origin_qualname: str, module_map: Dict[str, str]) -> str: +def _get_rewrite_qualname(origin_qualname: str, module_map: dict[str, str]) -> str: """Get rewrite module from origin module name. Args: @@ -58,7 +58,7 @@ def _class_from_qualname(qualname: str) -> Any: return cls_type -def _find_rewrite_module_qualname(model, module_map: Dict[str, str]): +def _find_rewrite_module_qualname(model, module_map: dict[str, str]): """Find rewrite module.""" module_name = inspect.getmodule(model).__name__ class_name = model.__class__.__name__ @@ -93,7 +93,7 @@ def _find_submodulename(): return rewrite_qualname -def get_rewrite_cls(model: torch.nn.Module, module_map: Dict[str, str] = None): +def get_rewrite_cls(model: torch.nn.Module, module_map: dict[str, str] = None): """Get rewrite cls.""" if module_map is None: module_map = _get_module_map() @@ -133,13 +133,13 @@ def update_custom_module_map(module_map_path: str): if hasattr(custom_mod, 'MODULE_MAP'): has_map = True mod_map = custom_mod.MODULE_MAP - assert isinstance(mod_map, Dict) + assert isinstance(mod_map, dict) new_mod_map.update(mod_map) if hasattr(custom_mod, 'CUSTOM_MODULE_MAP'): has_map = True mod_map = custom_mod.CUSTOM_MODULE_MAP - assert isinstance(mod_map, Dict) + assert isinstance(mod_map, dict) new_mod_map.update(mod_map) if not has_map: @@ -216,7 +216,7 @@ def build_patched_model(config: ModelConfig, device: torch.device = None, build_ @torch.inference_mode() def add_adapters(model: torch.nn.Module, - adapters: Dict[str, str], + adapters: dict[str, str], dtype: torch.dtype = torch.float16, device: torch.device = None): """Add adapters.""" diff --git a/lmdeploy/pytorch/models/phi3.py b/lmdeploy/pytorch/models/phi3.py index c49b01737d..2f9a34a866 100644 --- a/lmdeploy/pytorch/models/phi3.py +++ b/lmdeploy/pytorch/models/phi3.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -64,8 +65,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -175,9 +176,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): @@ -232,10 +233,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """Rewrite of LlamaModel.forward.""" @@ -299,7 +300,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -320,8 +321,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -347,7 +348,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm diff --git a/lmdeploy/pytorch/models/phi3_moe.py b/lmdeploy/pytorch/models/phi3_moe.py index 2f60b16788..942e312757 100644 --- a/lmdeploy/pytorch/models/phi3_moe.py +++ b/lmdeploy/pytorch/models/phi3_moe.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -9,8 +10,12 @@ from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, LayerNorm, RopeType from lmdeploy.pytorch.nn.linear import build_qkv_proj, build_rowwise_linear from lmdeploy.pytorch.nn.moe import build_fused_moe -from lmdeploy.pytorch.nn.rotary_embedding import (LongRoPEScalingParameters, build_rotary_embedding, - get_rope_parameters, get_rope_theta) +from lmdeploy.pytorch.nn.rotary_embedding import ( + LongRoPEScalingParameters, + build_rotary_embedding, + get_rope_parameters, + get_rope_theta, +) from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .utils.cudagraph import CudaGraphMixin @@ -112,8 +117,8 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -223,9 +228,9 @@ def __init__(self, config: Any, layer_idx: int, dtype: torch.dtype = None, devic def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): if residual is None: @@ -304,10 +309,10 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """forward.""" if inputs_embeds is None: @@ -360,7 +365,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -384,8 +389,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -401,7 +406,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/phi3_v.py b/lmdeploy/pytorch/models/phi3_v.py index c6804d5586..0e83a8edf0 100644 --- a/lmdeploy/pytorch/models/phi3_v.py +++ b/lmdeploy/pytorch/models/phi3_v.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -236,13 +237,13 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - pixel_values: Optional[torch.FloatTensor] = None, - image_sizes: Optional[torch.LongTensor] = None, + pixel_values: torch.FloatTensor | None = None, + image_sizes: torch.LongTensor | None = None, image_mask: torch.Tensor = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """Rewrite of LlamaModel.forward.""" @@ -284,7 +285,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, pixel_values: torch.Tensor = None, image_sizes: torch.Tensor = None, @@ -307,7 +308,7 @@ def forward( def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], inputs_embeds: torch.Tensor = None, context: StepContext = None, ): @@ -333,7 +334,7 @@ def prepare_inputs_for_generation( return output - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" import itertools @@ -362,8 +363,8 @@ def __init__(self, config: PretrainedConfig, dtype) -> None: self.dtype = dtype def preprocess_input(self, - input_ids: List[int], - input_multimodals: List[Dict[str, Any]] = None, + input_ids: list[int], + input_multimodals: list[dict[str, Any]] = None, **kwargs) -> PreprocessInputResult: """Prepare multimodal input.""" if input_multimodals is None or len(input_multimodals) == 0: diff --git a/lmdeploy/pytorch/models/q_modules.py b/lmdeploy/pytorch/models/q_modules.py index 36f9506327..c3ad77db34 100644 --- a/lmdeploy/pytorch/models/q_modules.py +++ b/lmdeploy/pytorch/models/q_modules.py @@ -5,8 +5,12 @@ import torch import torch.nn as nn -from ..kernels.w8a8_triton_kernels import (matmul_kernel_dynamic_quant, per_channel_quant, per_token_quant_int8, - rms_norm_dynamic_quant) +from ..kernels.w8a8_triton_kernels import ( + matmul_kernel_dynamic_quant, + per_channel_quant, + per_token_quant_int8, + rms_norm_dynamic_quant, +) @dataclass diff --git a/lmdeploy/pytorch/models/qwen.py b/lmdeploy/pytorch/models/qwen.py index 6f7020abf4..650222c4b3 100644 --- a/lmdeploy/pytorch/models/qwen.py +++ b/lmdeploy/pytorch/models/qwen.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -8,8 +9,13 @@ from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding -from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj, - build_rowwise_linear) +from lmdeploy.pytorch.nn.linear import ( + build_down_linear, + build_gateup_linear, + build_o_proj, + build_qkv_proj, + build_rowwise_linear, +) from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .utils.cudagraph import CudaGraphMixin @@ -65,8 +71,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -183,9 +189,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): @@ -245,10 +251,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """forward.""" @@ -317,7 +323,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -342,8 +348,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -369,7 +375,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/qwen2.py b/lmdeploy/pytorch/models/qwen2.py index ddd5b4dec9..963c5d33b4 100644 --- a/lmdeploy/pytorch/models/qwen2.py +++ b/lmdeploy/pytorch/models/qwen2.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -61,8 +62,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -172,9 +173,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): @@ -231,10 +232,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """Rewrite of LlamaModel.forward.""" @@ -303,7 +304,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -324,8 +325,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -351,7 +352,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/qwen2_5_vl.py b/lmdeploy/pytorch/models/qwen2_5_vl.py index 9c19a7de21..cb547a5793 100644 --- a/lmdeploy/pytorch/models/qwen2_5_vl.py +++ b/lmdeploy/pytorch/models/qwen2_5_vl.py @@ -2,7 +2,8 @@ # adapted from: # https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py -from typing import Any, Dict, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch import torch.nn.functional as F @@ -118,7 +119,7 @@ def __init__(self, ) def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor]) -> torch.Tensor: + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor]) -> torch.Tensor: seq_length = hidden_states.shape[0] # qkv proj qkv_states = self.qkv(hidden_states) @@ -198,7 +199,7 @@ def __init__(self, def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: Optional[torch.Tensor] = None) -> torch.Tensor: + rotary_pos_emb: torch.Tensor | None = None) -> torch.Tensor: hidden_states = hidden_states + self.attn( self.norm1(hidden_states), cu_seqlens=cu_seqlens, @@ -341,7 +342,7 @@ def forward(self, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor, window_index: torch.Tensor = None, - cu_window_seqlens: List = None) -> torch.Tensor: + cu_window_seqlens: list = None) -> torch.Tensor: """forward.""" hidden_states = self.patch_embed(hidden_states) @@ -417,7 +418,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, mrope_position_ids: torch.Tensor = None, @@ -425,7 +426,7 @@ def forward( vis_cu_seqlens: torch.Tensor = None, vis_pos_emb: torch.Tensor = None, window_index: torch.Tensor = None, - cu_window_seqlens: List = None, + cu_window_seqlens: list = None, image_mask: torch.Tensor = None, **kwargs, ): @@ -458,8 +459,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -526,7 +527,7 @@ def prepare_inputs_for_generation( image_mask=image_mask, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ @@ -673,8 +674,8 @@ def _update_model_meta_prefilling(self, context: StepContext): return new_model_metas def update_model_metas(self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None): """Update model meta.""" if context.is_decoding: @@ -687,7 +688,7 @@ def get_input_processor(self) -> BaseModelInputProcessor: return self.input_processor -InputMultiModalType = List[Dict[str, Any]] +InputMultiModalType = list[dict[str, Any]] class Qwen2_5_VLInputProcessor(BaseModelInputProcessor): @@ -697,8 +698,8 @@ def __init__(self, config: PretrainedConfig) -> None: self.config = config def preprocess_input(self, - input_ids: List[int], - input_multimodals: List[Dict[str, Any]] = None, + input_ids: list[int], + input_multimodals: list[dict[str, Any]] = None, **kwargs) -> PreprocessInputResult: """Prepare multimodal input.""" if input_multimodals is None or len(input_multimodals) == 0: diff --git a/lmdeploy/pytorch/models/qwen2_moe.py b/lmdeploy/pytorch/models/qwen2_moe.py index c86608fc12..aeb7b9efea 100644 --- a/lmdeploy/pytorch/models/qwen2_moe.py +++ b/lmdeploy/pytorch/models/qwen2_moe.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch import torch.nn.functional as F @@ -66,8 +67,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -274,9 +275,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): @@ -333,10 +334,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """Rewrite of LlamaModel.forward.""" @@ -405,7 +406,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -426,8 +427,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -453,8 +454,8 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter], - expert_params_mapping: List): + def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter], + expert_params_mapping: list): """Load weight experts.""" for (param_name, weight_name, expert_id, shard_id) in expert_params_mapping: if weight_name not in name: @@ -467,7 +468,7 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di param = params_dict[name] load_weight(param, loaded_weight) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/qwen2_reward.py b/lmdeploy/pytorch/models/qwen2_reward.py index f65fa8fe7a..aaefcc36ed 100644 --- a/lmdeploy/pytorch/models/qwen2_reward.py +++ b/lmdeploy/pytorch/models/qwen2_reward.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -54,7 +55,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -84,8 +85,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -103,7 +104,7 @@ def prepare_inputs_for_generation( # inputs_embeds=inputs_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/qwen2_vl.py b/lmdeploy/pytorch/models/qwen2_vl.py index 605f8ded76..1d3bfd0bc1 100644 --- a/lmdeploy/pytorch/models/qwen2_vl.py +++ b/lmdeploy/pytorch/models/qwen2_vl.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple +from collections.abc import Callable, Iterable +from typing import Any import torch from torch import nn @@ -9,17 +10,28 @@ from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor -from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, FlashAttention, LayerNorm, RMSNorm, SiluAndMul, - build_rotary_embedding_from_config) -from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_merged_colwise_linear, build_qkv_proj, - build_rowwise_linear) +from lmdeploy.pytorch.nn import ( + ApplyRotaryEmb, + Attention, + FlashAttention, + LayerNorm, + RMSNorm, + SiluAndMul, + build_rotary_embedding_from_config, +) +from lmdeploy.pytorch.nn.linear import ( + build_colwise_linear, + build_merged_colwise_linear, + build_qkv_proj, + build_rowwise_linear, +) from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .utils.cudagraph import CudaGraphMeta, CudaGraphMixin from .utils.model import DeployModelMixinV1, build_embedding, vlm_model -def _apply_mrope_selection(hidden_states: torch.Tensor, mrope_position_ids: torch.Tensor, mrope_section: List[int], +def _apply_mrope_selection(hidden_states: torch.Tensor, mrope_position_ids: torch.Tensor, mrope_section: list[int], position_ids: torch.Tensor, rotary_emb_func: Callable): _mrope_position_ids = torch.zeros(3, position_ids.shape[-1], dtype=position_ids.dtype, device=position_ids.device) _mrope_position_ids[:, :mrope_position_ids.shape[-1]] = mrope_position_ids @@ -87,8 +99,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -198,9 +210,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): @@ -258,10 +270,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, mrope_position_ids: torch.LongTensor = None, ): """Rewrite of LlamaModel.forward.""" @@ -393,7 +405,7 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: is_tp=True) def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor]) -> torch.Tensor: + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor]) -> torch.Tensor: seq_length = hidden_states.shape[0] # qkv proj qkv_states = self.qkv(hidden_states) @@ -480,7 +492,7 @@ def forward(self, hidden_states, cu_seqlens, rotary_pos_emb, - residual: Optional[torch.Tensor] = None) -> torch.Tensor: + residual: torch.Tensor | None = None) -> torch.Tensor: if residual is None: residual = hidden_states hidden_states = self.norm1(hidden_states) @@ -638,7 +650,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, mrope_position_ids: torch.Tensor = None, @@ -674,8 +686,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -729,7 +741,7 @@ def prepare_inputs_for_generation( image_mask=image_mask, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ @@ -876,8 +888,8 @@ def _update_model_meta_prefilling(self, context: StepContext): return new_model_metas def update_model_metas(self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None): """Update model meta.""" if context.is_decoding: @@ -890,7 +902,7 @@ def get_input_processor(self) -> BaseModelInputProcessor: return self.input_processor -InputMultiModalType = List[Dict[str, Any]] +InputMultiModalType = list[dict[str, Any]] class Qwen2VLInputProcessor(BaseModelInputProcessor): @@ -900,8 +912,8 @@ def __init__(self, config: PretrainedConfig) -> None: self.config = config def preprocess_input(self, - input_ids: List[int], - input_multimodals: List[Dict[str, Any]] = None, + input_ids: list[int], + input_multimodals: list[dict[str, Any]] = None, **kwargs) -> PreprocessInputResult: """Prepare multimodal input.""" if input_multimodals is None or len(input_multimodals) == 0: diff --git a/lmdeploy/pytorch/models/qwen3.py b/lmdeploy/pytorch/models/qwen3.py index e69efefda5..11326d3225 100644 --- a/lmdeploy/pytorch/models/qwen3.py +++ b/lmdeploy/pytorch/models/qwen3.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -77,8 +78,8 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -207,9 +208,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): @@ -274,10 +275,10 @@ def __init__(self, def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """Rewrite of LlamaModel.forward.""" @@ -347,7 +348,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -368,8 +369,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -395,7 +396,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/qwen3_5.py b/lmdeploy/pytorch/models/qwen3_5.py index 59f373f075..8930acf978 100644 --- a/lmdeploy/pytorch/models/qwen3_5.py +++ b/lmdeploy/pytorch/models/qwen3_5.py @@ -1,7 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. +from collections.abc import Iterable from functools import lru_cache -from typing import Any, Iterable, List, Tuple +from typing import Any import numpy as np import torch @@ -16,8 +17,13 @@ from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, LayerNorm, RMSNorm, SiluAndMul from lmdeploy.pytorch.nn.gated_delta import CausalConv1d, GatedDelta, GatedDeltaMeta, build_rmsnorm_gated -from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_merged_colwise_linear, build_o_proj, build_qkv_proj, - build_rowwise_linear) +from lmdeploy.pytorch.nn.linear import ( + build_colwise_linear, + build_merged_colwise_linear, + build_o_proj, + build_qkv_proj, + build_rowwise_linear, +) from lmdeploy.pytorch.nn.rotary_embedding import get_rope_parameters from lmdeploy.pytorch.weight_loader.model_weight_loader import default_weight_loader, load_weight @@ -243,7 +249,7 @@ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: return rotary_pos_emb # copy from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/qwen3_vl.py#L474 - def fast_pos_embed_interpolate(self, grid_thw: List[List[int]]) -> torch.Tensor: + def fast_pos_embed_interpolate(self, grid_thw: list[list[int]]) -> torch.Tensor: num_grid_per_side = self.num_grid_per_side m_size = self.spatial_merge_size hidden_dim = self.pos_embed.embedding_dim @@ -486,14 +492,14 @@ def fix_zba_ordering(self, mixed_zba: torch.Tensor): z = z.unflatten(-1, (-1, self.head_v_dim)) return z, b, a - def _load_state(self, past_key_value: Tuple[torch.Tensor, torch.Tensor], gated_delta_meta: GatedDeltaMeta): + def _load_state(self, past_key_value: tuple[torch.Tensor, torch.Tensor], gated_delta_meta: GatedDeltaMeta): """Load states from cache.""" return gated_delta_util.load_state(past_key_value=past_key_value, gated_delta_meta=gated_delta_meta) def forward( self, hidden_states: torch.Tensor, - past_key_value: Tuple[torch.Tensor, torch.Tensor], + past_key_value: tuple[torch.Tensor, torch.Tensor], gated_delta_meta: GatedDeltaMeta, ): """forward.""" @@ -629,8 +635,8 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Tuple[torch.Tensor, torch.Tensor], + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor, torch.Tensor], attn_metadata: Any, ): """Rewrite of LlamaAttention.forward.""" @@ -727,8 +733,8 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: List[torch.FloatTensor], + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor], residual: torch.Tensor | None, attn_metadata: Any, gated_delta_meta: GatedDeltaMeta, @@ -896,7 +902,7 @@ def forward( self, input_ids: torch.LongTensor, position_ids: torch.LongTensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any, state_ids: torch.Tensor, inputs_embeds: torch.Tensor | None = None, @@ -970,7 +976,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any, state_ids: torch.Tensor, inputs_embeds: torch.Tensor | None = None, @@ -1068,7 +1074,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any, state_ids: torch.Tensor, inputs_embeds: torch.Tensor | None = None, @@ -1115,7 +1121,7 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], inputs_embeds: torch.Tensor | None = None, context: StepContext | None = None, ): @@ -1188,7 +1194,7 @@ def prepare_inputs_for_generation( pos_embeds=pos_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" def __skip_layers(name): @@ -1374,7 +1380,7 @@ def _update_model_meta_prefilling(self, context: StepContext): return new_model_metas - def update_model_metas(self, past_key_values: List[List[torch.Tensor]], inputs_embeds: torch.Tensor | None, + def update_model_metas(self, past_key_values: list[list[torch.Tensor]], inputs_embeds: torch.Tensor | None, context: StepContext): """Update model meta.""" if context.is_decoding: diff --git a/lmdeploy/pytorch/models/qwen3_5_moe.py b/lmdeploy/pytorch/models/qwen3_5_moe.py index c475b873a5..3dca50cdc8 100644 --- a/lmdeploy/pytorch/models/qwen3_5_moe.py +++ b/lmdeploy/pytorch/models/qwen3_5_moe.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, Iterable, List, Tuple +from collections.abc import Iterable import torch import torch.distributed as dist @@ -16,8 +16,16 @@ from .patch import add_prefix, get_build_model_context from .qwen2_5_vl import Qwen2_5_VLInputProcessor as Qwen3_5MoeInputProcessor -from .qwen3_5 import (Qwen3_5Attention, Qwen3_5DecoderLayer, Qwen3_5ForConditionalGeneration, Qwen3_5GatedDeltaNet, - Qwen3_5MLP, Qwen3_5Model, Qwen3_5TextModel, Qwen3_5TextRotaryEmbedding) +from .qwen3_5 import ( + Qwen3_5Attention, + Qwen3_5DecoderLayer, + Qwen3_5ForConditionalGeneration, + Qwen3_5GatedDeltaNet, + Qwen3_5MLP, + Qwen3_5Model, + Qwen3_5TextModel, + Qwen3_5TextRotaryEmbedding, +) from .qwen3_5 import Qwen3_5VisionModel as Qwen3_5MoeVisionModel @@ -265,8 +273,8 @@ def __init__(self, bm_ctx = get_build_model_context() self.enable_return_routed_experts = bm_ctx.enable_return_routed_experts - def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter], - expert_params_mapping: List): + def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter], + expert_params_mapping: list): """Load weight experts.""" # this func is not used, but it has same layout with tranformers implementation # so I will keep it for now. @@ -282,7 +290,7 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di param = params_dict[name] load_weight(param, loaded_weight) - def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter]): + def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter]): """Load weight of fused expert weights.""" num_experts = self.config.text_config.num_experts fused_gateup_name = 'gate_up_proj' @@ -305,7 +313,7 @@ def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, par w2 = loaded_weight[expert_id] load_weight(param, w2, expert_id=expert_id, shard_id='down') - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" def __skip_layers(name): diff --git a/lmdeploy/pytorch/models/qwen3_moe.py b/lmdeploy/pytorch/models/qwen3_moe.py index 5157199175..158cc5ecd3 100644 --- a/lmdeploy/pytorch/models/qwen3_moe.py +++ b/lmdeploy/pytorch/models/qwen3_moe.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -94,8 +95,8 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -317,9 +318,9 @@ def __init__( def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, all_routed_experts: torch.Tensor = None, ): @@ -396,10 +397,10 @@ def __init__(self, def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, all_routed_experts: torch.Tensor = None, ): """Rewrite of LlamaModel.forward.""" @@ -483,7 +484,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -518,8 +519,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -545,8 +546,8 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter], - expert_params_mapping: List): + def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter], + expert_params_mapping: list): """Load weight experts.""" # load fused weights if any([k in name for k in ['fused_w1w3', 'fused_w2']]): @@ -563,7 +564,7 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di param = params_dict[name] load_weight(param, loaded_weight) - def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter]): + def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter]): """Load weight of fused expert weights.""" num_experts = self.config.num_experts fused_gateup_name = 'fused_w1w3' @@ -588,7 +589,7 @@ def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, par w2 = loaded_weight.narrow(dim=0, start=chunk_size * expert_id, length=chunk_size) load_weight(param, w2, expert_id=expert_id, shard_id='down') - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/qwen3_next.py b/lmdeploy/pytorch/models/qwen3_next.py index 4c56c01aa3..3f70dc5cf9 100644 --- a/lmdeploy/pytorch/models/qwen3_next.py +++ b/lmdeploy/pytorch/models/qwen3_next.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch import torch.nn.functional as F @@ -13,8 +14,13 @@ from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config from lmdeploy.pytorch.nn.gated_delta import CausalConv1d, GatedDelta, GatedDeltaMeta, build_rmsnorm_gated -from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_merged_colwise_linear, build_o_proj, build_qkv_proj, - build_rowwise_linear) +from lmdeploy.pytorch.nn.linear import ( + build_colwise_linear, + build_merged_colwise_linear, + build_o_proj, + build_qkv_proj, + build_rowwise_linear, +) from lmdeploy.pytorch.nn.moe import SoftmaxTopK, build_fused_moe from lmdeploy.pytorch.weight_loader.model_weight_loader import default_weight_loader, load_weight @@ -142,14 +148,14 @@ def fix_query_key_value_ordering(self, mixed_qkvz: torch.Tensor, mixed_ba: torch a = a.float().flatten(-2, -1) return mixed_qkv, z, b, a - def _load_state(self, past_key_value: Tuple[torch.Tensor, torch.Tensor], gated_delta_meta: GatedDeltaMeta): + def _load_state(self, past_key_value: tuple[torch.Tensor, torch.Tensor], gated_delta_meta: GatedDeltaMeta): """Load states from cache.""" return gated_delta_util.load_state(past_key_value=past_key_value, gated_delta_meta=gated_delta_meta) def forward( self, hidden_states: torch.Tensor, - past_key_value: Tuple[torch.Tensor, torch.Tensor], + past_key_value: tuple[torch.Tensor, torch.Tensor], gated_delta_meta: GatedDeltaMeta, ): """forward.""" @@ -268,8 +274,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -487,9 +493,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor], + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None, attn_metadata: Any, gated_delta_meta: GatedDeltaMeta, ): @@ -556,10 +562,10 @@ def forward( self, input_ids: torch.LongTensor, position_ids: torch.LongTensor, - past_key_values: List[torch.FloatTensor], + past_key_values: list[torch.FloatTensor], attn_metadata: Any, state_ids: torch.Tensor, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """Rewrite of LlamaModel.forward.""" @@ -632,7 +638,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, state_ids: torch.Tensor = None, @@ -655,8 +661,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -717,8 +723,8 @@ def fill_buffers_cudagraph(self, graph_meta: CudaGraphMeta, **kwargs): return new_inputs - def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter], - expert_params_mapping: List): + def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter], + expert_params_mapping: list): """Load weight experts.""" # load fused weights for (param_name, weight_name, expert_id, shard_id) in expert_params_mapping: @@ -732,7 +738,7 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di param = params_dict[name] load_weight(param, loaded_weight) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" def __skip_layers(name): diff --git a/lmdeploy/pytorch/models/qwen3_vl.py b/lmdeploy/pytorch/models/qwen3_vl.py index a6f694c6f2..227c1c71fe 100644 --- a/lmdeploy/pytorch/models/qwen3_vl.py +++ b/lmdeploy/pytorch/models/qwen3_vl.py @@ -1,7 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. +from collections.abc import Iterable from functools import lru_cache -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Any import numpy as np import torch @@ -116,14 +117,14 @@ def __init__(self, def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, mrope_position_ids: torch.LongTensor = None, # args for deepstack - visual_pos_masks: Optional[torch.Tensor] = None, - deepstack_visual_embeds: Optional[list[torch.Tensor]] = None, + visual_pos_masks: torch.Tensor | None = None, + deepstack_visual_embeds: list[torch.Tensor] | None = None, ): """visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*): @@ -279,7 +280,7 @@ def __init__( def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: Optional[torch.Tensor] = None) -> torch.Tensor: + rotary_pos_emb: torch.Tensor | None = None) -> torch.Tensor: hidden_states = hidden_states + self.attn( self.norm1(hidden_states), cu_seqlens=cu_seqlens, @@ -419,7 +420,7 @@ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: return rotary_pos_emb # copy from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/qwen3_vl.py#L474 - def fast_pos_embed_interpolate(self, grid_thw: List[List[int]]) -> torch.Tensor: + def fast_pos_embed_interpolate(self, grid_thw: list[list[int]]) -> torch.Tensor: num_grid_per_side = self.num_grid_per_side m_size = self.spatial_merge_size hidden_dim = self.pos_embed.embedding_dim @@ -549,7 +550,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, mrope_position_ids: torch.Tensor = None, @@ -609,8 +610,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -680,7 +681,7 @@ def rename_weight(cls, name: str) -> str: return name[len('model.'):] return name - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ @@ -827,8 +828,8 @@ def _update_model_meta_prefilling(self, context: StepContext): return new_model_metas def update_model_metas(self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None): """Update model meta.""" if context.is_decoding: @@ -841,4 +842,4 @@ def get_input_processor(self) -> BaseModelInputProcessor: return self.input_processor -InputMultiModalType = List[Dict[str, Any]] +InputMultiModalType = list[dict[str, Any]] diff --git a/lmdeploy/pytorch/models/qwen3_vl_moe.py b/lmdeploy/pytorch/models/qwen3_vl_moe.py index 5810ab9b11..f6c0410840 100644 --- a/lmdeploy/pytorch/models/qwen3_vl_moe.py +++ b/lmdeploy/pytorch/models/qwen3_vl_moe.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -35,14 +36,14 @@ def __init__(self, def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, mrope_position_ids: torch.LongTensor = None, # args for deepstack - visual_pos_masks: Optional[torch.Tensor] = None, - deepstack_visual_embeds: Optional[list[torch.Tensor]] = None, + visual_pos_masks: torch.Tensor | None = None, + deepstack_visual_embeds: list[torch.Tensor] | None = None, ): """visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*): @@ -136,8 +137,8 @@ def __init__( device=device, prefix=add_prefix('language_model', prefix)) - def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter], - expert_params_mapping: List): + def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter], + expert_params_mapping: list): """Load weight experts.""" for (param_name, weight_name, expert_id, shard_id) in expert_params_mapping: @@ -152,8 +153,8 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di load_weight(param, loaded_weight) # modify from vllm qwen3vlmoe fused expert loading - def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter], - fused_expert_params_mapping: List): + def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter], + fused_expert_params_mapping: list): """Load weight of fused expert weights.""" num_experts = self.config.text_config.num_experts @@ -176,7 +177,7 @@ def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, par for expert_id in range(num_experts): load_weight(param, w2[expert_id], expert_id=expert_id, shard_id='down') - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/sdar.py b/lmdeploy/pytorch/models/sdar.py index 6a624e40e4..7ed9016cae 100644 --- a/lmdeploy/pytorch/models/sdar.py +++ b/lmdeploy/pytorch/models/sdar.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -8,8 +9,13 @@ from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config -from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj, - build_rowwise_linear) +from lmdeploy.pytorch.nn.linear import ( + build_down_linear, + build_gateup_linear, + build_o_proj, + build_qkv_proj, + build_rowwise_linear, +) from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight from .utils.cudagraph import CudaGraphMixin @@ -68,8 +74,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -181,9 +187,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): @@ -238,10 +244,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """Rewrite of forward.""" @@ -315,7 +321,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -345,8 +351,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -372,7 +378,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/sdar_moe.py b/lmdeploy/pytorch/models/sdar_moe.py index 3573ea9500..a52fb42de9 100644 --- a/lmdeploy/pytorch/models/sdar_moe.py +++ b/lmdeploy/pytorch/models/sdar_moe.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -8,8 +9,13 @@ from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config -from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj, - build_rowwise_linear) +from lmdeploy.pytorch.nn.linear import ( + build_down_linear, + build_gateup_linear, + build_o_proj, + build_qkv_proj, + build_rowwise_linear, +) from lmdeploy.pytorch.nn.moe import SoftmaxTopK, build_fused_moe from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight @@ -69,8 +75,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -250,9 +256,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): @@ -307,10 +313,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """Rewrite of forward.""" @@ -384,7 +390,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -414,8 +420,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -441,8 +447,8 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter], - expert_params_mapping: List): + def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter], + expert_params_mapping: list): """Load weight experts.""" # load fused weights for (param_name, weight_name, expert_id, shard_id) in expert_params_mapping: @@ -456,7 +462,7 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di param = params_dict[name] load_weight(param, loaded_weight) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/siglip.py b/lmdeploy/pytorch/models/siglip.py index 274a6a10bb..2444b3f0e0 100644 --- a/lmdeploy/pytorch/models/siglip.py +++ b/lmdeploy/pytorch/models/siglip.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import math -from typing import Iterable, Set, Tuple, Union +from collections.abc import Iterable import torch from torch import nn @@ -206,7 +206,7 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, None]: + ) -> tuple[torch.Tensor, None]: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) hidden_states, _ = self.self_attn(hidden_states=hidden_states) @@ -242,7 +242,7 @@ def forward( self, inputs_embeds: torch.Tensor, **kwargs, - ) -> Union[torch.Tensor, list[torch.Tensor]]: + ) -> torch.Tensor | list[torch.Tensor]: hidden_states = inputs_embeds for encoder_layer in self.layers: @@ -357,7 +357,7 @@ def forward( interpolate_pos_encoding=interpolate_pos_encoding, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ('qkv_proj', 'q_proj', 'q'), @@ -365,7 +365,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: ('qkv_proj', 'v_proj', 'v'), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() layer_count = len(self.vision_model.encoder.layers) for name, loaded_weight in weights: diff --git a/lmdeploy/pytorch/models/starcoder2.py b/lmdeploy/pytorch/models/starcoder2.py index 509910a3e9..43c7c9bc6f 100644 --- a/lmdeploy/pytorch/models/starcoder2.py +++ b/lmdeploy/pytorch/models/starcoder2.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Iterable, List, Optional, Tuple +from collections.abc import Iterable +from typing import Any import torch from torch import nn @@ -62,8 +63,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[Tuple[torch.Tensor]] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: tuple[torch.Tensor] | None = None, attn_metadata: Any = None, ): """Rewrite of LlamaAttention.forward.""" @@ -171,9 +172,9 @@ def __init__(self, def forward( self, hidden_states: torch.Tensor, - rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor], - past_key_value: Optional[List[torch.FloatTensor]], - residual: Optional[torch.Tensor] = None, + rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor], + past_key_value: list[torch.FloatTensor] | None, + residual: torch.Tensor | None = None, attn_metadata: Any = None, ): if residual is None: @@ -227,10 +228,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device: def forward( self, input_ids: torch.LongTensor = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + position_ids: torch.LongTensor | None = None, + past_key_values: list[torch.FloatTensor] | None = None, attn_metadata: Any = None, - inputs_embeds: Optional[torch.FloatTensor] = None, + inputs_embeds: torch.FloatTensor | None = None, ): """Rewrite of LlamaModel.forward.""" @@ -299,7 +300,7 @@ def forward( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -328,8 +329,8 @@ def get_input_embeddings(self): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" @@ -355,7 +356,7 @@ def prepare_inputs_for_generation( inputs_embeds=inputs_embeds, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" # modify from vllm stacked_params_mapping = [ diff --git a/lmdeploy/pytorch/models/utils/cudagraph.py b/lmdeploy/pytorch/models/utils/cudagraph.py index 2b5a4dc8ad..7b93b9f633 100644 --- a/lmdeploy/pytorch/models/utils/cudagraph.py +++ b/lmdeploy/pytorch/models/utils/cudagraph.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from dataclasses import dataclass -from typing import Any, Dict, List, Optional +from typing import Any import torch from torch import Tensor @@ -8,7 +8,7 @@ from lmdeploy.pytorch.model_inputs import StepContext, get_step_ctx_manager -BuffType = Dict[str, Tensor] +BuffType = dict[str, Tensor] def _get_meta_flashattn( @@ -21,9 +21,9 @@ def _get_meta_flashattn( cache_seqlens: torch.Tensor, qkv_dtype=torch.bfloat16, headdim_v=None, - cu_seqlens_q: Optional[torch.Tensor] = None, - cu_seqlens_k_new: Optional[torch.Tensor] = None, - page_size: Optional[int] = None, + cu_seqlens_q: torch.Tensor | None = None, + cu_seqlens_k_new: torch.Tensor | None = None, + page_size: int | None = None, causal=True, window_size=(-1, -1), # -1 means infinite context window num_splits=0, @@ -77,7 +77,7 @@ class CudaGraphMeta: vocab_size: int = 1 use_mla_fp8_cache: bool = False use_flash_mla: bool = False - mla_index_topk: Optional[int] = None + mla_index_topk: int | None = None decode_query_len: int = 1 use_fa3_decoding: bool = False @@ -89,7 +89,7 @@ def support_cuda_graph( self, input_ids: torch.Tensor, position_ids: torch.Tensor, - past_key_values: List[List[torch.Tensor]], + past_key_values: list[list[torch.Tensor]], attn_metadata: Any = None, inputs_embeds: torch.Tensor = None, **kwargs, @@ -102,7 +102,7 @@ def make_output_buffers(self, output): if isinstance(output, torch.Tensor): output_buffers = dict(hidden_states=output) else: - assert isinstance(output, Dict) + assert isinstance(output, dict) output_buffers = output return output_buffers @@ -138,7 +138,7 @@ def update_meta_flashattn(self, graph_meta: CudaGraphMeta, block_size: int, max_ ) return scheduler_metadata - def make_buffers_cudagraph(self, graph_meta: CudaGraphMeta, *args, past_key_values: List, **kwargs) -> BuffType: + def make_buffers_cudagraph(self, graph_meta: CudaGraphMeta, *args, past_key_values: list, **kwargs) -> BuffType: """Make cudagraph buffers from forward inputs.""" max_batches = graph_meta.max_batchs max_tokens = graph_meta.max_tokens @@ -194,8 +194,8 @@ def make_buffers_cudagraph(self, graph_meta: CudaGraphMeta, *args, past_key_valu @record_function('fill_buffers_cudagraph') def fill_buffers_cudagraph(self, graph_meta: CudaGraphMeta, input_ids: Tensor, position_ids: Tensor, - past_key_values: List, attn_metadata: Any, inputs_embeds: Tensor, - **kwargs) -> Dict[str, Tensor]: + past_key_values: list, attn_metadata: Any, inputs_embeds: Tensor, + **kwargs) -> dict[str, Tensor]: """Fill cudagraph buffers from forward inputs.""" block_offsets: Tensor = attn_metadata.block_offsets @@ -293,7 +293,7 @@ def update_context_cudagraph(self, graph_meta: CudaGraphMeta, context: StepConte context.kv_seqlens = input_buffers['kv_seqlens'] context.q_start_loc = input_buffers['q_start_loc'] - def get_outputs_cudagraph(self, output_buffers: Dict[str, torch.Tensor], input_ids: Tensor, **kwargs): + def get_outputs_cudagraph(self, output_buffers: dict[str, torch.Tensor], input_ids: Tensor, **kwargs): """Get outputs from buffers.""" num_tokens = input_ids.size(-1) outputs = dict() diff --git a/lmdeploy/pytorch/models/utils/model.py b/lmdeploy/pytorch/models/utils/model.py index 7c63f3fea8..fe4a47802f 100644 --- a/lmdeploy/pytorch/models/utils/model.py +++ b/lmdeploy/pytorch/models/utils/model.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import functools -from typing import Iterable, List, Optional, Tuple +from collections.abc import Iterable import torch @@ -36,14 +36,14 @@ def forward(self, *args, **kwargs): def prepare_inputs_for_generation( self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None, ): """Prepare input.""" raise NotImplementedError('Not Implemented') - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): """Load weights.""" raise NotImplementedError('Not Implemented') @@ -61,8 +61,8 @@ def update_weights(self): pass def update_model_metas(self, - past_key_values: List[List[torch.Tensor]], - inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: list[list[torch.Tensor]], + inputs_embeds: torch.Tensor | None = None, context: StepContext = None): """Update model meta.""" return None @@ -132,8 +132,8 @@ def build_lm_head(self, hidden_size: int, vocab_size: int, bias: bool = False, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, **kwargs): """Build LM Head.""" bm_ctx = get_build_model_context() diff --git a/lmdeploy/pytorch/models/utils/multimodal.py b/lmdeploy/pytorch/models/utils/multimodal.py index 699f88021f..34a7e0de7e 100644 --- a/lmdeploy/pytorch/models/utils/multimodal.py +++ b/lmdeploy/pytorch/models/utils/multimodal.py @@ -1,9 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import List, Tuple from lmdeploy.pytorch.multimodal.data_type import MultiModalInputs -PreparedInputs = Tuple[List[int], MultiModalInputs] +PreparedInputs = tuple[list[int], MultiModalInputs] class MultiModalMixin: diff --git a/lmdeploy/pytorch/multimodal/data_type.py b/lmdeploy/pytorch/multimodal/data_type.py index dd3ec9a37d..c379984658 100644 --- a/lmdeploy/pytorch/multimodal/data_type.py +++ b/lmdeploy/pytorch/multimodal/data_type.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from dataclasses import dataclass, fields -from typing import Any, Dict, List, Union +from typing import Any from torch import Tensor @@ -9,9 +9,9 @@ class MultiModalData: pass -MultiModalDataList = List[MultiModalData] +MultiModalDataList = list[MultiModalData] -NestedTensor = Union[Tensor, List[Tensor]] +NestedTensor = Tensor | list[Tensor] @dataclass @@ -20,7 +20,7 @@ class MultiModalTensor: start: int end: int = None encoder_len: int = None - meta: Dict[str, Any] = None + meta: dict[str, Any] = None def __post_init__(self): if self.end is None: @@ -56,4 +56,4 @@ def to_device(self, device: str, non_blocking: bool = False): return MultiModalTensor(**out_dict) -MultiModalInputs = Dict[str, List[MultiModalTensor]] +MultiModalInputs = dict[str, list[MultiModalTensor]] diff --git a/lmdeploy/pytorch/multimodal/image_type.py b/lmdeploy/pytorch/multimodal/image_type.py index 19211a381f..0d9664d2b9 100644 --- a/lmdeploy/pytorch/multimodal/image_type.py +++ b/lmdeploy/pytorch/multimodal/image_type.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from dataclasses import dataclass -from typing import Any, ClassVar, Dict +from typing import Any, ClassVar from PIL import Image @@ -11,5 +11,5 @@ class ImageData(MultiModalData): data: Image loc: int - meta: Dict[str, Any] = None + meta: dict[str, Any] = None type: ClassVar[str] = 'image' diff --git a/lmdeploy/pytorch/nn/__init__.py b/lmdeploy/pytorch/nn/__init__.py index 2c89fac7c4..bc40334f35 100644 --- a/lmdeploy/pytorch/nn/__init__.py +++ b/lmdeploy/pytorch/nn/__init__.py @@ -5,9 +5,11 @@ from .attention import Attention, FlashAttention # noqa: F401 from .embedding import ParallelEmbedding # noqa: F401 from .norm import LayerNorm, RMSNorm # noqa: F401 -from .rotary_embedding import ApplyRotaryEmb # noqa: F401 -from .rotary_embedding import RopeType # noqa: F401 -from .rotary_embedding import YarnParameters # noqa: F401 -from .rotary_embedding import build_rotary_embedding # noqa: F401 -from .rotary_embedding import build_rotary_embedding_from_config # noqa: F401 -from .rotary_embedding import build_rotary_params # noqa: F401 +from .rotary_embedding import ( + ApplyRotaryEmb, # noqa: F401 + RopeType, # noqa: F401 + YarnParameters, # noqa: F401 + build_rotary_embedding, # noqa: F401 + build_rotary_embedding_from_config, # noqa: F401 + build_rotary_params, # noqa: F401 +) diff --git a/lmdeploy/pytorch/nn/gated_delta.py b/lmdeploy/pytorch/nn/gated_delta.py index c61dcab6b5..e15be9c89f 100644 --- a/lmdeploy/pytorch/nn/gated_delta.py +++ b/lmdeploy/pytorch/nn/gated_delta.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Sequence, Tuple +from collections.abc import Sequence +from typing import Any import torch from torch import nn @@ -198,7 +199,7 @@ def __init__( self, in_channels: int, out_channels: int, - kernel_size: int | Tuple[int], + kernel_size: int | tuple[int], split: Sequence[int], groups: int = 1, bias: bool = True, @@ -232,7 +233,7 @@ def __init__( def make_weight( in_channels: int, out_channels: int, - kernel_size: int | Tuple[int], + kernel_size: int | tuple[int], groups: int = 1, bias: bool = True, device: str | torch.device | None = None, @@ -273,6 +274,6 @@ def forward(self, x: torch.Tensor, conv_state: torch.Tensor, gated_delta_meta: G @record_function('gated_delta_load_state') -def load_state(past_key_value: Tuple[torch.Tensor, torch.Tensor], gated_delta_meta: GatedDeltaMeta): +def load_state(past_key_value: tuple[torch.Tensor, torch.Tensor], gated_delta_meta: GatedDeltaMeta): """Load states from cache.""" return past_key_value[:2] diff --git a/lmdeploy/pytorch/nn/linear/__init__.py b/lmdeploy/pytorch/nn/linear/__init__.py index 7fda2087bf..3dd3df995b 100644 --- a/lmdeploy/pytorch/nn/linear/__init__.py +++ b/lmdeploy/pytorch/nn/linear/__init__.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, List, Optional +from typing import Any import torch from torch import nn @@ -19,11 +19,11 @@ def build_linear( in_features: int, out_features: int, bias: bool, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, colwise: bool = True, is_tp: bool = False, - quant_config: Dict = None, + quant_config: dict = None, all_reduce: bool = True, tp_align_size: int = 1, dp_gather: bool = False, @@ -104,11 +104,11 @@ def build_colwise_linear( in_features: int, out_features: int, bias: bool, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, is_tp: bool = False, tp_align_size: int = 1, - quant_config: Dict = None, + quant_config: dict = None, dp_disable_tp: bool = False, dp_gather: bool = False, check_dist: bool = True, @@ -148,11 +148,11 @@ def build_rowwise_linear( in_features: int, out_features: int, bias: bool, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, is_tp: bool = False, tp_align_size: int = 1, - quant_config: Dict = None, + quant_config: dict = None, all_reduce: bool = True, dp_disable_tp: bool = False, check_dist: bool = True, @@ -183,13 +183,13 @@ def build_rowwise_linear( def build_merged_colwise_linear( in_features: int, - all_out_features: List[int], + all_out_features: list[int], bias: bool, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, - quant_config: Dict = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + quant_config: dict = None, is_tp: bool = True, - out_names: List[Any] = None, + out_names: list[Any] = None, dp_gather: bool = False, check_dist: bool = True, layer_type: str = 'attn', @@ -261,9 +261,9 @@ def build_qkv_proj(in_features: int, head_size: int, head_size_v: int = None, bias: bool = False, - quant_config: Dict = None, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + quant_config: dict = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, is_tp: bool = True, num_replicate_kv_heads: int = 1, prefix: str = ''): @@ -335,11 +335,11 @@ def build_o_proj( in_features: int, out_features: int, bias: bool, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, is_tp: bool = False, tp_align_size: int = 1, - quant_config: Dict = None, + quant_config: dict = None, all_reduce: bool = True, prefix: str = '', ) -> nn.Module: @@ -365,13 +365,13 @@ def build_o_proj( def build_gateup_linear( in_features: int, - all_out_features: List[int], + all_out_features: list[int], bias: bool, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, - quant_config: Dict = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, + quant_config: dict = None, is_tp: bool = True, - out_names: List[Any] = None, + out_names: list[Any] = None, dp_gather: bool = True, prefix: str = '', ): @@ -401,11 +401,11 @@ def build_down_linear( in_features: int, out_features: int, bias: bool, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, is_tp: bool = False, tp_align_size: int = 1, - quant_config: Dict = None, + quant_config: dict = None, all_reduce: bool = True, prefix: str = '', ) -> nn.Module: diff --git a/lmdeploy/pytorch/nn/linear/awq.py b/lmdeploy/pytorch/nn/linear/awq.py index 5e24d93db7..aa6303ce46 100644 --- a/lmdeploy/pytorch/nn/linear/awq.py +++ b/lmdeploy/pytorch/nn/linear/awq.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, List, Optional +from typing import Any import torch @@ -21,7 +21,7 @@ def __init__( w_bit: int, group_size: int, bias: bool, - device: Optional[torch.device] = None, + device: torch.device | None = None, colwise: bool = True, is_tp: bool = False, all_reduce: bool = True, @@ -68,7 +68,7 @@ def register_all_parameters(self, qweight: torch.Tensor, scales: torch.Tensor, qzeros: torch.Tensor, - bias: Optional[torch.Tensor] = None): + bias: torch.Tensor | None = None): """Register all parameters.""" qweight = torch.nn.Parameter(qweight, requires_grad=False) scales = torch.nn.Parameter(scales, requires_grad=False) @@ -173,13 +173,13 @@ class MergedAwqLinear(AwqLinear): def __init__(self, in_features: int, - all_out_features: List[int], + all_out_features: list[int], w_bit: int, group_size: int, bias: bool, - device: Optional[torch.device] = None, + device: torch.device | None = None, is_tp: bool = True, - out_names: Optional[List[int]] = None, + out_names: list[int] | None = None, layer_type: str = 'attn'): self.init_tp_args(is_tp, all_reduce=False, colwise=True, layer_type=layer_type) @@ -225,7 +225,7 @@ def _get_io_features(self, in_features: int, out_features: int, w_bit: int, grou """Get io features.""" return in_features, out_features - def _update_all_out_features(self, all_out_features: List[int], w_bit: int, group_size: int): + def _update_all_out_features(self, all_out_features: list[int], w_bit: int, group_size: int): """Update all out features.""" world_size, rank = self.get_tp_world_rank() new_all_out_features = [] @@ -280,7 +280,7 @@ def __init__(self, w_bit: int, group_size: int, bias: bool = False, - device: Optional[torch.device] = None, + device: torch.device | None = None, is_tp: bool = True, num_replicate_kv_heads: int = 1): self.init_tp_args(is_tp, all_reduce=False, colwise=True, layer_type='attn') @@ -309,7 +309,7 @@ def __init__(self, out_names=out_names, layer_type='attn') - def _update_all_out_features(self, all_out_features: List[int], w_bit: int, group_size: int): + def _update_all_out_features(self, all_out_features: list[int], w_bit: int, group_size: int): """Update all out features.""" return all_out_features diff --git a/lmdeploy/pytorch/nn/linear/base.py b/lmdeploy/pytorch/nn/linear/base.py index 53bd6f6083..e1bff38cf9 100644 --- a/lmdeploy/pytorch/nn/linear/base.py +++ b/lmdeploy/pytorch/nn/linear/base.py @@ -1,13 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Callable, List, Optional +from collections.abc import Callable import torch import torch.distributed as dist from torch import nn from lmdeploy.pytorch.config import TPMode -from lmdeploy.pytorch.distributed import (gather_by_tp_sizes, get_dist_group, get_dist_manager, get_tp_world_rank, - reduce_scatter_by_tp_sizes) +from lmdeploy.pytorch.distributed import ( + gather_by_tp_sizes, + get_dist_group, + get_dist_manager, + get_tp_world_rank, + reduce_scatter_by_tp_sizes, +) from lmdeploy.pytorch.model_inputs import get_step_ctx_manager from .utils import update_tp_args @@ -30,12 +35,12 @@ def __init__(self, gemm_func: Callable, max_tokens_per_round: int = 8192): self.tp_group = tp_group.gpu_group self.max_tokens_per_round = max_tokens_per_round * self.attn_tp // self.tp // 2 - def all_gather(self, hidden_states: torch.Tensor, tp_sizes: List[int]): + def all_gather(self, hidden_states: torch.Tensor, tp_sizes: list[int]): """All gather.""" hidden_states, handle = dist.gather_by_tp_sizes(hidden_states, tp_sizes, group=self.gather_group, async_op=True) return hidden_states, handle - def reduce_scatter(self, hidden_states: torch.Tensor, out_states: torch.Tensor, tp_sizes: List[int]): + def reduce_scatter(self, hidden_states: torch.Tensor, out_states: torch.Tensor, tp_sizes: list[int]): """Reduce scatter.""" hidden_states_list = list(hidden_states.split(tp_sizes, -2)) cur_out_states = hidden_states_list[self.gather_rank] @@ -45,7 +50,7 @@ def reduce_scatter(self, hidden_states: torch.Tensor, out_states: torch.Tensor, handle = dist.reduce_scatter(out_states, hidden_states_list, group=self.tp_group, async_op=True) return out_states, handle - def _gemm_and_reduce_scatter(self, hidden_states: torch.Tensor, output_states: torch.Tensor, tp_sizes: List[int], + def _gemm_and_reduce_scatter(self, hidden_states: torch.Tensor, output_states: torch.Tensor, tp_sizes: list[int], handle: dist.Work): """Gemm and reduce scatter.""" handle.wait() @@ -108,8 +113,8 @@ class LinearBase(nn.Module): def __init__( self, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, colwise: bool = True, is_tp: bool = False, all_reduce: bool = True, @@ -177,11 +182,11 @@ def update_weights(self): """Update weights.""" raise NotImplementedError('This method should be implemented in subclasses.') - def _forward_default(self, x, all_reduce: bool, tp_sizes: List[int]): + def _forward_default(self, x, all_reduce: bool, tp_sizes: list[int]): """Default forward implement.""" raise NotImplementedError('This method should be implemented in subclasses.') - def _forward_lora(self, x, tp_sizes: List[int] = None): + def _forward_lora(self, x, tp_sizes: list[int] = None): """Forward with LoRA.""" out = self._forward_default(x, False, tp_sizes) diff --git a/lmdeploy/pytorch/nn/linear/blocked_fp8.py b/lmdeploy/pytorch/nn/linear/blocked_fp8.py index 04d3c03e1e..bcadeef79c 100644 --- a/lmdeploy/pytorch/nn/linear/blocked_fp8.py +++ b/lmdeploy/pytorch/nn/linear/blocked_fp8.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, List, Optional +from typing import Any import torch @@ -21,10 +21,10 @@ def __init__( in_features: int, out_features: int, bias: bool, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, fp8_dtype: torch.dtype = torch.float8_e4m3fn, - scale_fmt: Optional[str] = None, + scale_fmt: str | None = None, colwise: bool = True, is_tp: bool = False, all_reduce: bool = True, @@ -66,7 +66,7 @@ def setup_loaders(self): def register_all_parameters(self, weight: torch.Tensor, weight_scale_inv: torch.Tensor, - bias: Optional[torch.Tensor] = None): + bias: torch.Tensor | None = None): """Register all parameters.""" weight = torch.nn.Parameter(weight, requires_grad=False) weight_scale_inv = torch.nn.Parameter(weight_scale_inv, requires_grad=False) @@ -167,15 +167,15 @@ class MergedBlockedF8Linear(BlockedF8Linear): def __init__(self, in_features: int, - all_out_features: List[int], + all_out_features: list[int], bias: bool, fp8_dtype: torch.dtype = torch.float8_e4m3fn, - scale_fmt: Optional[str] = None, - replicate: Optional[List[bool]] = None, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + scale_fmt: str | None = None, + replicate: list[bool] | None = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, is_tp: bool = True, - out_names: Optional[List[int]] = None, + out_names: list[int] | None = None, dp_gather: bool = False, layer_type: str = 'attn'): self.init_tp_args(is_tp, all_reduce=False, colwise=True, layer_type=layer_type) @@ -222,7 +222,7 @@ def _get_io_features(self, in_features: int, out_features: int, colwise: bool): """Get io features.""" return in_features, out_features - def _update_all_out_features(self, all_out_features: List[int], replicate: Optional[List[bool]]): + def _update_all_out_features(self, all_out_features: list[int], replicate: list[bool] | None): """Update all out features.""" world_size, rank = self.get_tp_world_rank() new_all_out_features = [] @@ -281,9 +281,9 @@ def __init__(self, head_size_v: int, bias: bool = False, fp8_dtype: torch.dtype = torch.float8_e4m3fn, - scale_fmt: Optional[str] = None, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + scale_fmt: str | None = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, is_tp: bool = True, dp_gather: bool = False, num_replicate_kv_heads: int = 1): @@ -313,7 +313,7 @@ def __init__(self, dp_gather=dp_gather, layer_type='attn') - def _update_all_out_features(self, all_out_features: List[int], replicate: Optional[List[bool]]): + def _update_all_out_features(self, all_out_features: list[int], replicate: list[bool] | None): """Update all out features.""" return all_out_features diff --git a/lmdeploy/pytorch/nn/linear/default.py b/lmdeploy/pytorch/nn/linear/default.py index a3f8a31a2c..e17f50d76b 100644 --- a/lmdeploy/pytorch/nn/linear/default.py +++ b/lmdeploy/pytorch/nn/linear/default.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, List, Optional +from typing import Any import torch @@ -20,8 +20,8 @@ def __init__( in_features: int, out_features: int, bias: bool, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, colwise: bool = True, is_tp: bool = False, all_reduce: bool = True, @@ -53,7 +53,7 @@ def setup_loaders(self): if self.bias is not None: self.bias.weight_loader = self.weight_loader - def register_all_parameters(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = None): + def register_all_parameters(self, weight: torch.Tensor, bias: torch.Tensor | None = None): """Register all parameters.""" weight = torch.nn.Parameter(weight, requires_grad=False) if bias is not None: @@ -135,12 +135,12 @@ class MergedBaseLinear(BaseLinear): def __init__(self, in_features: int, - all_out_features: List[int], + all_out_features: list[int], bias: bool, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, is_tp: bool = True, - out_names: Optional[List[int]] = None, + out_names: list[int] | None = None, dp_gather: bool = False, layer_type: str = 'attn'): self.init_tp_args(is_tp, all_reduce=False, colwise=True, layer_type=layer_type) @@ -175,7 +175,7 @@ def _get_io_features(self, in_features: int, out_features: int, colwise: bool): """Get io features.""" return in_features, out_features - def _update_all_out_features(self, all_out_features: List[int]): + def _update_all_out_features(self, all_out_features: list[int]): """Update all out features.""" world_size, rank = self.get_tp_world_rank() new_all_out_features = [] @@ -210,8 +210,8 @@ def __init__(self, head_size: int, head_size_v: int, bias: bool = False, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, is_tp: bool = True, num_replicate_kv_heads: int = 1): self.init_tp_args(is_tp, all_reduce=False, colwise=True, layer_type='attn') @@ -236,7 +236,7 @@ def __init__(self, out_names=out_names, layer_type='attn') - def _update_all_out_features(self, all_out_features: List[int]): + def _update_all_out_features(self, all_out_features: list[int]): """Update all out features.""" return all_out_features diff --git a/lmdeploy/pytorch/nn/linear/w8a8.py b/lmdeploy/pytorch/nn/linear/w8a8.py index c9105e5599..ad4ec74d73 100644 --- a/lmdeploy/pytorch/nn/linear/w8a8.py +++ b/lmdeploy/pytorch/nn/linear/w8a8.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, List, Optional +from typing import Any import torch @@ -18,12 +18,12 @@ def __init__(self, in_features: int, out_features: int, bias: bool, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, colwise: bool = True, is_tp: bool = False, all_reduce: bool = True, - quant_dtype: Optional[torch.dtype] = torch.int8, + quant_dtype: torch.dtype | None = torch.int8, layer_type: str = 'attn'): super().__init__(dtype=torch.float16, device=device, @@ -53,7 +53,7 @@ def setup_loaders(self): if self.bias is not None: self.bias.weight_loader = self.weight_loader - def register_all_parameters(self, weight: torch.Tensor, scale: torch.Tensor, bias: Optional[torch.Tensor] = None): + def register_all_parameters(self, weight: torch.Tensor, scale: torch.Tensor, bias: torch.Tensor | None = None): """Register all parameters.""" weight = torch.nn.Parameter(weight, requires_grad=False) scale = torch.nn.Parameter(scale, requires_grad=False) @@ -131,12 +131,12 @@ class MergedW8A8Linear(W8A8Linear): def __init__(self, in_features: int, - all_out_features: List[int], + all_out_features: list[int], bias: bool, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, is_tp: bool = True, - out_names: Optional[List[int]] = None, + out_names: list[int] | None = None, quant_dtype: torch.dtype = torch.int8, layer_type: str = 'attn'): self.init_tp_args(is_tp, all_reduce=False, colwise=True, layer_type=layer_type) @@ -173,7 +173,7 @@ def _get_io_features(self, in_features: int, out_features: int, colwise: bool): """Get io features.""" return in_features, out_features - def _update_all_out_features(self, all_out_features: List[int]): + def _update_all_out_features(self, all_out_features: list[int]): """Update all out features.""" world_size, rank = self.get_tp_world_rank() new_all_out_features = [] @@ -208,8 +208,8 @@ def __init__(self, head_size: int, head_size_v: int, bias: bool = False, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, is_tp: bool = True, num_replicate_kv_heads: int = 1, quant_dtype: torch.dtype = torch.int8): @@ -236,7 +236,7 @@ def __init__(self, quant_dtype=quant_dtype, layer_type='attn') - def _update_all_out_features(self, all_out_features: List[int]): + def _update_all_out_features(self, all_out_features: list[int]): """Update all out features.""" return all_out_features diff --git a/lmdeploy/pytorch/nn/moe/__init__.py b/lmdeploy/pytorch/nn/moe/__init__.py index cb8725a581..f0e1fe103c 100644 --- a/lmdeploy/pytorch/nn/moe/__init__.py +++ b/lmdeploy/pytorch/nn/moe/__init__.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Callable, Dict, Optional +from collections.abc import Callable import torch @@ -15,11 +15,11 @@ def build_fused_moe( top_k: int, bias: bool = False, renormalize: bool = False, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, all_reduce: bool = True, enable_ep: bool = False, - quant_config: Dict = None, + quant_config: dict = None, layer_idx: int = 0, act_func: Callable = None, prefix: str = '', diff --git a/lmdeploy/pytorch/nn/moe/base.py b/lmdeploy/pytorch/nn/moe/base.py index 484dbbe492..76f1927d46 100644 --- a/lmdeploy/pytorch/nn/moe/base.py +++ b/lmdeploy/pytorch/nn/moe/base.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. +from collections.abc import Callable from dataclasses import dataclass from enum import Enum, auto -from typing import Callable, Dict, List, Optional import torch import torch.nn as nn @@ -51,7 +51,7 @@ def split_size(size: int, world_size: int, align: int): return split_size -def moe_gather_inputs(hidden_states, topk_weights, topk_ids, group: Optional[dist.ProcessGroup] = None): +def moe_gather_inputs(hidden_states, topk_weights, topk_ids, group: dist.ProcessGroup | None = None): dist_config = get_dist_manager().current_config() tp = dist_config.moe_tp if tp == 1: @@ -73,7 +73,7 @@ def moe_gather_inputs(hidden_states, topk_weights, topk_ids, group: Optional[dis return hidden_states, topk_weights, topk_ids -def moe_reduce(ret, rank: int, tp_mode: TPMode, group: Optional[dist.ProcessGroup] = None): +def moe_reduce(ret, rank: int, tp_mode: TPMode, group: dist.ProcessGroup | None = None): dist_config = get_dist_manager().current_config() if dist_config.moe_tp == 1: return ret @@ -109,14 +109,14 @@ def __init__(self, gemm_func: Callable, max_tokens_per_round: int = 8192): self.max_tokens_per_round = max_tokens_per_round * self.attn_tp // self.tp def all_gather(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - tp_sizes: List[int]): + tp_sizes: list[int]): """All gather.""" hidden_states, h0 = dist.gather_by_tp_sizes(hidden_states, tp_sizes, group=self.gather_group, async_op=True) topk_weights, h1 = dist.gather_by_tp_sizes(topk_weights, tp_sizes, group=self.gather_group, async_op=True) topk_ids, h2 = dist.gather_by_tp_sizes(topk_ids, tp_sizes, group=self.gather_group, async_op=True) return hidden_states, topk_weights, topk_ids, (h0, h1, h2) - def reduce_scatter(self, hidden_states: torch.Tensor, out_states: torch.Tensor, tp_sizes: List[int]): + def reduce_scatter(self, hidden_states: torch.Tensor, out_states: torch.Tensor, tp_sizes: list[int]): """Reduce scatter.""" hidden_states_list = list(hidden_states.split(tp_sizes, -2)) cur_out_states = hidden_states_list[self.gather_rank] @@ -127,7 +127,7 @@ def reduce_scatter(self, hidden_states: torch.Tensor, out_states: torch.Tensor, return out_states, handle def _gemm_and_reduce_scatter(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, - output_states: torch.Tensor, tp_sizes: List[int], handles: List[dist.Work]): + output_states: torch.Tensor, tp_sizes: list[int], handles: list[dist.Work]): """Gemm and reduce scatter.""" for handle in handles: handle.wait() @@ -210,7 +210,7 @@ class DispatchInputs: moe_type: MoeType = MoeType.Default @classmethod - def from_dict(cls, input: Dict): + def from_dict(cls, input: dict): """From dict.""" assert ['hidden_states', 'topk_weights', 'topk_idx'] in input moe_type = input.get('moe_type', MoeType.Default) @@ -221,7 +221,7 @@ def from_dict(cls, input: Dict): moe_type=moe_type, ) - def to_dict(self) -> Dict: + def to_dict(self) -> dict: """To dict.""" return { 'hidden_states': self.hidden_states, @@ -275,19 +275,19 @@ def before_dispatch(self, state: DispatchInputs): """Before dispatch.""" raise NotImplementedError - def dispatch(self, state: Dict): + def dispatch(self, state: dict): """dispatch.""" raise NotImplementedError - def gemm(self, state: Dict): + def gemm(self, state: dict): """gemm.""" raise NotImplementedError - def combine(self, state: Dict): + def combine(self, state: dict): """combine.""" raise NotImplementedError - def wait(self, state: Dict): + def wait(self, state: dict): """wait.""" raise NotImplementedError diff --git a/lmdeploy/pytorch/nn/moe/blocked_fp8.py b/lmdeploy/pytorch/nn/moe/blocked_fp8.py index 8880b72571..807833212e 100644 --- a/lmdeploy/pytorch/nn/moe/blocked_fp8.py +++ b/lmdeploy/pytorch/nn/moe/blocked_fp8.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Callable, Dict, List, Optional +from collections.abc import Callable import torch @@ -25,8 +25,8 @@ def __init__(self, dtype: torch.dtype, device: torch.device, bias: bool = False, - expert_list: List[int] = None, - scale_fmt: Optional[str] = None): + expert_list: list[int] = None, + scale_fmt: str | None = None): super().__init__(num_experts=num_experts, in_features=in_features, out_features=out_features, @@ -150,9 +150,9 @@ def __init__(self, bias: bool = False, renormalize: bool = False, fp8_dtype: torch.dtype = torch.float8_e4m3fn, - scale_fmt: Optional[str] = None, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + scale_fmt: str | None = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, all_reduce: bool = True, layer_idx: int = 0, act_func: Callable = None): @@ -239,7 +239,7 @@ def update_weights(self): def before_dispatch(self, state: DispatchInputs): """Before dispatch.""" - if not isinstance(state, Dict): + if not isinstance(state, dict): state = state.to_dict() moe_type = state['moe_type'] @@ -252,7 +252,7 @@ def before_dispatch(self, state: DispatchInputs): state['previous_event'] = previous_event return state - def dispatch(self, state: Dict): + def dispatch(self, state: dict): moe_type = state['moe_type'] if moe_type == MoeType.DSAsyncPrefill: fusedmoe = state['fusedmoe'] @@ -315,7 +315,7 @@ def dispatch(self, state: Dict): } return recv_state - def gemm(self, state: Dict): + def gemm(self, state: dict): moe_type = state['moe_type'] if moe_type == MoeType.DSAsyncPrefill: if (state['recv_hidden_states'][0] @@ -364,7 +364,7 @@ def gemm(self, state: Dict): gemm_state = {'hidden_states': hidden_states, 'moe_type': state['moe_type']} return gemm_state - def combine(self, state: Dict): + def combine(self, state: dict): moe_type = state['moe_type'] if moe_type == MoeType.DSAsyncPrefill: fusedmoe = state['fusedmoe'] diff --git a/lmdeploy/pytorch/nn/moe/default.py b/lmdeploy/pytorch/nn/moe/default.py index 0633aa001a..efb5f4483c 100644 --- a/lmdeploy/pytorch/nn/moe/default.py +++ b/lmdeploy/pytorch/nn/moe/default.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from collections import defaultdict -from typing import Callable, Dict, List, Optional +from collections.abc import Callable import torch from torch import nn @@ -22,7 +22,7 @@ def __init__(self, dtype: torch.dtype, device: torch.device, bias: bool = False, - expert_list: Optional[List[int]] = None): + expert_list: list[int] | None = None): super().__init__() weight = torch.empty((num_experts, out_features, in_features), dtype=dtype, device=device) weight = torch.nn.Parameter(weight, requires_grad=False) @@ -115,8 +115,8 @@ def __init__(self, top_k: int, bias: bool = False, renormalize: bool = False, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, all_reduce: bool = True, layer_idx: int = 0, act_func: Callable = None): @@ -188,7 +188,7 @@ def update_weights(self): def before_dispatch(self, state: DispatchInputs): """Before dispatch.""" - if not isinstance(state, Dict): + if not isinstance(state, dict): state = state.to_dict() moe_type = state['moe_type'] @@ -199,7 +199,7 @@ def before_dispatch(self, state: DispatchInputs): state['previous_event'] = previous_event return state - def dispatch(self, state: Dict): + def dispatch(self, state: dict): """dispatch.""" moe_type = state['moe_type'] if moe_type == MoeType.DSAsyncPrefill: @@ -265,7 +265,7 @@ def dispatch(self, state: Dict): raise NotImplementedError(f'Not supported moe type: {moe_type}') return recv_state - def gemm(self, state: Dict): + def gemm(self, state: dict): """gemm.""" moe_type = state['moe_type'] if moe_type == MoeType.DSAsyncPrefill: @@ -311,7 +311,7 @@ def gemm(self, state: Dict): gemm_state = {'hidden_states': hidden_states, 'moe_type': state['moe_type']} return gemm_state - def combine(self, state: Dict): + def combine(self, state: dict): """combine.""" moe_type = state['moe_type'] if moe_type == MoeType.DSAsyncPrefill: @@ -355,7 +355,7 @@ def combine(self, state: Dict): raise NotImplementedError(f'Not supported moe type: {moe_type}') return out_state - def wait(self, state: Dict): + def wait(self, state: dict): """wait.""" if state.get('event', None) is not None: state['fusedmoe'].wait(state['event']) diff --git a/lmdeploy/pytorch/nn/moe/route.py b/lmdeploy/pytorch/nn/moe/route.py index 320f483149..f71fa5a55c 100644 --- a/lmdeploy/pytorch/nn/moe/route.py +++ b/lmdeploy/pytorch/nn/moe/route.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Tuple import torch @@ -34,6 +33,6 @@ def __init__( ) def forward(self, router_logits: torch.Tensor, - e_score_correction_bias: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + e_score_correction_bias: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: """Router forward.""" return self.impl.forward(router_logits, e_score_correction_bias) diff --git a/lmdeploy/pytorch/nn/moe/w8a8.py b/lmdeploy/pytorch/nn/moe/w8a8.py index cf31b98610..62ca258bde 100644 --- a/lmdeploy/pytorch/nn/moe/w8a8.py +++ b/lmdeploy/pytorch/nn/moe/w8a8.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, List, Optional import torch @@ -19,7 +18,7 @@ def __init__(self, out_features: int, weight_type: str, device: torch.device, - expert_list: List[int] = None, + expert_list: list[int] = None, quant_dtype: torch.dtype = torch.int8): super().__init__( num_experts=num_experts, @@ -75,9 +74,9 @@ def __init__(self, num_experts: int, top_k: int, renormalize: bool = False, - dtype: Optional[torch.dtype] = None, - quant_dtype: Optional[torch.dtype] = torch.int8, - device: Optional[torch.device] = None, + dtype: torch.dtype | None = None, + quant_dtype: torch.dtype | None = torch.int8, + device: torch.device | None = None, all_reduce: bool = True): device = device or torch.device('cpu') @@ -133,7 +132,7 @@ def update_weights(self): self.gate_up.update_weight(gate_up_weights, gate_up_scale) self.down.update_weight(down_weights, down_scale) - def dispatch(self, state: Dict): + def dispatch(self, state: dict): """dispatch.""" moe_type = state['moe_type'] if moe_type == MoeType.Default: @@ -151,7 +150,7 @@ def dispatch(self, state: Dict): raise NotImplementedError(f'Not supported moe type: {moe_type}') return recv_state - def gemm(self, state: Dict): + def gemm(self, state: dict): """gemm.""" hidden_states = state['hidden_states'] topk_weights = state['topk_weights'] @@ -161,7 +160,7 @@ def gemm(self, state: Dict): self.down.weight, self.down.scale, self.expert_list) return dict(hidden_states=ret, moe_type=state['moe_type']) - def combine(self, state: Dict): + def combine(self, state: dict): """combine.""" moe_type = state['moe_type'] if moe_type == MoeType.Default: @@ -175,6 +174,6 @@ def combine(self, state: Dict): raise NotImplementedError(f'Not supported moe type: {moe_type}') return out_state - def wait(self, state: Dict): + def wait(self, state: dict): """wait.""" raise NotImplementedError diff --git a/lmdeploy/pytorch/nn/norm.py b/lmdeploy/pytorch/nn/norm.py index 7e39ed4312..770493f9ab 100644 --- a/lmdeploy/pytorch/nn/norm.py +++ b/lmdeploy/pytorch/nn/norm.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict import torch from torch import nn @@ -20,7 +19,7 @@ def __init__( eps: float = 1e-6, dtype: torch.dtype | None = None, device: torch.device | None = None, - quant_config: Dict | None = None, + quant_config: dict | None = None, tp: bool = False, align: int = 1, prefix: str = '', diff --git a/lmdeploy/pytorch/nn/rotary_embedding.py b/lmdeploy/pytorch/nn/rotary_embedding.py index 740a39ed2e..3fa8cfda81 100644 --- a/lmdeploy/pytorch/nn/rotary_embedding.py +++ b/lmdeploy/pytorch/nn/rotary_embedding.py @@ -7,8 +7,13 @@ from transformers import PretrainedConfig from ..backends import OpType, get_backend -from ..backends.rotary_embedding import (FopeParameters, Llama3Parameters, LongRoPEScalingParameters, RopeType, - YarnParameters) +from ..backends.rotary_embedding import ( + FopeParameters, + Llama3Parameters, + LongRoPEScalingParameters, + RopeType, + YarnParameters, +) def get_rope_parameters(config: PretrainedConfig): diff --git a/lmdeploy/pytorch/paging/block_manager/base_block_manager.py b/lmdeploy/pytorch/paging/block_manager/base_block_manager.py index 1aeeaed031..d1f59b66a6 100644 --- a/lmdeploy/pytorch/paging/block_manager/base_block_manager.py +++ b/lmdeploy/pytorch/paging/block_manager/base_block_manager.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. import time -from typing import Dict import numpy as np @@ -213,7 +212,7 @@ def __init__(self, num_gpu_blocks: int, num_cpu_blocks: int, num_gpu_reserved: i self.allocator = LogicalAllocator(num_cpu_blocks, num_gpu_blocks, num_gpu_reserved) - self.block_tables: Dict[int, BlockTable] = {} + self.block_tables: dict[int, BlockTable] = {} @classmethod def num_required_blocks(cls, obj: SchedulerSequence, prealloc_size: int = 0): diff --git a/lmdeploy/pytorch/paging/block_trie.py b/lmdeploy/pytorch/paging/block_trie.py index 2244f3f1b5..d20aa665d2 100644 --- a/lmdeploy/pytorch/paging/block_trie.py +++ b/lmdeploy/pytorch/paging/block_trie.py @@ -1,7 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import heapq from dataclasses import dataclass -from typing import Dict, Set import numpy as np @@ -33,8 +32,8 @@ def __init__(self, hash_key: int, block: int, tokens: np.ndarray, num_matched: i self.block = block self.tokens = tokens self.num_matched = num_matched - self.children: Dict[int, 'Node'] = dict() - self._parent: 'Node' = None + self.children: dict[int, Node] = dict() + self._parent: Node = None @property def parent(self): @@ -67,8 +66,8 @@ def __init__(self, cache_config: CacheConfig, block_manager: BaseBlockManager): self.enable = self.cache_config.enable_prefix_caching # caches with different adapter should not be shared. - self._roots: Dict[str, Node] = dict() - self.leaves: Set[Node] = set() + self._roots: dict[str, Node] = dict() + self.leaves: set[Node] = set() self.stats = PrefixCacheStats() def hit_rate(self): diff --git a/lmdeploy/pytorch/paging/eviction_helper/base_eviction_helper.py b/lmdeploy/pytorch/paging/eviction_helper/base_eviction_helper.py index 3799d60a42..f075748f70 100644 --- a/lmdeploy/pytorch/paging/eviction_helper/base_eviction_helper.py +++ b/lmdeploy/pytorch/paging/eviction_helper/base_eviction_helper.py @@ -1,10 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import List from ...messages import SchedulerSequence from ..scheduler import Scheduler -SeqList = List[SchedulerSequence] +SeqList = list[SchedulerSequence] class BaseEvictionHelper: @@ -21,6 +20,6 @@ def need_swap_in(self, seq: SchedulerSequence): """Sequence need swap in.""" raise NotImplementedError('Not implemented.') - def evict_for_seq(self, seq: SchedulerSequence, evictable_seqs: List[SchedulerSequence], prealloc_size: int): + def evict_for_seq(self, seq: SchedulerSequence, evictable_seqs: list[SchedulerSequence], prealloc_size: int): """Evict seqs.""" raise NotImplementedError('Not implemented.') diff --git a/lmdeploy/pytorch/paging/eviction_helper/recompute_eviction_helper.py b/lmdeploy/pytorch/paging/eviction_helper/recompute_eviction_helper.py index bdded115dd..4976eddb20 100644 --- a/lmdeploy/pytorch/paging/eviction_helper/recompute_eviction_helper.py +++ b/lmdeploy/pytorch/paging/eviction_helper/recompute_eviction_helper.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import List from ...messages import SchedulerSequence from ..scheduler import Scheduler @@ -17,7 +16,7 @@ def __init__(self, scheduler: Scheduler): else: self.evict_for_seq = self._evict_for_ssm - def _evict_for_seq_default(self, seq: SchedulerSequence, evictable_seqs: List[SchedulerSequence], + def _evict_for_seq_default(self, seq: SchedulerSequence, evictable_seqs: list[SchedulerSequence], prealloc_size: int): """Evict seqs.""" block_manager = self.block_manager @@ -56,7 +55,7 @@ def _evict_for_seq_default(self, seq: SchedulerSequence, evictable_seqs: List[Sc return success - def _evict_for_ssm(self, seq: SchedulerSequence, evictable_seqs: List[SchedulerSequence], prealloc_size: int): + def _evict_for_ssm(self, seq: SchedulerSequence, evictable_seqs: list[SchedulerSequence], prealloc_size: int): """Evict seqs.""" block_manager = self.block_manager state_manager = self.state_manager diff --git a/lmdeploy/pytorch/paging/scheduler.py b/lmdeploy/pytorch/paging/scheduler.py index 9208b7cdf2..de42faca7d 100644 --- a/lmdeploy/pytorch/paging/scheduler.py +++ b/lmdeploy/pytorch/paging/scheduler.py @@ -4,7 +4,6 @@ from collections import OrderedDict from contextlib import contextmanager from dataclasses import dataclass -from typing import Dict, List from torch.profiler import record_function @@ -20,8 +19,8 @@ logger = get_logger('lmdeploy') -MapType = Dict[int, int] -SeqList = List[SchedulerSequence] +MapType = dict[int, int] +SeqList = list[SchedulerSequence] @dataclass @@ -50,10 +49,10 @@ def __init__( ) -> None: self.scheduler_config = scheduler_config self.cache_config = cache_config - self.sessions: Dict[int, SchedulerSession] = OrderedDict() + self.sessions: dict[int, SchedulerSession] = OrderedDict() # For Disaggregation - self.locked_sessions: Dict[int, SchedulerSession] = OrderedDict() + self.locked_sessions: dict[int, SchedulerSession] = OrderedDict() self.block_manager = build_block_manager(cache_config) self.block_trie = BlockTrie(self.cache_config, self.block_manager) diff --git a/lmdeploy/pytorch/ray.py b/lmdeploy/pytorch/ray.py index 6f9261f317..bb575df98d 100644 --- a/lmdeploy/pytorch/ray.py +++ b/lmdeploy/pytorch/ray.py @@ -1,7 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import os import time -from typing import Dict, List import ray from ray.util.placement_group import PlacementGroup @@ -28,7 +27,7 @@ def get_device_str(device_type: str = None) -> str: return device_type -def get_resource_kwargs(device_str: str, resource_used: float = 0.01) -> Dict[str, float]: +def get_resource_kwargs(device_str: str, resource_used: float = 0.01) -> dict[str, float]: """Get resource kwargs.""" if device_str == 'GPU': resource_kwargs = {'num_gpus': resource_used} @@ -124,7 +123,7 @@ def init_ray_cluster(world_size: int, ray_address: str = None, dp: int = 1, devi 'The number of required %ss exceeds the total ' 'number of available %ss in the placement group.', device_str, device_str) # Create a new placement group - placement_group_specs: List[Dict[str, float]] = ([{device_str: 1.0} for _ in range(world_size)]) + placement_group_specs: list[dict[str, float]] = ([{device_str: 1.0} for _ in range(world_size)]) # Pin at least one bundle to the local node. # This helps multi-node DP keep each dp_rank process's workers co-located with diff --git a/lmdeploy/pytorch/spec_decode/base.py b/lmdeploy/pytorch/spec_decode/base.py index 113f6b6ead..3ecfab5f82 100644 --- a/lmdeploy/pytorch/spec_decode/base.py +++ b/lmdeploy/pytorch/spec_decode/base.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict import torch @@ -52,7 +51,7 @@ def reset_graph_runner(self): 'reset graph runner' pass - def update_main_model_outputs(self, output: Dict[str, torch.Tensor], model_inputs: ModelInputs): + def update_main_model_outputs(self, output: dict[str, torch.Tensor], model_inputs: ModelInputs): """Update outputs of main model.""" if not self.is_enabled(): hidden_states = output.pop('hidden_states') diff --git a/lmdeploy/pytorch/spec_decode/proposers/base.py b/lmdeploy/pytorch/spec_decode/proposers/base.py index aaac4e40ec..32bee78c6f 100644 --- a/lmdeploy/pytorch/spec_decode/proposers/base.py +++ b/lmdeploy/pytorch/spec_decode/proposers/base.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, List, Optional +from typing import Any import torch from mmengine import Registry @@ -23,8 +23,8 @@ def draft_model_forward( model: torch.nn.Module, inputs: ModelInputs, - model_config: Optional[ModelConfig] = None, - cache_engine: Optional[CacheEngine] = None, + model_config: ModelConfig | None = None, + cache_engine: CacheEngine | None = None, ): """Perform model forward.""" stream = torch.cuda.current_stream() @@ -86,7 +86,7 @@ def build_model(self, empty_init: bool, target_model: torch.nn.Module = None, bu self.target_model = target_model def get_outputs(self, - model_outputs: Dict[str, torch.Tensor], + model_outputs: dict[str, torch.Tensor], model_inputs: ModelInputs, extra_inputs: ExtraInputs = None): """Get outputs.""" @@ -103,7 +103,7 @@ def _forward(self, model_inputs: ModelInputs, cache_engine: CacheEngine = None): ) def update_inputs_decoding(self, model_inputs: ModelInputs, extra_inputs: ExtraInputs, next_input_ids: torch.Tensor, - target_hidden_states: torch.Tensor, model_metas: List[Any]): + target_hidden_states: torch.Tensor, model_metas: list[Any]): """Update to decoding inputs.""" model_inputs.is_decoding = True batch_size = model_inputs.seq_length.size(0) diff --git a/lmdeploy/pytorch/spec_decode/proposers/deepseek_mtp.py b/lmdeploy/pytorch/spec_decode/proposers/deepseek_mtp.py index de19beb761..09e4c08591 100644 --- a/lmdeploy/pytorch/spec_decode/proposers/deepseek_mtp.py +++ b/lmdeploy/pytorch/spec_decode/proposers/deepseek_mtp.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict import torch @@ -16,7 +15,7 @@ class DeepseekMTP(BaseSpecProposer): def get_outputs(self, - model_outputs: Dict[str, torch.Tensor], + model_outputs: dict[str, torch.Tensor], model_inputs: ModelInputs, extra_inputs: ARSpecExtraInputs = None): """Get outputs.""" diff --git a/lmdeploy/pytorch/spec_decode/proposers/eagle3.py b/lmdeploy/pytorch/spec_decode/proposers/eagle3.py index f032496f8f..db1011727e 100644 --- a/lmdeploy/pytorch/spec_decode/proposers/eagle3.py +++ b/lmdeploy/pytorch/spec_decode/proposers/eagle3.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict import torch @@ -32,7 +31,7 @@ def get_target_hidden_size(self, model_config: ModelConfig): return hidden_size * 3 def get_outputs(self, - model_outputs: Dict[str, torch.Tensor], + model_outputs: dict[str, torch.Tensor], model_inputs: ModelInputs, extra_inputs: ExtraInputs = None): """Get outputs.""" diff --git a/lmdeploy/pytorch/spec_decode/reject_sampler.py b/lmdeploy/pytorch/spec_decode/reject_sampler.py index b2c4e34946..8bccd258c6 100644 --- a/lmdeploy/pytorch/spec_decode/reject_sampler.py +++ b/lmdeploy/pytorch/spec_decode/reject_sampler.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. import enum -from typing import Optional import torch from torch import LongTensor, Tensor, nn @@ -24,7 +23,7 @@ def forward( target_logits: Tensor, draft_token_ids: LongTensor, bonus_token_ids: LongTensor, - draft_probs: Optional[Tensor] = None, + draft_probs: Tensor | None = None, ): """forward Args: @@ -49,7 +48,7 @@ def rejection_sample( draft_token_ids: LongTensor, bonus_token_ids: LongTensor, sample_policy: SamplePolicy = SamplePolicy.ALL_GREEDY, - draft_probs: Optional[Tensor] = None, + draft_probs: Tensor | None = None, ): """rejection sample Args: diff --git a/lmdeploy/pytorch/spec_decode/spec_agent.py b/lmdeploy/pytorch/spec_decode/spec_agent.py index 51739d05d5..16addc9c98 100644 --- a/lmdeploy/pytorch/spec_decode/spec_agent.py +++ b/lmdeploy/pytorch/spec_decode/spec_agent.py @@ -131,7 +131,7 @@ async def _async_forward(self, inputs: ModelInputs): """Model forward. Args: - inputs (Dict): The input data comes from _make_inputs. + inputs (dict): The input data comes from _make_inputs. """ output = self._forward_impl(inputs) await asyncio.sleep(0) @@ -142,7 +142,7 @@ async def _async_model_forward(self, inputs: ModelInputs, extra_inputs: ARSpecEx """Model forward. Args: - inputs (Dict): The input data comes from _make_inputs. + inputs (dict): The input data comes from _make_inputs. """ outputs = await self._async_forward(inputs) if inputs.is_chunk: diff --git a/lmdeploy/pytorch/strategies/ar/__init__.py b/lmdeploy/pytorch/strategies/ar/__init__.py index b593107c2e..d18cf4e952 100644 --- a/lmdeploy/pytorch/strategies/ar/__init__.py +++ b/lmdeploy/pytorch/strategies/ar/__init__.py @@ -5,12 +5,12 @@ from lmdeploy.pytorch.strategies.base.sequence import SequenceStrategy if TYPE_CHECKING: + from lmdeploy.pytorch.config import CacheConfig, SchedulerConfig from lmdeploy.pytorch.strategies.base.cudagraph import CudagraphStrategy + from lmdeploy.pytorch.strategies.base.engine import EngineStrategy + from lmdeploy.pytorch.strategies.base.model_agent import ModelAgentStrategy from lmdeploy.pytorch.strategies.base.model_inputs import ModelInputsStrategy from lmdeploy.pytorch.strategies.base.sampling import SamplingStrategy - from lmdeploy.pytorch.strategies.base.model_agent import ModelAgentStrategy - from lmdeploy.pytorch.strategies.base.engine import EngineStrategy - from lmdeploy.pytorch.config import CacheConfig, SchedulerConfig from ..base import StrategyFactoryBase diff --git a/lmdeploy/pytorch/strategies/ar/model_agent.py b/lmdeploy/pytorch/strategies/ar/model_agent.py index 9c7abb5887..df11ac15a7 100644 --- a/lmdeploy/pytorch/strategies/ar/model_agent.py +++ b/lmdeploy/pytorch/strategies/ar/model_agent.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from contextlib import contextmanager from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple +from typing import Any import torch import torch.distributed as dist @@ -14,7 +14,7 @@ from ..base.model_agent import ExtraInputs, ExtraOutputs, ModelAgentStrategy, StoppingCriteria -SeqList = List[SchedulerSequence] +SeqList = list[SchedulerSequence] def get_model_inputs_next_decoding(inputs: ModelInputs, input_ids: torch.Tensor, max_q_seqlen: int, @@ -74,8 +74,8 @@ def update(self, delta: ModelInputsDelta): def step(self, token_ids: torch.Tensor, stop_words: torch.Tensor, - inputs: Optional[ModelInputs] = None, - extra_inputs: Optional[ARExtraInputs] = None): + inputs: ModelInputs | None = None, + extra_inputs: ARExtraInputs | None = None): """Check whether to stop generation.""" num_appendable_ids = self.num_appendable_ids - 1 stopped = num_appendable_ids <= 0 @@ -105,7 +105,7 @@ def slice_outputs(self, inputs: torch.Tensor, seq_length: torch.LongTensor) -> t return inputs[last_idx] def slice_extra_inputs(self, extra_inputs: ARExtraInputs, model_inputs: ModelInputs, - model_outputs: Dict[str, torch.Tensor], **kwargs) -> ARExtraInputs: + model_outputs: dict[str, torch.Tensor], **kwargs) -> ARExtraInputs: """Slice outputs.""" return extra_inputs @@ -145,7 +145,7 @@ def update_prefill_for_next_step( next_token_ids: torch.Tensor, model_metas: Any, extra_outputs: ARExtraOutputs, - ) -> Tuple['ModelInputs', ARExtraInputs]: + ) -> tuple['ModelInputs', ARExtraInputs]: """Step next decoding.""" inputs = get_model_inputs_next_decoding(model_inputs, next_token_ids, max_q_seqlen=1, model_metas=model_metas) return inputs, extra_inputs diff --git a/lmdeploy/pytorch/strategies/ar/model_inputs.py b/lmdeploy/pytorch/strategies/ar/model_inputs.py index 7c1910311a..e44a73b628 100644 --- a/lmdeploy/pytorch/strategies/ar/model_inputs.py +++ b/lmdeploy/pytorch/strategies/ar/model_inputs.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional import numpy as np import torch @@ -93,10 +92,10 @@ def index_select(inputs: ModelInputs, indices: torch.Tensor, indice_cpu: np.ndarray = None, block_offsets: torch.Tensor = None, - max_q_seqlen: Optional[int] = None, - max_kv_seqlen: Optional[int] = None, - sum_kv_seqlen: Optional[int] = None, - num_ignored_history: Optional[torch.Tensor] = None): + max_q_seqlen: int | None = None, + max_kv_seqlen: int | None = None, + sum_kv_seqlen: int | None = None, + num_ignored_history: torch.Tensor | None = None): """Index select.""" assert inputs.is_decoding, 'Only support index_select in decoding.' diff --git a/lmdeploy/pytorch/strategies/ar/sequence.py b/lmdeploy/pytorch/strategies/ar/sequence.py index b9b277f961..affb3205d2 100644 --- a/lmdeploy/pytorch/strategies/ar/sequence.py +++ b/lmdeploy/pytorch/strategies/ar/sequence.py @@ -1,20 +1,28 @@ # Copyright (c) OpenMMLab. All rights reserved. import time from dataclasses import dataclass -from typing import Any, Dict, List, Optional +from typing import Any import numpy as np from torch import Tensor from lmdeploy.pytorch.disagg.conn.protocol import MigrationRequest from lmdeploy.pytorch.engine.model_agent import BatchedOutputs -from lmdeploy.pytorch.messages import (InputEmbeddings, MessageStatus, MultiModalInputs, SamplingParam, - SchedulerSequence, SchedulerSession, UpdateTokenMode, _to_ndarray) +from lmdeploy.pytorch.messages import ( + InputEmbeddings, + MessageStatus, + MultiModalInputs, + SamplingParam, + SchedulerSequence, + SchedulerSession, + UpdateTokenMode, + _to_ndarray, +) from lmdeploy.pytorch.model_inputs import ModelInputs, ModelInputsDelta from ..base.sequence import SequenceStrategy -SeqList = List[SchedulerSequence] +SeqList = list[SchedulerSequence] @dataclass @@ -23,8 +31,8 @@ class SchedulerSequenceDefault(SchedulerSequence): def update_token_ids(self, token_ids: Tensor, multimodals: MultiModalInputs = None, - embeddings: List[InputEmbeddings] = None, - model_meta: Dict[str, Any] = None, + embeddings: list[InputEmbeddings] = None, + model_meta: dict[str, Any] = None, mode: UpdateTokenMode = UpdateTokenMode.INPUTS, routed_experts: np.ndarray = None, **kwargs): @@ -85,7 +93,7 @@ def make_sequence(self, session: 'SchedulerSession', sampling_param: 'SamplingParam' = None, adapter_name: str = None, - migration_request: Optional[MigrationRequest] = None, + migration_request: MigrationRequest | None = None, resp_cache: bool = False, preserve_cache: bool = False) -> 'SchedulerSequence': """Make sequence.""" diff --git a/lmdeploy/pytorch/strategies/ar_spec/__init__.py b/lmdeploy/pytorch/strategies/ar_spec/__init__.py index 416d20460c..5f692e33a0 100644 --- a/lmdeploy/pytorch/strategies/ar_spec/__init__.py +++ b/lmdeploy/pytorch/strategies/ar_spec/__init__.py @@ -5,12 +5,12 @@ from lmdeploy.pytorch.strategies.base.sequence import SequenceStrategy if TYPE_CHECKING: + from lmdeploy.pytorch.config import CacheConfig, SchedulerConfig from lmdeploy.pytorch.strategies.base.cudagraph import CudagraphStrategy + from lmdeploy.pytorch.strategies.base.engine import EngineStrategy + from lmdeploy.pytorch.strategies.base.model_agent import ModelAgentStrategy from lmdeploy.pytorch.strategies.base.model_inputs import ModelInputsStrategy from lmdeploy.pytorch.strategies.base.sampling import SamplingStrategy - from lmdeploy.pytorch.strategies.base.model_agent import ModelAgentStrategy - from lmdeploy.pytorch.strategies.base.engine import EngineStrategy - from lmdeploy.pytorch.config import CacheConfig, SchedulerConfig from ..base import StrategyFactoryBase diff --git a/lmdeploy/pytorch/strategies/ar_spec/model_agent.py b/lmdeploy/pytorch/strategies/ar_spec/model_agent.py index eeb2e5934e..08914072cc 100644 --- a/lmdeploy/pytorch/strategies/ar_spec/model_agent.py +++ b/lmdeploy/pytorch/strategies/ar_spec/model_agent.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from contextlib import contextmanager from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple +from typing import Any import torch import torch.distributed as dist @@ -15,7 +15,7 @@ from ..ar.model_agent import ARStoppingCriteria, get_model_inputs_next_decoding from ..base.model_agent import ExtraInputs, ExtraOutputs, ModelAgentStrategy -SeqList = List[SchedulerSequence] +SeqList = list[SchedulerSequence] @dataclass @@ -84,8 +84,8 @@ def update(self, delta: ModelInputsDelta): def step(self, next_token_ids: torch.Tensor, stop_words: torch.Tensor, - inputs: Optional[ModelInputs] = None, - extra_inputs: Optional[ARSpecExtraInputs] = None): + inputs: ModelInputs | None = None, + extra_inputs: ARSpecExtraInputs | None = None): """Check whether to stop generation.""" token_ids = extra_inputs.output_token_ids @@ -128,7 +128,7 @@ def slice_outputs(self, inputs: torch.Tensor, seq_length: torch.LongTensor) -> t return inputs[last_idx] def slice_extra_inputs(self, extra_inputs: ARSpecExtraInputs, model_inputs: ModelInputs, - model_outputs: Dict[str, torch.Tensor], **kwargs) -> ARSpecExtraInputs: + model_outputs: dict[str, torch.Tensor], **kwargs) -> ARSpecExtraInputs: """Slice outputs.""" extra_inputs = ARSpecExtraInputs() extra_inputs.target_hidden_states = model_outputs.get('hidden_states') @@ -178,7 +178,7 @@ def update_prefill_for_next_step( next_token_ids: torch.Tensor, model_metas: Any, extra_outputs: ARSpecExtraOutputs, - ) -> Tuple['ModelInputs', ARSpecExtraInputs]: + ) -> tuple['ModelInputs', ARSpecExtraInputs]: """Step next decoding.""" next_token_ids = next_token_ids[:, None] next_token_ids = torch.cat([next_token_ids, extra_outputs.draft_token_ids], dim=-1) diff --git a/lmdeploy/pytorch/strategies/ar_spec/sequence.py b/lmdeploy/pytorch/strategies/ar_spec/sequence.py index 7089bce3d0..ecb095db72 100644 --- a/lmdeploy/pytorch/strategies/ar_spec/sequence.py +++ b/lmdeploy/pytorch/strategies/ar_spec/sequence.py @@ -1,20 +1,27 @@ # Copyright (c) OpenMMLab. All rights reserved. import time from dataclasses import dataclass -from typing import Any, Dict, List, Optional +from typing import Any import numpy as np from torch import Tensor from lmdeploy.pytorch.disagg.conn.protocol import MigrationRequest from lmdeploy.pytorch.engine.model_agent import BatchedOutputs -from lmdeploy.pytorch.messages import (InputEmbeddings, MessageStatus, MultiModalInputs, SamplingParam, - SchedulerSession, UpdateTokenMode, _to_ndarray) +from lmdeploy.pytorch.messages import ( + InputEmbeddings, + MessageStatus, + MultiModalInputs, + SamplingParam, + SchedulerSession, + UpdateTokenMode, + _to_ndarray, +) from lmdeploy.pytorch.model_inputs import ModelInputs, ModelInputsDelta from ..ar.sequence import ARSequenceStrategy, SchedulerSequenceDefault -SeqList = List['SchedulerSequenceARSpec'] +SeqList = list['SchedulerSequenceARSpec'] @dataclass @@ -110,8 +117,8 @@ def _update_token_ids_decode(self, token_ids: np.ndarray, draft_token_ids: np.nd def update_token_ids(self, token_ids: Tensor, multimodals: MultiModalInputs = None, - embeddings: List[InputEmbeddings] = None, - model_meta: Dict[str, Any] = None, + embeddings: list[InputEmbeddings] = None, + model_meta: dict[str, Any] = None, draft_token_ids: Tensor = None, mode: UpdateTokenMode = UpdateTokenMode.INPUTS, **kwargs): @@ -144,7 +151,7 @@ def make_sequence(self, session: 'SchedulerSession', sampling_param: 'SamplingParam' = None, adapter_name: str = None, - migration_request: Optional[MigrationRequest] = None, + migration_request: MigrationRequest | None = None, resp_cache: bool = False, preserve_cache: bool = False) -> 'SchedulerSequenceARSpec': """Make sequence.""" diff --git a/lmdeploy/pytorch/strategies/base/model_agent.py b/lmdeploy/pytorch/strategies/base/model_agent.py index 471f6e5a66..1a7796fff0 100644 --- a/lmdeploy/pytorch/strategies/base/model_agent.py +++ b/lmdeploy/pytorch/strategies/base/model_agent.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from contextlib import contextmanager from dataclasses import dataclass, fields -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple +from typing import TYPE_CHECKING, Any, Optional import numpy as np import torch @@ -12,7 +12,7 @@ from lmdeploy.pytorch.engine.logits_process import SamplingInputs from lmdeploy.pytorch.messages import SchedulerSequence from lmdeploy.pytorch.model_inputs import ModelInputs, ModelInputsDelta - SeqList = List[SchedulerSequence] + SeqList = list[SchedulerSequence] def to_device(self, device: str, non_blocking: bool = False): @@ -103,7 +103,7 @@ def step(self, token_ids: torch.Tensor, stop_words: torch.Tensor, inputs: Optional['ModelInputs'] = None, - extra_inputs: Optional[ExtraInputs] = None): + extra_inputs: ExtraInputs | None = None): """Check whether to stop generation.""" pass @@ -122,7 +122,7 @@ def slice_outputs(self, inputs: torch.Tensor, seq_length: torch.LongTensor) -> t @abstractmethod def slice_extra_inputs(self, extra_inputs: ExtraInputs, model_inputs: 'ModelInputs', - model_outputs: Dict[str, torch.Tensor], **kwargs) -> ExtraInputs: + model_outputs: dict[str, torch.Tensor], **kwargs) -> ExtraInputs: """Slice outputs.""" pass @@ -163,14 +163,14 @@ def update_prefill_for_next_step( next_token_ids: torch.Tensor, model_metas: Any, extra_outputs: ExtraOutputs, - ) -> Tuple['ModelInputs', ExtraInputs]: + ) -> tuple['ModelInputs', ExtraInputs]: """Step next decoding.""" pass @abstractmethod def update_decoding_for_next_step(self, model_inputs: 'ModelInputs', next_token_ids: torch.Tensor, model_metas: Any, extra_inputs: ExtraInputs, - extra_outputs: ExtraOutputs) -> Tuple['ModelInputs', ExtraInputs]: + extra_outputs: ExtraOutputs) -> tuple['ModelInputs', ExtraInputs]: """Step next inputs.""" pass diff --git a/lmdeploy/pytorch/strategies/base/sampling.py b/lmdeploy/pytorch/strategies/base/sampling.py index 2948627870..bf6c4aac78 100644 --- a/lmdeploy/pytorch/strategies/base/sampling.py +++ b/lmdeploy/pytorch/strategies/base/sampling.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABC, abstractmethod -from typing import List import torch @@ -10,7 +9,7 @@ from .model_agent import ExtraInputs -SeqList = List[SchedulerSequence] +SeqList = list[SchedulerSequence] class SamplingStrategy(ABC): diff --git a/lmdeploy/pytorch/strategies/base/sequence.py b/lmdeploy/pytorch/strategies/base/sequence.py index 8a19e69356..46fec916af 100644 --- a/lmdeploy/pytorch/strategies/base/sequence.py +++ b/lmdeploy/pytorch/strategies/base/sequence.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING from lmdeploy.pytorch.disagg.conn.protocol import MigrationRequest @@ -8,7 +8,7 @@ from lmdeploy.pytorch.engine.model_agent import BatchedOutputs from lmdeploy.pytorch.messages import SamplingParam, SchedulerSequence, SchedulerSession from lmdeploy.pytorch.model_inputs import ModelInputs, ModelInputsDelta - SeqList = List[SchedulerSequence] + SeqList = list[SchedulerSequence] class SequenceStrategy(ABC): @@ -19,7 +19,7 @@ def make_sequence(self, session: 'SchedulerSession', sampling_param: 'SamplingParam' = None, adapter_name: str = None, - migration_request: Optional[MigrationRequest] = None, + migration_request: MigrationRequest | None = None, resp_cache: bool = False, preserve_cache: bool = False) -> 'SchedulerSequence': """Make sequence.""" diff --git a/lmdeploy/pytorch/strategies/dllm/__init__.py b/lmdeploy/pytorch/strategies/dllm/__init__.py index dc0395a017..e1c9b9adcd 100644 --- a/lmdeploy/pytorch/strategies/dllm/__init__.py +++ b/lmdeploy/pytorch/strategies/dllm/__init__.py @@ -6,12 +6,12 @@ from lmdeploy.utils import get_logger if TYPE_CHECKING: + from lmdeploy.pytorch.config import CacheConfig, SchedulerConfig from lmdeploy.pytorch.strategies.base.cudagraph import CudagraphStrategy + from lmdeploy.pytorch.strategies.base.engine import EngineStrategy + from lmdeploy.pytorch.strategies.base.model_agent import ModelAgentStrategy from lmdeploy.pytorch.strategies.base.model_inputs import ModelInputsStrategy from lmdeploy.pytorch.strategies.base.sampling import SamplingStrategy - from lmdeploy.pytorch.strategies.base.model_agent import ModelAgentStrategy - from lmdeploy.pytorch.strategies.base.engine import EngineStrategy - from lmdeploy.pytorch.config import CacheConfig, SchedulerConfig from ..base import StrategyFactoryBase diff --git a/lmdeploy/pytorch/strategies/dllm/model_agent.py b/lmdeploy/pytorch/strategies/dllm/model_agent.py index e1588300a2..3371997341 100644 --- a/lmdeploy/pytorch/strategies/dllm/model_agent.py +++ b/lmdeploy/pytorch/strategies/dllm/model_agent.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from contextlib import contextmanager from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple +from typing import Any import numpy as np import torch @@ -18,7 +18,7 @@ from ..base.model_agent import ExtraInputs, ExtraOutputs, ModelAgentStrategy, StoppingCriteria from .unmasking import UnmaskingProcessor -SeqList = List[SchedulerSequence] +SeqList = list[SchedulerSequence] def get_model_inputs_next_decoding(inputs: ModelInputs, input_ids: torch.Tensor, max_q_seqlen, @@ -121,8 +121,8 @@ def update(self, delta: 'ModelInputsDelta') -> 'DLLMStoppingCriteria': def step(self, token_ids: torch.Tensor, stop_words: torch.Tensor, - inputs: Optional[ModelInputs] = None, - extra_inputs: Optional[DLLMExtraInputs] = None): + inputs: ModelInputs | None = None, + extra_inputs: DLLMExtraInputs | None = None): """Check whether to stop generation.""" num_appendable_ids = self.num_appendable_ids output_start_pos = self.output_start_pos @@ -198,7 +198,7 @@ def slice_outputs(self, inputs: torch.Tensor, seq_length: torch.LongTensor) -> t return inputs def slice_extra_inputs(self, extra_inputs: DLLMExtraInputs, model_inputs: ModelInputs, - model_outputs: Dict[str, torch.Tensor], **kwargs) -> DLLMExtraInputs: + model_outputs: dict[str, torch.Tensor], **kwargs) -> DLLMExtraInputs: """Slice outputs.""" dllm_mask = self.slice_outputs(extra_inputs.dllm_mask, model_inputs.seq_length) return DLLMExtraInputs(dllm_mask=dllm_mask) @@ -269,7 +269,7 @@ def update_prefill_for_next_step( next_token_ids: torch.Tensor, model_metas: Any, extra_outputs: DLLMExtraOutputs, - ) -> Tuple['ModelInputs', DLLMExtraInputs]: + ) -> tuple['ModelInputs', DLLMExtraInputs]: """Step next decoding.""" dllm_mask = extra_outputs.dllm_mask next_token_ids, dllm_mask, step_seqlens = self._update_dllm(next_token_ids, dllm_mask, model_inputs.seq_length) diff --git a/lmdeploy/pytorch/strategies/dllm/sampling.py b/lmdeploy/pytorch/strategies/dllm/sampling.py index d7c8bc4716..ab5174f017 100644 --- a/lmdeploy/pytorch/strategies/dllm/sampling.py +++ b/lmdeploy/pytorch/strategies/dllm/sampling.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import List import numpy as np import torch @@ -12,7 +11,7 @@ from ..ar.sampling import ARSamplingStrategy from .model_agent import DLLMExtraInputs -SeqList = List[SchedulerSequence] +SeqList = list[SchedulerSequence] class DLLMSamplingStrategy(ARSamplingStrategy): diff --git a/lmdeploy/pytorch/strategies/dllm/sequence.py b/lmdeploy/pytorch/strategies/dllm/sequence.py index 03ad19e75d..4b6ac470b4 100644 --- a/lmdeploy/pytorch/strategies/dllm/sequence.py +++ b/lmdeploy/pytorch/strategies/dllm/sequence.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import time from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional +from typing import Any import numpy as np from torch import Tensor @@ -9,14 +9,22 @@ from lmdeploy.pytorch import consts from lmdeploy.pytorch.disagg.conn.protocol import MigrationRequest from lmdeploy.pytorch.engine.model_agent import BatchedOutputs -from lmdeploy.pytorch.messages import (HistoryTokenIds, InputEmbeddings, MessageStatus, MultiModalInputs, SamplingParam, - SchedulerSession, UpdateTokenMode, _to_ndarray) +from lmdeploy.pytorch.messages import ( + HistoryTokenIds, + InputEmbeddings, + MessageStatus, + MultiModalInputs, + SamplingParam, + SchedulerSession, + UpdateTokenMode, + _to_ndarray, +) from lmdeploy.pytorch.model_inputs import ModelInputs, ModelInputsDelta from ..ar.sequence import SchedulerSequenceDefault from ..base.sequence import SequenceStrategy -SeqList = List['SchedulerSequenceDLLM'] +SeqList = list['SchedulerSequenceDLLM'] DLLM_MASKED = consts.DLLM_MASKED DLLM_UNMASKED = consts.DLLM_UNMASKED @@ -165,8 +173,8 @@ def _update_token_ids_prefill(self, token_ids: np.ndarray, dllm_mask: np.ndarray def update_token_ids(self, token_ids: Tensor, multimodals: MultiModalInputs = None, - embeddings: List[InputEmbeddings] = None, - model_meta: Dict[str, Any] = None, + embeddings: list[InputEmbeddings] = None, + model_meta: dict[str, Any] = None, dllm_mask: Tensor = None, mode: UpdateTokenMode = UpdateTokenMode.INPUTS, **kwargs): @@ -216,7 +224,7 @@ def make_sequence(self, session: 'SchedulerSession', sampling_param: 'SamplingParam' = None, adapter_name: str = None, - migration_request: Optional[MigrationRequest] = None, + migration_request: MigrationRequest | None = None, resp_cache: bool = False, preserve_cache: bool = False) -> 'SchedulerSequenceDLLM': """Make sequence.""" diff --git a/lmdeploy/pytorch/tools/utils.py b/lmdeploy/pytorch/tools/utils.py index 6d0c9d836c..8b71f75859 100644 --- a/lmdeploy/pytorch/tools/utils.py +++ b/lmdeploy/pytorch/tools/utils.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. from contextlib import contextmanager -from typing import List class Timer: @@ -187,7 +186,7 @@ def _print_meta(out: Response): # Main loop print(colored('━' * term_size, border_color)) - outputs: List[Response] = outputs + outputs: list[Response] = outputs for idx, out in enumerate(outputs): header = f'OUTPUT [{idx + 1}/{len(outputs)}]' header_formatted = colored(f'✦ {header}', 'light_magenta', attrs=['bold']) diff --git a/lmdeploy/pytorch/transformers/__init__.py b/lmdeploy/pytorch/transformers/__init__.py index bfafdb1899..994d8ffd0d 100644 --- a/lmdeploy/pytorch/transformers/__init__.py +++ b/lmdeploy/pytorch/transformers/__init__.py @@ -6,7 +6,7 @@ from lmdeploy.utils import get_logger -@lru_cache() +@lru_cache def register_config(model_type: str): if model_type == 'deepseek_v32': from lmdeploy.pytorch.transformers.configuration_deepseek_v32 import DeepseekV32Config diff --git a/lmdeploy/pytorch/utils.py b/lmdeploy/pytorch/utils.py index c72aafe7da..9c60d2d001 100644 --- a/lmdeploy/pytorch/utils.py +++ b/lmdeploy/pytorch/utils.py @@ -2,9 +2,10 @@ # modify from: https://github.com/vllm-project/vllm import asyncio import inspect +from collections.abc import Sequence from contextlib import contextmanager from inspect import Parameter, Signature -from typing import Dict, Generic, Optional, Sequence, TypeVar +from typing import Generic, TypeVar import psutil @@ -26,7 +27,7 @@ def get_cpu_memory() -> int: return psutil.virtual_memory().total -def bind_sigature(input_names: str, args: Sequence, kwargs: Dict): +def bind_sigature(input_names: str, args: Sequence, kwargs: dict): """Bind args and kwargs to given input names.""" kind = inspect._ParameterKind.POSITIONAL_OR_KEYWORD @@ -59,14 +60,14 @@ def get_instance(*args, **kwargs): class CtxMgrBase(Generic[T]): """Context manager base class.""" - def __init__(self, default: Optional[T] = None): + def __init__(self, default: T | None = None): self._context = default - def current_context(self) -> Optional[T]: + def current_context(self) -> T | None: """Get current context.""" return self._context - def set_context(self, context: Optional[T]): + def set_context(self, context: T | None): """Set current context.""" self._context = context diff --git a/lmdeploy/pytorch/weight_loader/model_weight_loader.py b/lmdeploy/pytorch/weight_loader/model_weight_loader.py index 4b6d040a8d..cf2adde982 100644 --- a/lmdeploy/pytorch/weight_loader/model_weight_loader.py +++ b/lmdeploy/pytorch/weight_loader/model_weight_loader.py @@ -68,7 +68,7 @@ def _get_weight_map(model_path: str, weight_type: str): else: raise RuntimeError(f'Unsupported weight type: {weight_type}.') - with open(load_index, mode='r', encoding='utf-8') as f: + with open(load_index, encoding='utf-8') as f: index = json.load(f) weight_map = index['weight_map'] diff --git a/lmdeploy/serve/core/async_engine.py b/lmdeploy/serve/core/async_engine.py index 05c3485fdb..4a4f466d92 100644 --- a/lmdeploy/serve/core/async_engine.py +++ b/lmdeploy/serve/core/async_engine.py @@ -6,19 +6,29 @@ import random from contextlib import asynccontextmanager from copy import deepcopy -from typing import Any, Dict, List, Literal +from typing import Any, Literal import torch from lmdeploy.archs import get_model_arch from lmdeploy.logger import RequestLogger -from lmdeploy.messages import (EngineOutput, GenerationConfig, PytorchEngineConfig, Response, ResponseType, - SpeculativeConfig, TurbomindEngineConfig) +from lmdeploy.messages import ( + EngineOutput, + GenerationConfig, + PytorchEngineConfig, + Response, + ResponseType, + SpeculativeConfig, + TurbomindEngineConfig, +) from lmdeploy.metrics.metrics_processor import metrics_processor from lmdeploy.metrics.stats import IterationStats, RequestStats, SpeculativeDecodingStats from lmdeploy.model import ChatTemplateConfig, get_chat_template -from lmdeploy.pytorch.disagg.conn.protocol import (DistServeConnectionRequest, DistServeDropConnectionRequest, - DistServeInitRequest) +from lmdeploy.pytorch.disagg.conn.protocol import ( + DistServeConnectionRequest, + DistServeDropConnectionRequest, + DistServeInitRequest, +) from lmdeploy.serve.managers import Session, SessionManager from lmdeploy.serve.processors import MultimodalProcessor from lmdeploy.tokenizer import DetokenizeState, Tokenizer @@ -37,11 +47,11 @@ class GenOut: input_token_len: int generate_token_len: int finish_reason: Literal['stop', 'length', 'error'] | None = None - token_ids: List[int] | None = None - logprobs: List[Dict[int, float]] | None = None + token_ids: list[int] | None = None + logprobs: list[dict[int, float]] | None = None logits: Any = None last_hidden_state: Any = None - cache_block_ids: List[int] | None = None # for disaggregation + cache_block_ids: list[int] | None = None # for disaggregation routed_experts: Any = None # for RL router replay def to_response(self, index: int = 0) -> Response: @@ -218,7 +228,7 @@ def sleep(self, level: int = 1): self.sleeping_tags = {'weights', 'kv_cache'} self.is_sleeping = True - def wakeup(self, tags: List[str] | None = None): + def wakeup(self, tags: list[str] | None = None): """Wake up the model. Args: @@ -282,7 +292,7 @@ async def generate( messages, session_id: int | Session, gen_config: GenerationConfig | None = None, - tools: List[object] | None = None, + tools: list[object] | None = None, reasoning_effort: Literal['low', 'medium', 'high'] | None = None, stream_response: bool = True, sequence_start: bool = True, @@ -291,10 +301,10 @@ async def generate( do_preprocess: bool = True, adapter_name: str | None = None, rewind_stop_tokens: bool = False, - input_ids: List | None = None, + input_ids: list | None = None, enable_thinking: bool | None = None, - chat_template_kwargs: Dict | None = None, - mm_processor_kwargs: Dict[str, Any] | None = None, + chat_template_kwargs: dict | None = None, + mm_processor_kwargs: dict[str, Any] | None = None, **kwargs): """Generate responses. @@ -577,21 +587,21 @@ def free_cache(self, session_id: int): def p2p_initialize(self, init_request: DistServeInitRequest): return self.engine.p2p_initialize(init_request) - def p2p_connect(self, conn_request: List[DistServeConnectionRequest]): + def p2p_connect(self, conn_request: list[DistServeConnectionRequest]): return self.engine.p2p_connect(conn_request) - def p2p_drop_connect(self, drop_conn_request: List[DistServeDropConnectionRequest]): + def p2p_drop_connect(self, drop_conn_request: list[DistServeDropConnectionRequest]): return self.engine.p2p_drop_connect(drop_conn_request) """ DistServe Async Engine API End """ - async def async_get_reward_score(self, input_ids: List) -> List[float]: + async def async_get_reward_score(self, input_ids: list) -> list[float]: """Async version of get_reward_score.""" supported_reward_models = ['InternLM2ForRewardModel', 'Qwen2ForRewardModel'] if self.arch not in supported_reward_models: raise ValueError(f'{self.arch} is not in reward model list: {supported_reward_models}') - assert isinstance(input_ids, List) - assert all(isinstance(x, int) for x in input_ids) or all(isinstance(x, List) for x in input_ids) + assert isinstance(input_ids, list) + assert all(isinstance(x, int) for x in input_ids) or all(isinstance(x, list) for x in input_ids) # Make input_ids a list of token_id list input_ids = [input_ids] if isinstance(input_ids[0], int) else input_ids @@ -603,10 +613,10 @@ async def async_get_reward_score(self, input_ids: List) -> List[float]: async def async_get_logits(self, input_ids, - sessions: List['Session'] | None = None, + sessions: list['Session'] | None = None, sequence_start: bool = True, - sequence_end: bool = True) -> List[torch.Tensor]: - assert input_ids and all(isinstance(_, List) for _ in input_ids) + sequence_end: bool = True) -> list[torch.Tensor]: + assert input_ids and all(isinstance(_, list) for _ in input_ids) assert sessions is None or (len(sessions) == len(input_ids)) logits = [None] * len(input_ids) diff --git a/lmdeploy/serve/managers/session_manager.py b/lmdeploy/serve/managers/session_manager.py index 7dfefc767c..0ac7e1465f 100644 --- a/lmdeploy/serve/managers/session_manager.py +++ b/lmdeploy/serve/managers/session_manager.py @@ -5,7 +5,7 @@ import itertools import weakref from contextlib import asynccontextmanager -from typing import Any, List, Tuple +from typing import Any from lmdeploy.messages import GenerationConfig, Response from lmdeploy.serve.core.exceptions import SafeRunException @@ -21,7 +21,7 @@ def __init__(self, session_id: int, session_mgr: SessionManager, **kwargs): self.session_id = session_id self.prompt: Any = None self.response: Response | None = None - self.history: List[Tuple[Any, str]] = [] + self.history: list[tuple[Any, str]] = [] self.gen_config: GenerationConfig | None = None self.step: int = 0 # event to wait for the session to be active diff --git a/lmdeploy/serve/openai/api_client.py b/lmdeploy/serve/openai/api_client.py index b03161f261..fbe5eaf8b8 100644 --- a/lmdeploy/serve/openai/api_client.py +++ b/lmdeploy/serve/openai/api_client.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import json -from typing import Any, Dict, List, Optional, Union +from typing import Any import requests @@ -45,7 +45,7 @@ class APIClient: api key will be used. """ - def __init__(self, api_server_url: str, api_key: Optional[str] = None, **kwargs): + def __init__(self, api_server_url: str, api_key: str | None = None, **kwargs): self.api_server_url = api_server_url self.chat_completions_v1_url = f'{api_server_url}/v1/chat/completions' self.completions_v1_url = f'{api_server_url}/v1/completions' @@ -66,13 +66,13 @@ def available_models(self): return self._available_models def encode(self, - input: Union[str, List[str]], - do_preprocess: Optional[bool] = False, - add_bos: Optional[bool] = True): + input: str | list[str], + do_preprocess: bool | None = False, + add_bos: bool | None = True): """Encode prompts. Args: - input: the prompt to be encoded. In str or List[str] format. + input: the prompt to be encoded. In str or list[str] format. do_preprocess: whether do preprocess or not. Default to False. add_bos: True when it is the beginning of a conversation. False when it is not. Default to True. @@ -90,28 +90,28 @@ def encode(self, def chat_completions_v1( self, model: str, - messages: Union[str, List[Dict[str, str]]], - temperature: Optional[float] = 0.7, - top_p: Optional[float] = 1.0, - logprobs: Optional[bool] = False, - top_logprobs: Optional[int] = 0, - n: Optional[int] = 1, - max_completion_tokens: Optional[int] = None, - max_tokens: Optional[int] = None, - stop: Optional[Union[str, List[str]]] = None, - stream: Optional[bool] = False, - presence_penalty: Optional[float] = 0.0, - frequency_penalty: Optional[float] = 0.0, - user: Optional[str] = None, - repetition_penalty: Optional[float] = 1.0, - ignore_eos: Optional[bool] = False, - skip_special_tokens: Optional[bool] = True, - spaces_between_special_tokens: Optional[bool] = True, + messages: str | list[dict[str, str]], + temperature: float | None = 0.7, + top_p: float | None = 1.0, + logprobs: bool | None = False, + top_logprobs: int | None = 0, + n: int | None = 1, + max_completion_tokens: int | None = None, + max_tokens: int | None = None, + stop: str | list[str] | None = None, + stream: bool | None = False, + presence_penalty: float | None = 0.0, + frequency_penalty: float | None = 0.0, + user: str | None = None, + repetition_penalty: float | None = 1.0, + ignore_eos: bool | None = False, + skip_special_tokens: bool | None = True, + spaces_between_special_tokens: bool | None = True, top_k: int = 40, - min_new_tokens: Optional[int] = None, + min_new_tokens: int | None = None, min_p: float = 0.0, - logit_bias: Optional[Dict[str, float]] = None, - stream_options: Optional[Dict] = None, + logit_bias: dict[str, float] | None = None, + stream_options: dict | None = None, **kwargs, ): """Chat completion v1. @@ -130,7 +130,7 @@ def chat_completions_v1( max_completion_tokens (int | None): output token nums. Default to None. max_tokens (int | None): output token nums. Default to None. Deprecated: Use max_completion_tokens instead. - stop (str | List[str] | None): To stop generating further + stop (str | list[str] | None): To stop generating further tokens. Only accept stop words that's encoded to one token idex. repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty @@ -148,7 +148,7 @@ def chat_completions_v1( 0 and 1. Typical values are in the 0.01-0.2 range, comparably selective as setting `top_p` in the 0.99-0.8 range (use the opposite of normal `top_p` values) - logit_bias (Dict): Bias to logits. Only supported in pytorch engine. + logit_bias (dict): Bias to logits. Only supported in pytorch engine. stream_options: Options for streaming response. Only set this when you set stream: true. @@ -175,23 +175,23 @@ def chat_completions_v1( def completions_v1( self, model: str, - prompt: Union[str, List[Any]], - suffix: Optional[str] = None, - temperature: Optional[float] = 0.7, - n: Optional[int] = 1, - max_completion_tokens: Optional[int] = 16, - max_tokens: Optional[int] = 16, - stream: Optional[bool] = False, - stop: Optional[Union[str, List[str]]] = None, - top_p: Optional[float] = 1.0, - top_k: Optional[int] = 40, - user: Optional[str] = None, + prompt: str | list[Any], + suffix: str | None = None, + temperature: float | None = 0.7, + n: int | None = 1, + max_completion_tokens: int | None = 16, + max_tokens: int | None = 16, + stream: bool | None = False, + stop: str | list[str] | None = None, + top_p: float | None = 1.0, + top_k: int | None = 40, + user: str | None = None, # additional argument of lmdeploy - repetition_penalty: Optional[float] = 1.0, - ignore_eos: Optional[bool] = False, - skip_special_tokens: Optional[bool] = True, - spaces_between_special_tokens: Optional[bool] = True, - stream_options: Optional[Dict] = None, + repetition_penalty: float | None = 1.0, + ignore_eos: bool | None = False, + skip_special_tokens: bool | None = True, + spaces_between_special_tokens: bool | None = True, + stream_options: dict | None = None, **kwargs, ): """Chat completion v1. @@ -213,7 +213,7 @@ def completions_v1( n (int): How many chat completion choices to generate for each input message. Only support one here. stream: whether to stream the results or not. Default to false. - stop (str | List[str] | None): To stop generating further + stop (str | list[str] | None): To stop generating further tokens. Only accept stop words that's encoded to one token idex. repetition_penalty (float): The parameter for repetition penalty. 1.0 means no penalty diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 7b8b2cd9db..6b02fc600f 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -6,10 +6,11 @@ import os import re import time +from collections.abc import AsyncGenerator from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus -from typing import AsyncGenerator, Literal +from typing import Literal import uvicorn from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request, status @@ -21,26 +22,58 @@ from starlette.routing import Mount from lmdeploy.archs import get_task -from lmdeploy.messages import (GenerationConfig, LogitsProcessor, PytorchEngineConfig, SpeculativeConfig, - TurbomindEngineConfig) +from lmdeploy.messages import ( + GenerationConfig, + LogitsProcessor, + PytorchEngineConfig, + SpeculativeConfig, + TurbomindEngineConfig, +) from lmdeploy.metrics.metrics_processor import metrics_processor from lmdeploy.model import ChatTemplateConfig from lmdeploy.pytorch.disagg.config import DistServeEngineConfig -from lmdeploy.pytorch.disagg.conn.protocol import (DistServeCacheFreeRequest, DistServeConnectionRequest, - DistServeDropConnectionRequest, DistServeInitRequest, - MigrationRequest) +from lmdeploy.pytorch.disagg.conn.protocol import ( + DistServeCacheFreeRequest, + DistServeConnectionRequest, + DistServeDropConnectionRequest, + DistServeInitRequest, + MigrationRequest, +) from lmdeploy.serve.core import AsyncEngine from lmdeploy.serve.openai.harmony_utils import GptOssChatParser -from lmdeploy.serve.openai.protocol import ChatCompletionResponse # noqa: E501 -from lmdeploy.serve.openai.protocol import (AbortRequest, ChatCompletionRequest, ChatCompletionResponseChoice, - ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, - ChatCompletionTokenLogprob, ChatMessage, ChoiceLogprobs, CompletionRequest, - CompletionResponse, CompletionResponseChoice, - CompletionResponseStreamChoice, CompletionStreamResponse, DeltaMessage, - EmbeddingsRequest, EncodeRequest, EncodeResponse, ErrorResponse, - GenerateReqInput, GenerateReqMetaOutput, GenerateReqOutput, LogProbs, - ModelCard, ModelList, ModelPermission, PoolingRequest, PoolingResponse, - TopLogprob, UpdateParamsRequest, UsageInfo) +from lmdeploy.serve.openai.protocol import ( + AbortRequest, + ChatCompletionRequest, + ChatCompletionResponse, # noqa: E501 + ChatCompletionResponseChoice, + ChatCompletionResponseStreamChoice, + ChatCompletionStreamResponse, + ChatCompletionTokenLogprob, + ChatMessage, + ChoiceLogprobs, + CompletionRequest, + CompletionResponse, + CompletionResponseChoice, + CompletionResponseStreamChoice, + CompletionStreamResponse, + DeltaMessage, + EmbeddingsRequest, + EncodeRequest, + EncodeResponse, + ErrorResponse, + GenerateReqInput, + GenerateReqMetaOutput, + GenerateReqOutput, + LogProbs, + ModelCard, + ModelList, + ModelPermission, + PoolingRequest, + PoolingResponse, + TopLogprob, + UpdateParamsRequest, + UsageInfo, +) from lmdeploy.serve.openai.reasoning_parser.reasoning_parser import ReasoningParser, ReasoningParserManager from lmdeploy.serve.openai.tool_parser.tool_parser import ToolParser, ToolParserManager from lmdeploy.serve.utils.server_utils import validate_json_request @@ -159,8 +192,8 @@ def _create_completion_logprobs(tokenizer: Tokenizer, Args: tokenizer (Tokenizer): tokenizer. - token_ids (List[int]): output token ids. - logprobs (List[Dict[int, float]]): the top logprobs for each output + token_ids (list[int]): output token ids. + logprobs (list[dict[int, float]]): the top logprobs for each output position. skip_special_tokens (bool): Whether or not to remove special tokens in the decoding. Default to be True. @@ -213,8 +246,8 @@ def _create_chat_completion_logprobs(tokenizer: Tokenizer, Args: tokenizer (Tokenizer): tokenizer. - token_ids (List[int]): output token ids. - logprobs (List[Dict[int, float]]): the top logprobs for each output + token_ids (list[int]): output token ids. + logprobs (list[dict[int, float]]): the top logprobs for each output position. Returns: ChoiceLogprobs: logprob result. @@ -318,7 +351,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque Deprecated: Use max_completion_tokens instead. - **repetition_penalty** (float): The parameter for repetition penalty. 1.0 means no penalty - - **stop** (str | List[str] | None): To stop generating further + - **stop** (str | list[str] | None): To stop generating further tokens. Only accept stop words that's encoded to one token idex. - **response_format** (dict | None): To generate response according to given schema. Examples: @@ -1413,13 +1446,13 @@ def serve(model_path: str, server_name (str): host ip for serving server_port (int): server port tp (int): tensor parallel - allow_origins (List[str]): a list of allowed origins for CORS + allow_origins (list[str]): a list of allowed origins for CORS allow_credentials (bool): whether to allow credentials for CORS - allow_methods (List[str]): a list of allowed HTTP methods for CORS - allow_headers (List[str]): a list of allowed HTTP headers for CORS + allow_methods (list[str]): a list of allowed HTTP methods for CORS + allow_headers (list[str]): a list of allowed HTTP headers for CORS log_level(str): set log level whose value among [CRITICAL, ERROR, WARNING, INFO, DEBUG] - api_keys (List[str] | str | None): Optional list of API keys. Accepts + api_keys (list[str] | str | None): Optional list of API keys. Accepts string type as a single api_key. Default to None, which means no api key applied. ssl (bool): Enable SSL. Requires OS Environment variables diff --git a/lmdeploy/serve/openai/harmony_utils.py b/lmdeploy/serve/openai/harmony_utils.py index ebd28ebb6f..2810725c0f 100644 --- a/lmdeploy/serve/openai/harmony_utils.py +++ b/lmdeploy/serve/openai/harmony_utils.py @@ -1,12 +1,17 @@ # Copyright (c) OpenMMLab. All rights reserved. # Modified from https://github.com/vllm-project/vllm/blob/v0.10.2rc1/vllm/entrypoints/harmony_utils.py -from typing import List import shortuuid from openai_harmony import HarmonyEncodingName, Role, StreamableParser, load_harmony_encoding -from lmdeploy.serve.openai.protocol import (ChatMessage, DeltaFunctionCall, DeltaMessage, DeltaToolCall, FunctionCall, - ToolCall) +from lmdeploy.serve.openai.protocol import ( + ChatMessage, + DeltaFunctionCall, + DeltaMessage, + DeltaToolCall, + FunctionCall, + ToolCall, +) _harmony_encoding = None @@ -27,7 +32,7 @@ class GptOssChatParser: def __init__(self): self.parser = get_streamable_parser_for_assistant() - def parse_streaming(self, tokens: List[int]) -> DeltaMessage: + def parse_streaming(self, tokens: list[int]) -> DeltaMessage: parser = self.parser delta_message = DeltaMessage(role='assistant') content = '' @@ -76,7 +81,7 @@ def parse_streaming(self, tokens: List[int]) -> DeltaMessage: delta_message.tool_calls = tool_calls return delta_message - def parse_full(self, tokens: List[int]) -> ChatMessage: + def parse_full(self, tokens: list[int]) -> ChatMessage: delta_message = self.parse_streaming(tokens) tool_calls = [] for delta_tool_call in delta_message.tool_calls: diff --git a/lmdeploy/serve/openai/launch_server.py b/lmdeploy/serve/openai/launch_server.py index 2d2fd56c3f..011180902d 100644 --- a/lmdeploy/serve/openai/launch_server.py +++ b/lmdeploy/serve/openai/launch_server.py @@ -6,7 +6,6 @@ import signal import socket import sys -from typing import List, Union from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig from lmdeploy.utils import get_logger @@ -16,7 +15,7 @@ logger = get_logger('lmdeploy') -def find_available_ports(num: int) -> List[int]: +def find_available_ports(num: int) -> list[int]: """Find available port.""" def __is_port_ok(port: int): @@ -47,7 +46,7 @@ def get_host_ip(): return ip -def _run_server(gpu_ids: List[int], model_path: str, **kwargs): +def _run_server(gpu_ids: list[int], model_path: str, **kwargs): """Launch a server process.""" cuda_visible_devices = ','.join([str(_) for _ in gpu_ids]) os.setpgrp() @@ -56,7 +55,7 @@ def _run_server(gpu_ids: List[int], model_path: str, **kwargs): serve(model_path, **kwargs) -def cleanup_processes(processes: List[mp.Process]): +def cleanup_processes(processes: list[mp.Process]): """Clean up server process.""" for process in processes: logger.info(f'Terminating process group {process.pid}') @@ -83,7 +82,7 @@ def cleanup_processes(processes: List[mp.Process]): def launch_server(num_nodes: int, node_rank: int, model_path: str, - backend_config: Union[PytorchEngineConfig, TurbomindEngineConfig], + backend_config: PytorchEngineConfig | TurbomindEngineConfig, proxy_url: str = None, **kwargs): """Run multiple server processes in dp mode.""" diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py index 5f3d252e7b..b6014b6a79 100644 --- a/lmdeploy/serve/openai/protocol.py +++ b/lmdeploy/serve/openai/protocol.py @@ -2,7 +2,7 @@ # Modified from # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py import time -from typing import Any, Dict, List, Literal, Optional, Union +from typing import Any, Literal import shortuuid from pydantic import BaseModel, ConfigDict, Field @@ -13,7 +13,7 @@ class ErrorResponse(BaseModel): message: str type: str code: int - param: Optional[str] = None + param: str | None = None object: str = 'error' @@ -29,7 +29,7 @@ class ModelPermission(BaseModel): allow_view: bool = True allow_fine_tuning: bool = False organization: str = '*' - group: Optional[str] = None + group: str | None = None is_blocking: bool = False @@ -39,29 +39,29 @@ class ModelCard(BaseModel): object: str = 'model' created: int = Field(default_factory=lambda: int(time.time())) owned_by: str = 'lmdeploy' - root: Optional[str] = None - parent: Optional[str] = None - permission: List[ModelPermission] = [] + root: str | None = None + parent: str | None = None + permission: list[ModelPermission] = [] class ModelList(BaseModel): """Model list consists of model cards.""" object: str = 'list' - data: List[ModelCard] = [] + data: list[ModelCard] = [] class UsageInfo(BaseModel): """Usage information.""" prompt_tokens: int = 0 total_tokens: int = 0 - completion_tokens: Optional[int] = 0 + completion_tokens: int | None = 0 class Function(BaseModel): """Function descriptions.""" - description: Optional[str] = Field(default=None, examples=[None]) + description: str | None = Field(default=None, examples=[None]) name: str - parameters: Optional[Dict[str, Any]] = None + parameters: dict[str, Any] | None = None class Tool(BaseModel): @@ -83,82 +83,82 @@ class ToolChoice(BaseModel): class StreamOptions(BaseModel): """The stream options.""" - include_usage: Optional[bool] = False + include_usage: bool | None = False class JsonSchema(BaseModel): name: str # description is not used since it depends on model - description: Optional[str] = None + description: str | None = None # `schema` is a reserved field in Pydantic BaseModel # use alias since pydantic does not support the OpenAI key `schema` - json_schema: Optional[Dict[str, Any]] = Field(default=None, alias='schema', examples=[None]) + json_schema: dict[str, Any] | None = Field(default=None, alias='schema', examples=[None]) # strict is not used - strict: Optional[bool] = False + strict: bool | None = False model_config = ConfigDict(serialize_by_alias=True) class ResponseFormat(BaseModel): # regex_schema is extended by lmdeploy to support regex output type: Literal['text', 'json_object', 'json_schema', 'regex_schema'] - json_schema: Optional[JsonSchema] = None - regex_schema: Optional[str] = None + json_schema: JsonSchema | None = None + regex_schema: str | None = None class ChatCompletionRequest(BaseModel): """Chat completion request.""" model: str - messages: Union[str, List[Dict[str, Any]]] = Field(examples=[[{'role': 'user', 'content': 'hi'}]]) - temperature: Optional[float] = 0.7 - top_p: Optional[float] = 1.0 - tools: Optional[List[Tool]] = Field(default=None, examples=[None]) - tool_choice: Union[ToolChoice, Literal['auto', 'required', 'none']] = Field(default='auto', examples=['none']) - logprobs: Optional[bool] = False - top_logprobs: Optional[int] = None - n: Optional[int] = 1 - logit_bias: Optional[Dict[str, float]] = Field(default=None, examples=[None]) - max_completion_tokens: Optional[int] = Field( + messages: str | list[dict[str, Any]] = Field(examples=[[{'role': 'user', 'content': 'hi'}]]) + temperature: float | None = 0.7 + top_p: float | None = 1.0 + tools: list[Tool] | None = Field(default=None, examples=[None]) + tool_choice: ToolChoice | Literal['auto', 'required', 'none'] = Field(default='auto', examples=['none']) + logprobs: bool | None = False + top_logprobs: int | None = None + n: int | None = 1 + logit_bias: dict[str, float] | None = Field(default=None, examples=[None]) + max_completion_tokens: int | None = Field( default=None, examples=[None], description=('An upper bound for the number of tokens that can be generated for a completion, ' 'including visible output tokens and reasoning tokens'), ) - max_tokens: Optional[int] = Field( + max_tokens: int | None = Field( default=None, examples=[None], deprecated='max_tokens is deprecated in favor of the max_completion_tokens field', ) - stop: Optional[Union[str, List[str]]] = Field(default=None, examples=[None]) - - stream: Optional[bool] = False - stream_options: Optional[StreamOptions] = Field(default=None, examples=[None]) - presence_penalty: Optional[float] = 0.0 - frequency_penalty: Optional[float] = 0.0 - user: Optional[str] = None - reasoning_effort: Optional[Literal['low', 'medium', 'high']] = None - response_format: Optional[ResponseFormat] = Field(default=None, examples=[None]) + stop: str | list[str] | None = Field(default=None, examples=[None]) + + stream: bool | None = False + stream_options: StreamOptions | None = Field(default=None, examples=[None]) + presence_penalty: float | None = 0.0 + frequency_penalty: float | None = 0.0 + user: str | None = None + reasoning_effort: Literal['low', 'medium', 'high'] | None = None + response_format: ResponseFormat | None = Field(default=None, examples=[None]) # additional argument of lmdeploy - do_preprocess: Optional[bool] = True - repetition_penalty: Optional[float] = 1.0 - session_id: Optional[int] = -1 - ignore_eos: Optional[bool] = False - skip_special_tokens: Optional[bool] = True - spaces_between_special_tokens: Optional[bool] = True - top_k: Optional[int] = 40 - seed: Optional[int] = None - min_new_tokens: Optional[int] = Field(default=None, examples=[None]) + do_preprocess: bool | None = True + repetition_penalty: float | None = 1.0 + session_id: int | None = -1 + ignore_eos: bool | None = False + skip_special_tokens: bool | None = True + spaces_between_special_tokens: bool | None = True + top_k: int | None = 40 + seed: int | None = None + min_new_tokens: int | None = Field(default=None, examples=[None]) min_p: float = 0.0 - enable_thinking: Optional[bool] = None # will be deprecated in the future - return_token_ids: Optional[bool] = False - include_stop_str_in_output: Optional[bool] = False + enable_thinking: bool | None = None # will be deprecated in the future + return_token_ids: bool | None = False + include_stop_str_in_output: bool | None = False chat_template_kwargs: dict[str, Any] | None = Field( default=None, description=('Additional keyword args to pass to the template renderer. ' 'Will be accessible by the chat template.'), ) # kwargs for hf processor - mm_processor_kwargs: Optional[dict[str, Any]] = Field( + mm_processor_kwargs: dict[str, Any] | None = Field( default=None, description=('Additional kwargs to pass to the HF processor'), ) @@ -182,51 +182,51 @@ class ExtractedToolCallInformation(BaseModel): # indicate if tools were called tools_called: bool # extracted tool calls - tool_calls: List[ToolCall] + tool_calls: list[ToolCall] # content - per OpenAI spec, content AND tool calls can be returned rarely # But some models will do this intentionally - content: Optional[str] = None + content: str | None = None class ChatMessage(BaseModel): """Chat messages.""" role: str - content: Optional[str] = None - gen_tokens: Optional[List[int]] = None - reasoning_content: Optional[str] = Field(default=None, examples=[None]) - tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None]) + content: str | None = None + gen_tokens: list[int] | None = None + reasoning_content: str | None = Field(default=None, examples=[None]) + tool_calls: list[ToolCall] | None = Field(default=None, examples=[None]) class LogProbs(BaseModel): - text_offset: List[int] = Field(default_factory=list) - token_logprobs: List[Optional[float]] = Field(default_factory=list) - tokens: List[str] = Field(default_factory=list) - top_logprobs: Optional[List[Optional[Dict[str, float]]]] = None + text_offset: list[int] = Field(default_factory=list) + token_logprobs: list[float | None] = Field(default_factory=list) + tokens: list[str] = Field(default_factory=list) + top_logprobs: list[dict[str, float] | None] | None = None class TopLogprob(BaseModel): token: str - bytes: Optional[List[int]] = None + bytes: list[int] | None = None logprob: float class ChatCompletionTokenLogprob(BaseModel): token: str - bytes: Optional[List[int]] = None + bytes: list[int] | None = None logprob: float - top_logprobs: List[TopLogprob] + top_logprobs: list[TopLogprob] class ChoiceLogprobs(BaseModel): - content: Optional[List[ChatCompletionTokenLogprob]] = None + content: list[ChatCompletionTokenLogprob] | None = None class ChatCompletionResponseChoice(BaseModel): """Chat completion response choices.""" index: int message: ChatMessage - logprobs: Optional[ChoiceLogprobs] = None - finish_reason: Optional[Literal['stop', 'length', 'tool_calls', 'error', 'abort']] = None + logprobs: ChoiceLogprobs | None = None + finish_reason: Literal['stop', 'length', 'tool_calls', 'error', 'abort'] | None = None class ChatCompletionResponse(BaseModel): @@ -235,13 +235,13 @@ class ChatCompletionResponse(BaseModel): object: str = 'chat.completion' created: int = Field(default_factory=lambda: int(time.time())) model: str - choices: List[ChatCompletionResponseChoice] + choices: list[ChatCompletionResponseChoice] usage: UsageInfo class DeltaFunctionCall(BaseModel): - name: Optional[str] = None - arguments: Optional[str] = None + name: str | None = None + arguments: str | None = None # a tool call delta where everything is optional @@ -249,24 +249,24 @@ class DeltaToolCall(BaseModel): id: str = Field(default_factory=lambda: f'chatcmpl-tool-{shortuuid.random()}') type: Literal['function'] = 'function' index: int - function: Optional[DeltaFunctionCall] = None + function: DeltaFunctionCall | None = None class DeltaMessage(BaseModel): """Delta messages.""" - role: Optional[str] = None - content: Optional[str] = None - reasoning_content: Optional[str] = None - gen_tokens: Optional[List[int]] = None - tool_calls: List[DeltaToolCall] = Field(default_factory=list) + role: str | None = None + content: str | None = None + reasoning_content: str | None = None + gen_tokens: list[int] | None = None + tool_calls: list[DeltaToolCall] = Field(default_factory=list) class ChatCompletionResponseStreamChoice(BaseModel): """Chat completion response stream choice.""" index: int delta: DeltaMessage - logprobs: Optional[ChoiceLogprobs] = None - finish_reason: Optional[Literal['stop', 'length', 'tool_calls', 'error', 'abort']] = None + logprobs: ChoiceLogprobs | None = None + finish_reason: Literal['stop', 'length', 'tool_calls', 'error', 'abort'] | None = None class ChatCompletionStreamResponse(BaseModel): @@ -275,56 +275,56 @@ class ChatCompletionStreamResponse(BaseModel): object: str = 'chat.completion.chunk' created: int = Field(default_factory=lambda: int(time.time())) model: str - choices: List[ChatCompletionResponseStreamChoice] - usage: Optional[UsageInfo] = None + choices: list[ChatCompletionResponseStreamChoice] + usage: UsageInfo | None = None class CompletionRequest(BaseModel): """Completion request.""" model: str - prompt: Union[str, List[Any]] - suffix: Optional[str] = None - temperature: Optional[float] = 0.7 - n: Optional[int] = 1 - logprobs: Optional[int] = None - max_completion_tokens: Optional[int] = Field( + prompt: str | list[Any] + suffix: str | None = None + temperature: float | None = 0.7 + n: int | None = 1 + logprobs: int | None = None + max_completion_tokens: int | None = Field( default=None, examples=[None], description=('An upper bound for the number of tokens that can be generated for a completion, ' 'including visible output tokens and reasoning tokens'), ) - max_tokens: Optional[int] = Field( + max_tokens: int | None = Field( default=16, examples=[16], deprecated='max_tokens is deprecated in favor of the max_completion_tokens field', ) - stop: Optional[Union[str, List[str]]] = Field(default=None, examples=[None]) - stream: Optional[bool] = False - stream_options: Optional[StreamOptions] = Field(default=None, examples=[None]) - top_p: Optional[float] = 1.0 - echo: Optional[bool] = False - presence_penalty: Optional[float] = 0.0 - frequency_penalty: Optional[float] = 0.0 - user: Optional[str] = None + stop: str | list[str] | None = Field(default=None, examples=[None]) + stream: bool | None = False + stream_options: StreamOptions | None = Field(default=None, examples=[None]) + top_p: float | None = 1.0 + echo: bool | None = False + presence_penalty: float | None = 0.0 + frequency_penalty: float | None = 0.0 + user: str | None = None # additional argument of lmdeploy - repetition_penalty: Optional[float] = 1.0 - session_id: Optional[int] = -1 - ignore_eos: Optional[bool] = False - skip_special_tokens: Optional[bool] = True - spaces_between_special_tokens: Optional[bool] = True - top_k: Optional[int] = 40 # for opencompass - seed: Optional[int] = None + repetition_penalty: float | None = 1.0 + session_id: int | None = -1 + ignore_eos: bool | None = False + skip_special_tokens: bool | None = True + spaces_between_special_tokens: bool | None = True + top_k: int | None = 40 # for opencompass + seed: int | None = None min_p: float = 0.0 - return_token_ids: Optional[bool] = False + return_token_ids: bool | None = False class CompletionResponseChoice(BaseModel): """Completion response choices.""" index: int text: str - logprobs: Optional[LogProbs] = None - gen_tokens: Optional[List[int]] = None - finish_reason: Optional[Literal['stop', 'length', 'tool_calls', 'error', 'abort']] = None + logprobs: LogProbs | None = None + gen_tokens: list[int] | None = None + finish_reason: Literal['stop', 'length', 'tool_calls', 'error', 'abort'] | None = None class CompletionResponse(BaseModel): @@ -333,7 +333,7 @@ class CompletionResponse(BaseModel): object: str = 'text_completion' created: int = Field(default_factory=lambda: int(time.time())) model: str - choices: List[CompletionResponseChoice] + choices: list[CompletionResponseChoice] usage: UsageInfo @@ -341,9 +341,9 @@ class CompletionResponseStreamChoice(BaseModel): """Completion response stream choice.""" index: int text: str - logprobs: Optional[LogProbs] = None - gen_tokens: Optional[List[int]] = None - finish_reason: Optional[Literal['stop', 'length', 'tool_calls', 'error', 'abort']] = None + logprobs: LogProbs | None = None + gen_tokens: list[int] | None = None + finish_reason: Literal['stop', 'length', 'tool_calls', 'error', 'abort'] | None = None class CompletionStreamResponse(BaseModel): @@ -352,21 +352,21 @@ class CompletionStreamResponse(BaseModel): object: str = 'text_completion' created: int = Field(default_factory=lambda: int(time.time())) model: str - choices: List[CompletionResponseStreamChoice] - usage: Optional[UsageInfo] = None + choices: list[CompletionResponseStreamChoice] + usage: UsageInfo | None = None class EmbeddingsRequest(BaseModel): """Embedding request.""" model: str = None - input: Union[str, List[str]] - user: Optional[str] = None + input: str | list[str] + user: str | None = None class EmbeddingsResponse(BaseModel): """Embedding response.""" object: str = 'list' - data: List[Dict[str, Any]] + data: list[dict[str, Any]] model: str usage: UsageInfo @@ -381,11 +381,11 @@ class PoolingRequest(BaseModel): https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/protocol.py#L1174 https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/entrypoints/http_server.py#L383 """ - model: Optional[str] = None - input: Union[List[int], List[List[int]], str, List[str]] + model: str | None = None + input: list[int] | list[list[int]] | str | list[str] encoding_format: Literal['float', 'base64'] = 'float' - dimensions: Optional[int] = None - user: Optional[str] = None + dimensions: int | None = None + user: str | None = None class PoolingResponse(BaseModel): @@ -394,21 +394,21 @@ class PoolingResponse(BaseModel): object: str = 'list' created: int = Field(default_factory=lambda: int(time.time())) model: str = None - data: List[Dict[str, Any]] + data: list[dict[str, Any]] usage: UsageInfo class EncodeRequest(BaseModel): """Encode request.""" - input: Union[str, List[str]] - do_preprocess: Optional[bool] = False - add_bos: Optional[bool] = True + input: str | list[str] + do_preprocess: bool | None = False + add_bos: bool | None = True class EncodeResponse(BaseModel): """Encode response.""" - input_ids: Union[List[int], List[List[int]]] - length: Union[int, List[int]] + input_ids: list[int] | list[list[int]] + length: int | list[int] class GenerateResponse(BaseModel): @@ -417,63 +417,63 @@ class GenerateResponse(BaseModel): tokens: int input_tokens: int history_tokens: int - finish_reason: Optional[Literal['stop', 'length', 'tool_calls', 'error', 'abort']] = None + finish_reason: Literal['stop', 'length', 'tool_calls', 'error', 'abort'] | None = None class UpdateParamsRequest(BaseModel): """Update weights request.""" - serialized_named_tensors: Union[str, List[str], Dict] - load_format: Optional[str] = None # 'flattened_bucket' or None + serialized_named_tensors: str | list[str] | dict + load_format: str | None = None # 'flattened_bucket' or None finished: bool = False # str for url/base64, base64 should be data:image/jpeg;base64, dict should be {'url': url/base64, 'options': ...} -ImageDataInputItem = Union[str, Dict] -ImageDataFormat = Union[ImageDataInputItem, List[ImageDataInputItem]] +ImageDataInputItem = str | dict +ImageDataFormat = ImageDataInputItem | list[ImageDataInputItem] # /generate input class GenerateReqInput(BaseModel): - session_id: Optional[int] = -1 - prompt: Optional[str] = None - input_ids: Optional[List[int]] = None - image_data: Optional[ImageDataFormat] = None - return_logprob: Optional[bool] = None + session_id: int | None = -1 + prompt: str | None = None + input_ids: list[int] | None = None + image_data: ImageDataFormat | None = None + return_logprob: bool | None = None max_tokens: int = 128 - stop: Optional[Union[str, List[str]]] = None - stop_token_ids: Optional[List[int]] = None - stream: Optional[bool] = False + stop: str | list[str] | None = None + stop_token_ids: list[int] | None = None + stream: bool | None = False temperature: float = 1.0 - repetition_penalty: Optional[float] = 1.0 - ignore_eos: Optional[bool] = False + repetition_penalty: float | None = 1.0 + ignore_eos: bool | None = False top_p: float = 1.0 top_k: int = 0 min_p: float = 0.0 - skip_special_tokens: Optional[bool] = True - spaces_between_special_tokens: Optional[bool] = True - include_stop_str_in_output: Optional[bool] = False - return_routed_experts: Optional[bool] = False + skip_special_tokens: bool | None = True + spaces_between_special_tokens: bool | None = True + include_stop_str_in_output: bool | None = False + return_routed_experts: bool | None = False repetition_ngram_size: int = 0 repetition_ngram_threshold: int = 0 # kwargs for hf processor - mm_processor_kwargs: Optional[dict[str, Any]] = Field( + mm_processor_kwargs: dict[str, Any] | None = Field( default=None, description=('Additional kwargs to pass to the HF processor'), ) class GenerateReqMetaOutput(BaseModel): - prompt_tokens: Optional[int] = None - completion_tokens: Optional[int] = None - finish_reason: Optional[Dict[str, Any]] = None - output_token_logprobs: Optional[List[tuple[float, int]]] = None # (logprob, token_id) - routed_experts: Optional[Union[List[List[List[int]]], str]] = None # (num_token, num_layer, topk_expert) + prompt_tokens: int | None = None + completion_tokens: int | None = None + finish_reason: dict[str, Any] | None = None + output_token_logprobs: list[tuple[float, int]] | None = None # (logprob, token_id) + routed_experts: list[list[list[int]]] | str | None = None # (num_token, num_layer, topk_expert) # /generate output class GenerateReqOutput(BaseModel): text: str - output_ids: List[int] + output_ids: list[int] meta_info: GenerateReqMetaOutput @@ -481,7 +481,7 @@ class AbortRequest(BaseModel): # Whether to abort all requests abort_all: bool = False # The finished reason data - finished_reason: Optional[Dict[str, Any]] = None - abort_message: Optional[str] = None + finished_reason: dict[str, Any] | None = None + abort_message: str | None = None # The session ID to abort. If `abort_all` is True, this field is ignored. - session_id: Optional[int] = -1 + session_id: int | None = -1 diff --git a/lmdeploy/serve/openai/reasoning_parser/deepseek_r1_reasoning_parser.py b/lmdeploy/serve/openai/reasoning_parser/deepseek_r1_reasoning_parser.py index a6b7e3a602..d2392648e4 100644 --- a/lmdeploy/serve/openai/reasoning_parser/deepseek_r1_reasoning_parser.py +++ b/lmdeploy/serve/openai/reasoning_parser/deepseek_r1_reasoning_parser.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. # modified from https://github.com/vllm-project/vllm/tree/v0.7.3/vllm/entrypoints/openai/reasoning_parsers import re -from typing import Optional, Sequence, Tuple, Union +from collections.abc import Sequence from lmdeploy.serve.openai.protocol import ChatCompletionRequest, DeltaMessage @@ -42,7 +42,7 @@ def extract_reasoning_content_streaming( current_token_ids: Sequence[int], delta_token_ids: Sequence[int], **kwargs, - ) -> Union[DeltaMessage, None]: + ) -> DeltaMessage | None: """Instance method that should be implemented for extracting reasoning from an incomplete response; for use when handling reasoning calls and streaming. @@ -105,7 +105,7 @@ def extract_reasoning_content_streaming( return DeltaMessage(reasoning_content=delta_text) def extract_reasoning_content(self, model_output: str, request: ChatCompletionRequest, - **kwargs) -> Tuple[Optional[str], Optional[str]]: + **kwargs) -> tuple[str | None, str | None]: """Extract reasoning content from a complete model-generated string. Used for non-streaming responses where we have the entire model response diff --git a/lmdeploy/serve/openai/reasoning_parser/qwen_qwq_reasoning_parser.py b/lmdeploy/serve/openai/reasoning_parser/qwen_qwq_reasoning_parser.py index 3d5b792dc1..63f35d76e6 100644 --- a/lmdeploy/serve/openai/reasoning_parser/qwen_qwq_reasoning_parser.py +++ b/lmdeploy/serve/openai/reasoning_parser/qwen_qwq_reasoning_parser.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import re -from typing import Optional, Sequence, Tuple, Union +from collections.abc import Sequence from lmdeploy.serve.openai.protocol import ChatCompletionRequest, DeltaMessage @@ -35,7 +35,7 @@ def extract_reasoning_content_streaming( current_token_ids: Sequence[int], delta_token_ids: Sequence[int], **kwargs, - ) -> Union[DeltaMessage, None]: + ) -> DeltaMessage | None: """Instance method that should be implemented for extracting reasoning from an incomplete response; for use when handling reasoning calls and streaming. @@ -95,7 +95,7 @@ def extract_reasoning_content_streaming( return DeltaMessage(reasoning_content=delta_text) def extract_reasoning_content(self, model_output: str, request: ChatCompletionRequest, - **kwargs) -> Tuple[Optional[str], Optional[str]]: + **kwargs) -> tuple[str | None, str | None]: """Extract reasoning content from a complete model-generated string. Used for non-streaming responses where we have the entire model response diff --git a/lmdeploy/serve/openai/reasoning_parser/reasoning_parser.py b/lmdeploy/serve/openai/reasoning_parser/reasoning_parser.py index f224dba0a5..7abb62069d 100644 --- a/lmdeploy/serve/openai/reasoning_parser/reasoning_parser.py +++ b/lmdeploy/serve/openai/reasoning_parser/reasoning_parser.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. # modified from https://github.com/vllm-project/vllm/tree/v0.7.3/vllm/entrypoints/openai/reasoning_parsers +from collections.abc import Sequence from functools import cached_property -from typing import Dict, Optional, Sequence, Tuple, Union from mmengine import Registry @@ -16,7 +16,7 @@ def __init__(self, tokenizer: object): self.model_tokenizer = tokenizer @cached_property - def vocab(self) -> Dict[str, int]: + def vocab(self) -> dict[str, int]: # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab # whereas all tokenizers have .get_vocab() return self.model_tokenizer.get_vocab() @@ -30,7 +30,7 @@ def extract_reasoning_content_streaming( current_token_ids: Sequence[int], delta_token_ids: Sequence[int], **kwargs, - ) -> Union[DeltaMessage, None]: + ) -> DeltaMessage | None: """Instance method that should be implemented for extracting reasoning from an incomplete response; for use when handling reasoning calls and streaming. @@ -42,7 +42,7 @@ def extract_reasoning_content_streaming( 'has not been implemented!') def extract_reasoning_content(self, model_output: str, request: ChatCompletionRequest, - **kwargs) -> Tuple[Optional[str], Optional[str]]: + **kwargs) -> tuple[str | None, str | None]: """Extract reasoning content from a complete model-generated string. Used for non-streaming responses where we have the entire model response diff --git a/lmdeploy/serve/openai/tool_parser/internlm2_parser.py b/lmdeploy/serve/openai/tool_parser/internlm2_parser.py index e104511d76..89e2bb471e 100644 --- a/lmdeploy/serve/openai/tool_parser/internlm2_parser.py +++ b/lmdeploy/serve/openai/tool_parser/internlm2_parser.py @@ -1,14 +1,21 @@ # Copyright (c) OpenMMLab. All rights reserved. # modified from https://github.com/vllm-project/vllm/tree/v0.7.3/vllm/entrypoints/openai/tool_parsers import json -from typing import Dict, Sequence, Union +from collections.abc import Sequence import partial_json_parser import shortuuid from partial_json_parser.core.options import Allow -from lmdeploy.serve.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, - ExtractedToolCallInformation, FunctionCall, ToolCall) +from lmdeploy.serve.openai.protocol import ( + ChatCompletionRequest, + DeltaFunctionCall, + DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, + ToolCall, +) from lmdeploy.utils import get_logger from .tool_parser import ToolParser, ToolParserManager @@ -48,7 +55,7 @@ def extract_tool_calls_streaming( current_token_ids: Sequence[int], delta_token_ids: Sequence[int], request: ChatCompletionRequest, - ) -> Union[DeltaMessage, None]: + ) -> DeltaMessage | None: if '<|action_start|>' not in current_text: self.position = len(current_text) return DeltaMessage(content=delta_text) @@ -84,7 +91,7 @@ def extract_tool_calls_streaming( # tool calls are generated in an object in inernlm2 # it's not support parallel tool calls try: - tool_call_arr: Dict = partial_json_parser.loads(parsable_arr, flags) + tool_call_arr: dict = partial_json_parser.loads(parsable_arr, flags) except partial_json_parser.core.exceptions.MalformedJSON: logger.debug('not enough tokens to parse into JSON yet') return None diff --git a/lmdeploy/serve/openai/tool_parser/llama3_parser.py b/lmdeploy/serve/openai/tool_parser/llama3_parser.py index 1c4eaf35d6..445cad312f 100644 --- a/lmdeploy/serve/openai/tool_parser/llama3_parser.py +++ b/lmdeploy/serve/openai/tool_parser/llama3_parser.py @@ -1,14 +1,21 @@ # Copyright (c) OpenMMLab. All rights reserved. import json import re -from typing import Dict, List, Sequence, Union +from collections.abc import Sequence import partial_json_parser import shortuuid from partial_json_parser.core.options import Allow -from lmdeploy.serve.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, - ExtractedToolCallInformation, FunctionCall, ToolCall) +from lmdeploy.serve.openai.protocol import ( + ChatCompletionRequest, + DeltaFunctionCall, + DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, + ToolCall, +) from lmdeploy.utils import get_logger from .tool_parser import ToolParser, ToolParserManager @@ -30,10 +37,10 @@ def __init__(self, tokenizer: object): # initialize properties used for state when parsing tool calls in # streaming mode - self.prev_tool_call_arr: List[Dict] = [] + self.prev_tool_call_arr: list[dict] = [] self.current_tool_id: int = -1 self.current_tool_name_sent: bool = False - self.streamed_args_for_tool: List[str] = [] # map what has been streamed for each tool so far to a list + self.streamed_args_for_tool: list[str] = [] # map what has been streamed for each tool so far to a list self.bot_token = '<|python_tag|>' self.bot_token_id = tokenizer.encode(self.bot_token, add_special_tokens=False)[0] self.tool_call_regex = re.compile(r'\[{.*?}\]', re.DOTALL) @@ -48,7 +55,7 @@ def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest) name = action.split('{')[0] call_info_list = [(name, parameters)] - tool_calls: List[ToolCall] = [ + tool_calls: list[ToolCall] = [ ToolCall(type='function', function=FunctionCall(name=name, arguments=arguments)) for name, arguments in call_info_list ] @@ -71,7 +78,7 @@ def extract_tool_calls_streaming( current_token_ids: Sequence[int], delta_token_ids: Sequence[int], request: ChatCompletionRequest, - ) -> Union[DeltaMessage, None]: + ) -> DeltaMessage | None: if not (current_text.startswith(self.bot_token) or current_text.startswith('{')): return DeltaMessage(content=delta_text) @@ -105,7 +112,7 @@ def extract_tool_calls_streaming( return None # select as the current tool call the one we're on the state at - current_tool_call: Dict = tool_call_arr[self.current_tool_id] \ + current_tool_call: dict = tool_call_arr[self.current_tool_id] \ if len(tool_call_arr) > 0 else {} # case -- if no tokens have been streamed for the tool, e.g. diff --git a/lmdeploy/serve/openai/tool_parser/qwen2d5_parser.py b/lmdeploy/serve/openai/tool_parser/qwen2d5_parser.py index 9cd68b04e4..eb87d1f97a 100644 --- a/lmdeploy/serve/openai/tool_parser/qwen2d5_parser.py +++ b/lmdeploy/serve/openai/tool_parser/qwen2d5_parser.py @@ -1,14 +1,21 @@ # Copyright (c) OpenMMLab. All rights reserved. import json import re -from typing import Dict, Sequence, Union +from collections.abc import Sequence import partial_json_parser import shortuuid from partial_json_parser.core.options import Allow -from lmdeploy.serve.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, - ExtractedToolCallInformation, FunctionCall, ToolCall) +from lmdeploy.serve.openai.protocol import ( + ChatCompletionRequest, + DeltaFunctionCall, + DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, + ToolCall, +) from lmdeploy.utils import get_logger from .tool_parser import ToolParser, ToolParserManager @@ -43,7 +50,7 @@ def extract_tool_calls_streaming( current_token_ids: Sequence[int], delta_token_ids: Sequence[int], request: ChatCompletionRequest, - ) -> Union[DeltaMessage, None]: + ) -> DeltaMessage | None: if self.tool_start_token not in current_text: self.position = len(current_text) return DeltaMessage(content=delta_text) @@ -79,7 +86,7 @@ def extract_tool_calls_streaming( # tool calls are generated in an object in inernlm2 # it's not support parallel tool calls try: - tool_call_arr: Dict = partial_json_parser.loads(parsable_arr, flags) + tool_call_arr: dict = partial_json_parser.loads(parsable_arr, flags) except partial_json_parser.core.exceptions.MalformedJSON: logger.debug('not enough tokens to parse into JSON yet') return None diff --git a/lmdeploy/serve/openai/tool_parser/qwen3_parser.py b/lmdeploy/serve/openai/tool_parser/qwen3_parser.py index f1a9635d6c..4b04410461 100644 --- a/lmdeploy/serve/openai/tool_parser/qwen3_parser.py +++ b/lmdeploy/serve/openai/tool_parser/qwen3_parser.py @@ -1,13 +1,20 @@ # Copyright (c) OpenMMLab. All rights reserved. import json import re +from collections.abc import Sequence from dataclasses import dataclass -from typing import Dict, Optional, Sequence, Union import shortuuid -from lmdeploy.serve.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, - ExtractedToolCallInformation, FunctionCall, ToolCall) +from lmdeploy.serve.openai.protocol import ( + ChatCompletionRequest, + DeltaFunctionCall, + DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, + ToolCall, +) from lmdeploy.utils import get_logger from .tool_parser import ToolParser, ToolParserManager @@ -16,7 +23,7 @@ @dataclass -class ParserState(object): +class ParserState: """Maintains the state of parsing during tool call extraction.""" position: int = 0 # Current position in the text being parsed current_index: int = -1 # Index of the current tool call @@ -77,14 +84,14 @@ def _split(self, parser_state: ParserState, parsing_content: str): parser_state.position += (end_idx - start_idx) + len(self.tool_end_token) return parsing_content[:start_idx], parsing_content[start_idx + len(self.tool_start_token):end_idx], True - def _parse_delta_tool_call(self, parser_state: ParserState, tool_content: str) -> Optional[DeltaToolCall]: + def _parse_delta_tool_call(self, parser_state: ParserState, tool_content: str) -> DeltaToolCall | None: """Parse tool content into a DeltaToolCall object. This method handles parsing tool calls only when it's a valid tool """ parsable_arr = tool_content.strip() try: - tool_call_arr: Dict = json.loads(parsable_arr) + tool_call_arr: dict = json.loads(parsable_arr) except json.JSONDecodeError: logger.debug('cannot parse into JSON yet') return @@ -119,7 +126,7 @@ def extract_tool_calls_streaming( current_token_ids: Sequence[int], delta_token_ids: Sequence[int], request: ChatCompletionRequest, - ) -> Union[DeltaMessage, None]: + ) -> DeltaMessage | None: """Extract tool calls from streaming model output. This method processes incremental model output to extract tool calls, reasoning content, and regular text diff --git a/lmdeploy/serve/openai/tool_parser/qwen3coder_parser.py b/lmdeploy/serve/openai/tool_parser/qwen3coder_parser.py index 24ee53c7a8..fad17871fd 100644 --- a/lmdeploy/serve/openai/tool_parser/qwen3coder_parser.py +++ b/lmdeploy/serve/openai/tool_parser/qwen3coder_parser.py @@ -1,13 +1,21 @@ # Copyright (c) OpenMMLab. All rights reserved. import json import re +from collections.abc import Sequence from dataclasses import dataclass -from typing import Any, Dict, Optional, Sequence, Tuple, Union +from typing import Any import shortuuid -from lmdeploy.serve.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, - ExtractedToolCallInformation, FunctionCall, ToolCall) +from lmdeploy.serve.openai.protocol import ( + ChatCompletionRequest, + DeltaFunctionCall, + DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, + ToolCall, +) from lmdeploy.utils import get_logger from .tool_parser import ToolParser, ToolParserManager @@ -16,7 +24,7 @@ @dataclass -class ParserState(object): +class ParserState: """Maintains the state of parsing during tool call extraction.""" position: int = 0 # Current position in the text being parsed current_index: int = -1 # Index of the current tool call @@ -48,7 +56,7 @@ def __init__(self, tokenizer: object): self.tool_call_pat = re.compile(r'\n*(.*?)', re.DOTALL) - def _split(self, parser_state: ParserState, parsing_content: str) -> Tuple[str, str, bool]: + def _split(self, parser_state: ParserState, parsing_content: str) -> tuple[str, str, bool]: """Split content into tuple: (text_content, tool_content, has_tool_end)""" try: start_idx = parsing_content.index(self.tool_start_token) @@ -66,7 +74,7 @@ def _split(self, parser_state: ParserState, parsing_content: str) -> Tuple[str, parser_state.position += rem + len(self.tool_end_token) return parsing_content[:start_idx], parsing_content[start_idx:end_idx + len(self.tool_end_token)], True - def _extract_params(self, content: str) -> Tuple[Optional[str], Dict[str, Any], bool]: + def _extract_params(self, content: str) -> tuple[str | None, dict[str, Any], bool]: """Parse XML tool content into components.""" content = content.replace(self.tool_start_token, '').replace(self.tool_end_token, '').strip() @@ -126,7 +134,7 @@ def extract_tool_calls_streaming( current_token_ids: Sequence[int], delta_token_ids: Sequence[int], request: ChatCompletionRequest, - ) -> Union[DeltaMessage, None]: + ) -> DeltaMessage | None: parser_state = getattr(request, '_tool_parser_state', None) if parser_state is None: diff --git a/lmdeploy/serve/openai/tool_parser/tool_parser.py b/lmdeploy/serve/openai/tool_parser/tool_parser.py index 89ed8091ce..f919d33ef7 100644 --- a/lmdeploy/serve/openai/tool_parser/tool_parser.py +++ b/lmdeploy/serve/openai/tool_parser/tool_parser.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. # modified from https://github.com/vllm-project/vllm/tree/v0.7.3/vllm/entrypoints/openai/tool_parsers +from collections.abc import Sequence from functools import cached_property -from typing import Dict, List, Sequence, Union from mmengine import Registry @@ -19,16 +19,16 @@ class ToolParser: """ def __init__(self, tokenizer: object): - self.prev_tool_call_arr: List[Dict] = [] + self.prev_tool_call_arr: list[dict] = [] # the index of the tool call that is currently being parsed self.current_tool_id: int = -1 self.current_tool_name_sent: bool = False - self.streamed_args_for_tool: List[str] = [] + self.streamed_args_for_tool: list[str] = [] self.model_tokenizer = tokenizer @cached_property - def vocab(self) -> Dict[str, int]: + def vocab(self) -> dict[str, int]: # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab # whereas all tokenizers have .get_vocab() return self.model_tokenizer.get_vocab() @@ -55,7 +55,7 @@ def extract_tool_calls_streaming( current_token_ids: Sequence[int], delta_token_ids: Sequence[int], request: ChatCompletionRequest, - ) -> Union[DeltaMessage, None]: + ) -> DeltaMessage | None: """Instance method that should be implemented for extracting tool calls from an incomplete response; for use when handling tool calls and streaming. diff --git a/lmdeploy/serve/openai/tool_parser/utils.py b/lmdeploy/serve/openai/tool_parser/utils.py index a97dc393aa..bee4728d8c 100644 --- a/lmdeploy/serve/openai/tool_parser/utils.py +++ b/lmdeploy/serve/openai/tool_parser/utils.py @@ -3,7 +3,7 @@ import json from json import JSONDecodeError, JSONDecoder -from typing import Any, List, Tuple +from typing import Any import partial_json_parser from partial_json_parser.core.options import Allow @@ -77,7 +77,7 @@ def extract_intermediate_diff(curr: str, old: str) -> str: return diff -def find_all_indices(string: str, substring: str) -> List[int]: +def find_all_indices(string: str, substring: str) -> list[int]: """Find all (starting) indices of a substring in a given string. Useful for tool call extraction @@ -94,7 +94,7 @@ def find_all_indices(string: str, substring: str) -> List[int]: # partial_json_parser doesn't support extra data and # JSONDecorder.raw_decode doesn't support partial JSON -def partial_json_loads(input_str: str, flags: Allow) -> Tuple[Any, int]: +def partial_json_loads(input_str: str, flags: Allow) -> tuple[Any, int]: try: return (partial_json_parser.loads(input_str, flags), len(input_str)) except JSONDecodeError as e: diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py index 2ea599e687..8505448d1b 100644 --- a/lmdeploy/serve/processors/multimodal.py +++ b/lmdeploy/serve/processors/multimodal.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import asyncio -from typing import Any, Dict, List, Literal, Tuple +from typing import Any, Literal import PIL @@ -34,7 +34,7 @@ def __init__(self, self.backend = backend @staticmethod - def merge_message_content(msg: Dict) -> Dict: + def merge_message_content(msg: dict) -> dict: """Merge multimodal content blocks and ensure content field exists. This function normalizes message content to match vLLM's behavior: @@ -85,14 +85,14 @@ def merge_message_content(msg: Dict) -> Dict: return result @staticmethod - async def async_convert_multimodal_data(messages: List[Dict]) -> List[Dict]: + async def async_convert_multimodal_data(messages: list[dict]) -> list[dict]: """Convert user-input multimodal data into GPT4V message format.""" from lmdeploy.vl.time_series_utils import load_time_series from lmdeploy.vl.utils import load_image - if isinstance(messages, Dict): + if isinstance(messages, dict): messages = [messages] - assert isinstance(messages, List) + assert isinstance(messages, list) out_messages = [None] * len(messages) @@ -108,7 +108,7 @@ def _inner_call(i, in_messages, out_messages): return # the role is a user and the content is a list, in which there # might be image_url or image_data - assert isinstance(content, List) + assert isinstance(content, list) message = dict(role=role, content=[]) for item in content: # image url or base64-encoded image data @@ -205,14 +205,14 @@ def _inner_call(i, in_messages, out_messages): return out_messages async def get_prompt_input(self, - prompt: str | List[Dict], + prompt: str | list[dict], do_preprocess: bool, sequence_start: bool, adapter_name: str, - tools: List[object] | None = None, + tools: list[object] | None = None, reasoning_effort: Literal['low', 'medium', 'high'] | None = None, - chat_template_kwargs: Dict | None = None, - mm_processor_kwargs: Dict[str, Any] | None = None, + chat_template_kwargs: dict | None = None, + mm_processor_kwargs: dict[str, Any] | None = None, **kwargs): """Process prompt and return prompt string and input_ids. @@ -231,7 +231,7 @@ async def get_prompt_input(self, **kwargs: Additional keyword arguments. Returns: - Dict with 'prompt' (str) and 'input_ids' (List[int]) keys for text-only, + dict with 'prompt' (str) and 'input_ids' (list[int]) keys for text-only, or dict with multimodal data for multimodal prompts. """ # Handle string input @@ -274,7 +274,7 @@ async def get_prompt_input(self, raise RuntimeError(f'unsupported prompt type: {type(prompt)}') @staticmethod - def format_prompts(prompts: Any) -> List[Dict]: + def format_prompts(prompts: Any) -> list[dict]: """Format prompts.""" if not isinstance(prompts, list): prompts = [prompts] @@ -318,7 +318,7 @@ def _is_image_list(obj) -> bool: return isinstance(obj, list) and all(MultimodalProcessor._is_image(img) for img in obj) @staticmethod - def _re_format_prompt_images_pair(prompt: Tuple) -> Dict: + def _re_format_prompt_images_pair(prompt: tuple) -> dict: """Reformat the prompt to openai message format.""" from lmdeploy.vl.utils import load_image @@ -350,7 +350,7 @@ def _re_format_prompt_images_pair(prompt: Tuple) -> Dict: messages['content'].append({'type': 'text', 'text': prompt}) return messages - def _has_multimodal_input(self, messages: List[Dict]) -> bool: + def _has_multimodal_input(self, messages: list[dict]) -> bool: """Check if messages contain multimodal input (images).""" multimodal_types = ['image_url', 'image_data', 'time_series_url'] return any( @@ -358,13 +358,13 @@ def _has_multimodal_input(self, messages: List[Dict]) -> bool: item.get('type') in multimodal_types for item in message['content']) for message in messages) async def _get_text_prompt_input(self, - prompt: str | List[Dict], + prompt: str | list[dict], do_preprocess: bool, sequence_start: bool, adapter_name: str, - tools: List[object] | None = None, + tools: list[object] | None = None, reasoning_effort: Literal['low', 'medium', 'high'] | None = None, - chat_template_kwargs: Dict | None = None, + chat_template_kwargs: dict | None = None, **kwargs): """Process text-only prompt and return prompt string and input_ids.""" # Change multimodal data to openai text messages @@ -391,13 +391,13 @@ async def _get_text_prompt_input(self, return {'prompt': prompt, 'input_ids': input_ids} async def _get_multimodal_prompt_input(self, - messages: List[Dict], + messages: list[dict], do_preprocess: bool, sequence_start: bool, adapter_name: str, - tools: List[object] | None = None, - chat_template_kwargs: Dict | None = None, - mm_processor_kwargs: Dict[str, Any] | None = None, + tools: list[object] | None = None, + chat_template_kwargs: dict | None = None, + mm_processor_kwargs: dict[str, Any] | None = None, **kwargs): """Process multimodal prompt and return processed data for inference engines.""" diff --git a/lmdeploy/serve/proxy/proxy.py b/lmdeploy/serve/proxy/proxy.py index b47a0cd6c2..667886273e 100644 --- a/lmdeploy/serve/proxy/proxy.py +++ b/lmdeploy/serve/proxy/proxy.py @@ -10,7 +10,7 @@ import time from collections import deque from http import HTTPStatus -from typing import Deque, Literal +from typing import Literal import aiohttp import numpy as np @@ -26,8 +26,13 @@ from lmdeploy.pytorch.disagg.conn.proxy_conn import PDConnectionPool from lmdeploy.pytorch.disagg.messages import PDConnectionMessage from lmdeploy.serve.openai.api_server import create_error_response -from lmdeploy.serve.openai.protocol import ModelCard # noqa: E501 -from lmdeploy.serve.openai.protocol import ChatCompletionRequest, CompletionRequest, ModelList, ModelPermission +from lmdeploy.serve.openai.protocol import ( + ChatCompletionRequest, + CompletionRequest, + ModelCard, # noqa: E501 + ModelList, + ModelPermission, +) from lmdeploy.serve.proxy.utils import AIOHTTP_TIMEOUT, LATENCY_DEQUE_LEN, ErrorCodes, RoutingStrategy, err_msg from lmdeploy.serve.utils.server_utils import validate_json_request from lmdeploy.utils import get_logger @@ -43,7 +48,7 @@ class Status(BaseModel): role: EngineRole = EngineRole.Hybrid models: list[str] = Field(default=[], examples=[[]]) unfinished: int = 0 - latency: Deque = Field(default=deque(maxlen=LATENCY_DEQUE_LEN), examples=[[]]) + latency: deque = Field(default=deque(maxlen=LATENCY_DEQUE_LEN), examples=[[]]) speed: int | None = Field(default=None, examples=[None]) @@ -96,7 +101,7 @@ def __init__(self, if config_path is not None: self.config_path = config_path if osp.exists(self.config_path) and self.cache_status: - with open(self.config_path, 'r') as config_file: + with open(self.config_path) as config_file: if os.path.getsize(self.config_path) > 0: logger.info(f'loading node configuration: {self.config_path}') config = json.load(config_file) @@ -150,7 +155,7 @@ def add(self, node_url: str, status: Status | None = None): Args: node_url (str): A http url. Can be the url generated by `lmdeploy serve api_server`. - description (Dict): The description of the node. An example: + description (dict): The description of the node. An example: {'http://0.0.0.0:23333': {models: ['internlm-chat-7b]}, speed: -1}. The speed here can be RPM or other metric. All the values of nodes should be the same metric. @@ -345,7 +350,7 @@ async def stream_generate(self, request: dict, node_url: str, endpoint: str): """Return a generator to handle the input request. Args: - request (Dict): the input request. + request (dict): the input request. node_url (str): the node url. endpoint (str): the endpoint. Such as `/v1/chat/completions`. """ @@ -364,7 +369,7 @@ async def generate(self, request: dict, node_url: str, endpoint: str): """Return a the response of the input request. Args: - request (Dict): the input request. + request (dict): the input request. node_url (str): the node url. endpoint (str): the endpoint. Such as `/v1/chat/completions`. """ @@ -490,7 +495,7 @@ def add_node(node: Node, raw_request: Request = None): - **url** (str): A http url. Can be the url generated by `lmdeploy serve api_server`. - - **status** (Dict): The description of the node. An example: + - **status** (dict): The description of the node. An example: ``{models: ['internlm-chat-7b], speed: 1}``. The speed here can be RPM or other metric. All the values of nodes should be the same metric. """ @@ -589,9 +594,9 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque Deprecated: Use max_completion_tokens instead. - **repetition_penalty** (float): The parameter for repetition penalty. 1.0 means no penalty - - **stop** (str | List[str] | None): To stop generating further + - **stop** (str | list[str] | None): To stop generating further tokens. Only accept stop words that's encoded to one token idex. - - **response_format** (Dict | None): To generate response according to given + - **response_format** (dict | None): To generate response according to given schema. Examples: .. code-block:: json @@ -612,8 +617,8 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque or ``{"type": "regex_schema", "regex_schema": "call me [A-Za-z]{1,10}"}`` - - **logit_bias** (Dict): Bias to logits. Only supported in pytorch engine. - - **tools** (List): A list of tools the model may call. Currently, only + - **logit_bias** (dict): Bias to logits. Only supported in pytorch engine. + - **tools** (list): A list of tools the model may call. Currently, only internlm2 functions are supported as a tool. Use this to specify a list of functions for which the model can generate JSON inputs. - **tool_choice** (str | object): Controls which (if any) tool is called by @@ -758,7 +763,7 @@ async def completions_v1(request: CompletionRequest, raw_request: Request = None - **repetition_penalty** (float): The parameter for repetition penalty. 1.0 means no penalty - **user** (str): A unique identifier representing your end-user. - - **stop** (str | List[str] | None): To stop generating further + - **stop** (str | list[str] | None): To stop generating further tokens. Only accept stop words that's encoded to one token idex. Additional arguments supported by LMDeploy: @@ -898,7 +903,7 @@ def proxy(server_name: str = '0.0.0.0', route_strategy ('random' | 'min_expected_latency' | 'min_observed_latency'): the strategy to dispatch requests to nodes. Default to 'min_expected_latency' - api_keys (List[str] | str | None): Optional list of API keys. Accepts string type as + api_keys (list[str] | str | None): Optional list of API keys. Accepts string type as a single api_key. Default to None, which means no api key applied. ssl (bool): Enable SSL. Requires OS Environment variables 'SSL_KEYFILE' and 'SSL_CERTFILE'. log_level (str): Set the log level. Default to INFO. diff --git a/lmdeploy/tokenizer.py b/lmdeploy/tokenizer.py index 2a4fc407f8..c184e53111 100644 --- a/lmdeploy/tokenizer.py +++ b/lmdeploy/tokenizer.py @@ -2,9 +2,9 @@ import json import os.path as osp from collections import deque +from collections.abc import Sequence from dataclasses import dataclass from functools import partial -from typing import List, Optional, Sequence, Tuple, Union from lmdeploy.utils import get_logger @@ -14,24 +14,24 @@ @dataclass class DetokenizeState: - """A state collection of incrementally detekenization. + """A state collection for incremental detokenization. Args: - ids_offset (int): offset to all input ids. In LMDeploy, the output + ids_offset: offset to all input ids. In LMDeploy, the output ids length is not one by one. It could be random by random. - prev_tokens (List[str] | None): for incrementally decoding. + prev_tokens: for incrementally decoding. Default to None, which means the first round. - prefix_offset (int): the start index of tokens to be converted to + prefix_offset: the start index of tokens to be converted to string (prev + new tokens). Default to 0 for the first round. - read_offset (int): the end index of tokens to be converted to + read_offset: the end index of tokens to be converted to string (prev token). Default to 0 for the first round. """ ids_offset: int = 0 - prev_tokens: Optional[List[str]] = None + prev_tokens: list[str] | None = None prefix_offset: int = 0 read_offset: int = 0 - def as_tuple(self) -> Tuple: + def as_tuple(self) -> tuple: """Return a tuple of states.""" return (self.ids_offset, self.prev_tokens, self.prefix_offset, self.read_offset) @@ -40,7 +40,7 @@ class HuggingFaceTokenizer: """A wrapper of transformers' AutoTokenizer. Args: - model_dir (str): the directory of the tokenizer model + model_dir: the directory of the tokenizer model. """ def __init__(self, model_dir: str): @@ -53,7 +53,7 @@ def __init__(self, model_dir: str): if self.model.eos_token_id is None: generation_config_file = osp.join(model_dir, 'generation_config.json') if osp.exists(generation_config_file): - with open(generation_config_file, 'r') as f: + with open(generation_config_file) as f: cfg = json.load(f) self.model.eos_token_id = cfg['eos_token_id'] elif hasattr(self.model, 'eod_id'): # Qwen remote @@ -129,7 +129,7 @@ def prefix_space_tokens(self): } return self._prefix_space_tokens - def _maybe_add_prefix_space(self, tokens: List[int], decoded: str): + def _maybe_add_prefix_space(self, tokens: list[int], decoded: str): """Maybe add prefix space for incremental decoding.""" if len(tokens) and not decoded.startswith(' ') and\ tokens[0] in self.prefix_space_tokens: @@ -193,13 +193,13 @@ def encode(self, s: str, add_bos: bool = True, add_special_tokens: bool = True, """Tokenize a prompt. Args: - s (str): a prompt - add_bos (bool): Whether to add `bos` token id when encoding - the prompt - add_special_tokens (bool): Whether or not to add special tokens - when encoding the prompt + s: a prompt. + add_bos: Whether to add ``bos`` token id when encoding the prompt. + add_special_tokens: Whether or not to add special tokens + when encoding the prompt. + Returns: - list[int]: token ids + list[int]: token ids. """ encoded = self.model.encode(s, add_special_tokens=add_special_tokens, **kwargs) if not add_bos: @@ -208,17 +208,18 @@ def encode(self, s: str, add_bos: bool = True, add_special_tokens: bool = True, encoded = encoded[1:] return encoded - def decode(self, t: Sequence[int], offset: Optional[int] = None, skip_special_tokens: bool = True): + def decode(self, t: Sequence[int], offset: int | None = None, skip_special_tokens: bool = True): """De-tokenize. Args: - t (List[int]): a list of token ids - offset (int): for incrementally decoding. Default to None, which + t: a list of token ids. + offset: for incrementally decoding. Default to None, which means not applied. - skip_special_tokens (bool): Whether or not to remove special + skip_special_tokens: Whether or not to remove special tokens in the decoding. + Returns: - str: text of decoding tokens + str: text of decoding tokens. """ t = t[offset:] out_string = self.model.decode(t, skip_special_tokens=skip_special_tokens) @@ -232,7 +233,7 @@ def decode(self, t: Sequence[int], offset: Optional[int] = None, skip_special_to @staticmethod def _convert_tokens_to_string_with_added_encoders( tokenizer, - output_tokens: List[str], + output_tokens: list[str], skip_special_tokens: bool, spaces_between_special_tokens: bool, ) -> str: @@ -272,18 +273,18 @@ def detokenize_incrementally(self, """Incrementally detokenize the input indexes. Args: - all_input_ids (List[int]): a list of token ids. Expected to be + all_input_ids: a list of token ids. Expected to be different sections of a long sequence. - state (DetokenizeState): an instance of DetokenizeState. Consists + state: an instance of DetokenizeState. Consists of incrementally decoding states. - skip_special_tokens (bool): Whether or not to remove special tokens + skip_special_tokens: Whether or not to remove special tokens in the decoding. Default to be True. - spaces_between_special_tokens (bool): Whether or not to add spaces + spaces_between_special_tokens: Whether or not to add spaces between special tokens. Default to be True. + Returns: - str: decoding output string of the current round. - state (DetokenizeState): an instance of DetokenizeState. Consists - of incrementally decoding states. + tuple[str, DetokenizeState]: decoding output string of the current + round and the updated DetokenizeState. """ tokenizer = self.model ids_offset, prev_tokens, prefix_offset, read_offset = state.as_tuple() @@ -335,13 +336,14 @@ def detokenize_incrementally(self, return new_text, DetokenizeState(len(all_input_ids), prev_tokens, prefix_offset, read_offset) - def __call__(self, s: Union[str, Sequence[str]]): + def __call__(self, s: str | Sequence[str]): """Tokenize prompts. Args: - s (str): prompts + s: prompts. + Returns: - list[int]: token ids + list[int]: token ids. """ add_special_tokens = False return self.model(s, add_special_tokens=add_special_tokens) @@ -351,7 +353,7 @@ class ChatGLM4Tokenizer(HuggingFaceTokenizer): """Tokenizer of GLM4.""" def __init__(self, model_path): - super(ChatGLM4Tokenizer, self).__init__(model_path) + super().__init__(model_path) original_pad = self.model._pad def __pad(*args, **kwargs): @@ -366,14 +368,14 @@ def encode(self, s: str, add_bos: bool = True, add_special_tokens: bool = True, """Tokenize a prompt.""" # ChtGLM4Tokenizer hardcode `add_speical_tokens=False` when tokenizing # a prompt. Refer to https://huggingface.co/THUDM/glm-4-9b-chat/blob/main/tokenization_chatglm.py#L227 # noqa E501 - return super(ChatGLM4Tokenizer, self).encode(s, add_bos, add_special_tokens=False, **kwargs) + return super().encode(s, add_bos, add_special_tokens=False, **kwargs) class ChatGLMTokenizer(HuggingFaceTokenizer): """Tokenizer of GLM2.""" def __init__(self, model_path): - super(ChatGLMTokenizer, self).__init__(model_path) + super().__init__(model_path) original_pad = self.model._pad def __pad(*args, **kwargs): @@ -389,7 +391,7 @@ class GptOssTokenizer(HuggingFaceTokenizer): """Tokenizer of GPT-OSS.""" def __init__(self, model_dir: str): - super(GptOssTokenizer, self).__init__(model_dir) + super().__init__(model_dir) from openai_harmony import HarmonyEncodingName, Role, StreamableParser, load_harmony_encoding encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS) self.role = Role.ASSISTANT @@ -418,7 +420,7 @@ class Tokenizer: """Tokenize prompts or de-tokenize tokens into texts. Args: - model_path (str): the path of the tokenizer model + model_path: the path of the tokenizer model. """ def __init__(self, model_path: str): @@ -464,13 +466,13 @@ def encode(self, s: str, add_bos: bool = True, add_special_tokens: bool = True, """Tokenize a prompt. Args: - s (str): a prompt - add_bos (bool): Whether to add `bos` token id when encoding - the prompt - add_special_tokens (bool): Whether or not to add special tokens - when encoding the prompt + s: a prompt. + add_bos: Whether to add ``bos`` token id when encoding the prompt. + add_special_tokens: Whether or not to add special tokens + when encoding the prompt. + Returns: - list[int]: token ids + list[int]: token ids. """ encoded = self.model.encode(s, add_bos, add_special_tokens, **kwargs) if encoded[:2] == [self.bos_token_id] * 2: @@ -483,19 +485,20 @@ def encode(self, s: str, add_bos: bool = True, add_special_tokens: bool = True, def decode( self, t: Sequence[int], - offset: Optional[int] = None, + offset: int | None = None, skip_special_tokens: bool = True, ): """De-tokenize. Args: - t (List[int]): a list of token ids - offset (int): for incrementally decoding. Default to None, which + t: a list of token ids. + offset: for incrementally decoding. Default to None, which means not applied. - skip_special_tokens (bool): Whether or not to remove special + skip_special_tokens: Whether or not to remove special tokens in the decoding. + Returns: - str: text of decoding tokens + str: text of decoding tokens. """ return self.model.decode(t, offset, skip_special_tokens) @@ -507,31 +510,32 @@ def detokenize_incrementally(self, """Incrementally detokenize the input indexes. Args: - all_input_ids (List[int]): a list of token ids. Expected to be + all_input_ids: a list of token ids. Expected to be different sections of a long sequence. - state (DetokenizeState): an instance of DetokenizeState. Consists + state: an instance of DetokenizeState. Consists of incrementally decoding states. - skip_special_tokens (bool): Whether or not to remove special tokens + skip_special_tokens: Whether or not to remove special tokens in the decoding. Default to be True. - spaces_between_special_tokens (bool): Whether or not to add spaces + spaces_between_special_tokens: Whether or not to add spaces between special tokens. Default to be True. + Returns: - str: decoding output string of the current round. - state (DetokenizeState): an instance of DetokenizeState. Consists - of incrementally decoding states. + tuple[str, DetokenizeState]: decoding output string of the current + round and the updated DetokenizeState. """ return self.model.detokenize_incrementally(all_input_ids, state=state, skip_special_tokens=skip_special_tokens, spaces_between_special_tokens=spaces_between_special_tokens) - def __call__(self, s: Union[str, Sequence[str]]): + def __call__(self, s: str | Sequence[str]): """Tokenize prompts. Args: - s (str): prompts + s: prompts. + Returns: - list[int]: token ids + list[int]: token ids. """ return self.model(s) diff --git a/lmdeploy/turbomind/__init__.py b/lmdeploy/turbomind/__init__.py index 318f15dfc2..177274aff9 100644 --- a/lmdeploy/turbomind/__init__.py +++ b/lmdeploy/turbomind/__init__.py @@ -3,20 +3,18 @@ def bootstrap(): import os - import sys has_turbomind = False pwd = os.path.dirname(__file__) if os.path.exists(os.path.join(pwd, '..', 'lib')): has_turbomind = True if os.name == 'nt' and has_turbomind: - if sys.version_info[:2] >= (3, 8): - CUDA_PATH = os.getenv('CUDA_PATH') - assert CUDA_PATH is not None, 'Can not find $env:CUDA_PATH' - dll_path = os.path.join(CUDA_PATH, 'bin') - print(f'Add dll path {dll_path}, please note cuda version ' - 'should >= 11.3 when compiled with cuda 11') - os.add_dll_directory(dll_path) + CUDA_PATH = os.getenv('CUDA_PATH') + assert CUDA_PATH is not None, 'Can not find $env:CUDA_PATH' + dll_path = os.path.join(CUDA_PATH, 'bin') + print(f'Add dll path {dll_path}, please note cuda version ' + 'should >= 11.3 when compiled with cuda 11') + os.add_dll_directory(dll_path) bootstrap() diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index c243554d7d..8fdb95ac78 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -2,7 +2,6 @@ import inspect import json from dataclasses import asdict, field, fields -from typing import List # use pydantic.dataclasses.dataclass to check data type from pydantic.dataclasses import dataclass @@ -55,11 +54,11 @@ class ModelConfig: # of token_embedding embedding_size: int = 0 num_layer: int = None - inter_size: List[int] = None + inter_size: list[int] = None norm_eps: float = None attn_bias: int = 0 mlp_bias: bool = False - window_size: List[int] = field(default_factory=list) + window_size: list[int] = field(default_factory=list) attn_sink: bool = False qk_norm: bool = False size_per_head: int = 128 @@ -73,7 +72,7 @@ class ModelConfig: attn_cp_size: int = 1 mlp_tp_size: int = 1 model_format: str = 'hf' - expert_num: List[int] = () + expert_num: list[int] = field(default_factory=list) expert_router_bias: bool = False expert_inter_size: int = 0 experts_per_token: int = 0 @@ -92,7 +91,7 @@ class ModelConfig: qk_rope_dim: int = 0 v_head_dim: int = 0 # Qwen 3.5 - layer_types: List[str] = field(default_factory=list) + layer_types: list[str] = field(default_factory=list) linear_key_head_dim: int = 0 linear_value_head_dim: int = 0 linear_conv_kernel_dim: int = 0 @@ -102,7 +101,7 @@ class ModelConfig: # Per-layer expert weight type override: layer indices whose # MoE experts are unquantized (fp16) despite expert_weight_type=int4. # Populated from modules_to_not_convert patterns like 'model.layers.0.'. - unquantized_expert_layers: List[int] = field(default_factory=list) + unquantized_expert_layers: list[int] = field(default_factory=list) # tuning tune_layer_num: int = 1 @@ -127,7 +126,7 @@ class RopeParam: low_freq_factor: float = None high_freq_factor: float = None original_max_position_embeddings: int = None - mrope_section: List[int] = None + mrope_section: list[int] = None @dataclass diff --git a/lmdeploy/turbomind/deploy/loader.py b/lmdeploy/turbomind/deploy/loader.py index 002c938e27..2475a8a928 100644 --- a/lmdeploy/turbomind/deploy/loader.py +++ b/lmdeploy/turbomind/deploy/loader.py @@ -4,10 +4,10 @@ import re from abc import ABC, abstractmethod from collections import defaultdict +from collections.abc import Iterator from functools import partial from glob import glob from queue import Queue -from typing import Iterator, Tuple, Union import torch from safetensors import safe_open @@ -29,12 +29,12 @@ def __init__(self, model_path: str, pattern, mappings: list): self.item_count = defaultdict(int) self.mappings = mappings - def get_index(self, index_name: str, file_pattern: str) -> Tuple[dict, list]: + def get_index(self, index_name: str, file_pattern: str) -> tuple[dict, list]: """Get shards and weight map (if possible) for the model.""" get_path = partial(osp.join, self.model_path) shards = [] if index_name: - with open(get_path(index_name), 'r') as f: + with open(get_path(index_name)) as f: index = json.load(f) index = index['weight_map'] shards = list(map(get_path, set(index.values()))) @@ -55,7 +55,7 @@ def map_key(self, key: str): return key @abstractmethod - def items(self) -> Iterator[Tuple[int, dict]]: + def items(self) -> Iterator[tuple[int, dict]]: pass @@ -174,7 +174,7 @@ def items(self): self.que.task_done() -def create_loader(model_path: Union[str, Queue], pattern: str, mappings: list) -> BaseLoader: +def create_loader(model_path: str | Queue, pattern: str, mappings: list) -> BaseLoader: args = (model_path, pattern, mappings) if isinstance(model_path, Queue): diff --git a/lmdeploy/turbomind/deploy/policy.py b/lmdeploy/turbomind/deploy/policy.py index 4082df36d3..0e4c061c0d 100644 --- a/lmdeploy/turbomind/deploy/policy.py +++ b/lmdeploy/turbomind/deploy/policy.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import List import torch.cuda @@ -8,7 +7,7 @@ def to_cuda(x: torch.Tensor, *args): return x.cuda() -def get_u4_slices(x: torch.Tensor, dtype: torch.dtype) -> List[torch.Tensor]: +def get_u4_slices(x: torch.Tensor, dtype: torch.dtype) -> list[torch.Tensor]: MAP = {torch.int32: 8, torch.uint8: 2} xs = [] for _ in range(MAP[x.dtype]): diff --git a/lmdeploy/turbomind/deploy/source_model/base.py b/lmdeploy/turbomind/deploy/source_model/base.py index 1de0e54b76..9bc6ca3bbc 100644 --- a/lmdeploy/turbomind/deploy/source_model/base.py +++ b/lmdeploy/turbomind/deploy/source_model/base.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABC, abstractmethod -from typing import Dict, Iterator, Union +from collections.abc import Iterator import torch from mmengine import Registry @@ -14,7 +14,7 @@ class BaseReader(ABC): def __init__(self): pass - def transform(self, x: Union[torch.Tensor, None], kind: str) -> Union[torch.Tensor, None]: + def transform(self, x: torch.Tensor | None, kind: str) -> torch.Tensor | None: return None if x is None else self._transform(x, kind) @abstractmethod @@ -37,7 +37,7 @@ def __init__(self, model_path: str, tokenizer_path: str, **kwargs): self.tokenizer_path = tokenizer_path @abstractmethod - def model_info(self) -> Dict: + def model_info(self) -> dict: """Read model info.""" pass diff --git a/lmdeploy/turbomind/tokenizer_info.py b/lmdeploy/turbomind/tokenizer_info.py index 7b107f3904..d03eca023b 100644 --- a/lmdeploy/turbomind/tokenizer_info.py +++ b/lmdeploy/turbomind/tokenizer_info.py @@ -6,7 +6,6 @@ import json import logging from enum import Enum -from typing import List, Optional, Union import _xgrammar as _xgr # noqa: E402 @@ -71,27 +70,27 @@ class TokenizerInfo(_xgr.TokenizerInfo): def __init__( self, - encoded_vocab: Union[List[bytes], List[str]], + encoded_vocab: list[bytes] | list[str], vocab_type: VocabType = VocabType.RAW, *, - vocab_size: Optional[int] = None, - stop_token_ids: Optional[Union[List[int], int]] = None, + vocab_size: int | None = None, + stop_token_ids: list[int] | int | None = None, add_prefix_space: bool = False, ) -> None: """Construct the tokenizer info. Parameters ---------- - encoded_vocab : Union[List[bytes], List[str]] + encoded_vocab : list[bytes] | list[str] The encoded vocabulary of the tokenizer. vocab_type : VocabType, default: VocabType.RAW The type of the vocabulary. See also VocabType. - vocab_size : Optional[int], default: None + vocab_size : int | None, default: None The size of the vocabulary. If not provided, the vocabulary size will be len(encoded_vocab). - stop_token_ids : Optional[List[int]], default: None + stop_token_ids : list[int] | None, default: None The stop token ids. If not provided, the stop token ids will be auto detected (but may not be correct). @@ -134,8 +133,8 @@ def _is_sentencepiece_tokenizer(tokenizer: PreTrainedTokenizerBase) -> bool: def from_huggingface( tokenizer: PreTrainedTokenizerBase, *, - vocab_size: Optional[int] = None, - stop_token_ids: Optional[Union[List[int], int]] = None, + vocab_size: int | None = None, + stop_token_ids: list[int] | int | None = None, ) -> 'TokenizerInfo': """Construct the tokenizer info from the huggingface tokenizer. This constructor supports various tokenizer backends, including the @@ -154,7 +153,7 @@ def from_huggingface( tokenizer : PreTrainedTokenizerBase The huggingface tokenizer. - vocab_size : Optional[int], default: None + vocab_size : int | None, default: None The vocabulary size **defined by the model** (**not the tokenizer**). This equals to the vocab dimension of the model's lm_head. This is the size of the token mask. @@ -172,7 +171,7 @@ def from_huggingface( model_vocab_size need to be provided for case 2 and 3. If not provided, it will be set to the tokenizer's vocabulary size. - stop_token_ids : Optional[List[int]], default: None + stop_token_ids : list[int] | None, default: None The stop token ids. If not provided, the eos_token_id of the tokenizer will be used. Returns diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index a4a37dc529..f95b2b93ca 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -13,7 +13,7 @@ from functools import partial from multiprocessing.reduction import ForkingPickler from queue import Queue -from typing import Any, Dict, List, Optional +from typing import Any import pybase64 import torch @@ -41,7 +41,7 @@ MAX_LOGPROBS = 1024 -def _construct_stop_or_bad_words(words: List[int] = None): +def _construct_stop_or_bad_words(words: list[int] = None): if words is None or len(words) == 0: return None offsets = list(range(1, len(words) + 1)) @@ -291,7 +291,7 @@ def sleep(self, level: int = 1): for _ in e.map(self.model_comm.sleep, range(self.gpu_count), [level] * self.gpu_count): pass - def wakeup(self, tags: Optional[list[str]] = None): + def wakeup(self, tags: list[str] | None = None): """Wakeup the model.""" if tags is None: tags = ['weights', 'kv_cache'] @@ -311,7 +311,7 @@ def update_params(self, request: UpdateParamsRequest): def _construct(item): """ Deserialize torch.Tensor Args: - item (Tuple[Callable, Tuple]): the return of reduce_tensor + item (tuple[Callable, tuple]): the return of reduce_tensor """ func, args = item args = list(args) @@ -424,7 +424,7 @@ def _func(out: EngineOutput, step: int, **kwargs): def _get_logprobs_impl(logprob_vals: torch.Tensor, logprob_idxs: torch.Tensor, logprob_nums: torch.Tensor, - output_ids: List[int], logprobs: int, offset: int): + output_ids: list[int], logprobs: int, offset: int): """Get logprob of each generated token. Args: @@ -432,7 +432,7 @@ def _get_logprobs_impl(logprob_vals: torch.Tensor, logprob_idxs: torch.Tensor, l 1024 is the max_logprobs that turbomind engine can output logprob_idxs (torch.Tensor): shape (max_new_tokens, 1024) logprob_nums (torch.Tensor): shape (max_new_tokens,) - output_ids (List[int]): new generated token ids + output_ids (list[int]): new generated token ids logprobs (int): top n logprobs to return offset (int): offset to index logprob_vals, logprob_idxs and logprob_nums. It indicates where to start getting logprobs for the current generated tokens `output_ids` @@ -562,7 +562,7 @@ def _create_model_instance(self): model_inst = self.tm_model.model_comm.create_request() return model_inst - def _get_extra_output_processors(self, outputs: Dict[str, torch.Tensor], gen_config: GenerationConfig, + def _get_extra_output_processors(self, outputs: dict[str, torch.Tensor], gen_config: GenerationConfig, input_len: int, metrics: '_tm.RequestMetrics'): def _get_offset(type): @@ -586,8 +586,8 @@ def prepare_embeddings(self, input_embeddings=None, input_embedding_ranges=None) if not input_embeddings: return None, None - assert isinstance(input_embeddings, List) - assert isinstance(input_embedding_ranges, List) + assert isinstance(input_embeddings, list) + assert isinstance(input_embedding_ranges, list) assert len(input_embeddings) == len(input_embedding_ranges) length = sum([x.shape[0] for x in input_embeddings]) @@ -605,7 +605,7 @@ def prepare_embeddings(self, input_embeddings=None, input_embedding_ranges=None) return values, ranges - def prepare_mrope(self, input_meta: Dict[str, Any], input_len: int): + def prepare_mrope(self, input_meta: dict[str, Any], input_len: int): mrope_position_ids = input_meta['mrope_position_ids'] mrope_position_delta = input_meta['mrope_position_delta'] assert mrope_position_ids.size(-1) == input_len @@ -617,7 +617,7 @@ def prepare_inputs(self, gen_config: GenerationConfig, input_embeddings=None, input_embedding_ranges=None, - input_meta: Dict[str, Any] = None): + input_meta: dict[str, Any] = None): """Convert inputs format.""" assert isinstance(input_ids, Sequence) @@ -661,7 +661,7 @@ async def async_stream_infer(self, input_ids, input_embeddings=None, input_embedding_ranges=None, - input_meta: Dict[str, Any] = None, + input_meta: dict[str, Any] = None, sequence_start: bool = True, sequence_end: bool = False, step=0, @@ -673,8 +673,8 @@ async def async_stream_infer(self, Args: session_id (int): the id of a session input_ids (numpy.ndarray): the token ids of a prompt - input_embeddings (List[numpy.ndarray]): embeddings features - input_embedding_ranges (List[Tuple[int,int]]): the begin/end + input_embeddings (list[numpy.ndarray]): embeddings features + input_embedding_ranges (list[tuple[int,int]]): the begin/end offsets of input_embeddings to input_ids sequence_start (bool): indicator for starting a sequence sequence_end (bool): indicator for ending a sequence diff --git a/lmdeploy/version.py b/lmdeploy/version.py index 54a2e46ee5..a5f42d62cc 100644 --- a/lmdeploy/version.py +++ b/lmdeploy/version.py @@ -1,11 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Tuple __version__ = '0.12.1' short_version = __version__ -def parse_version_info(version_str: str) -> Tuple: +def parse_version_info(version_str: str) -> tuple: """Parse version from a string. Args: diff --git a/lmdeploy/vl/engine.py b/lmdeploy/vl/engine.py index a4c926b1d7..8cd179df8a 100644 --- a/lmdeploy/vl/engine.py +++ b/lmdeploy/vl/engine.py @@ -3,7 +3,7 @@ import asyncio import inspect from concurrent.futures import ThreadPoolExecutor -from typing import Any, Dict, List, Optional, Union +from typing import Any import torch @@ -37,7 +37,7 @@ def __init__( model_path: str, backend: str, vision_config: VisionConfig = None, - backend_config: Optional[Union[TurbomindEngineConfig, PytorchEngineConfig]] = None, + backend_config: TurbomindEngineConfig | PytorchEngineConfig | None = None, ): self.model = load_vl_model(model_path, backend, backend_config=backend_config) if vision_config is None: @@ -48,8 +48,8 @@ def __init__( torch.cuda.empty_cache() async def preprocess(self, - messages: List[Dict], - mm_processor_kwargs: Optional[Dict[str, Any]] = None) -> List[Dict]: + messages: list[dict], + mm_processor_kwargs: dict[str, Any] | None = None) -> list[dict]: """Preprocess multimodal data in the messages.""" if _accepts_arg(self.model.preprocess, 'mm_processor_kwargs'): future = asyncio.get_event_loop().run_in_executor(self.executor, self.model.preprocess, messages, @@ -60,11 +60,11 @@ async def preprocess(self, outputs = await future return outputs - async def async_infer(self, messages: List[Dict]) -> List[Dict]: + async def async_infer(self, messages: list[dict]) -> list[dict]: """Get multimodal embedding. Args: - messages (List[Dict]): a list of message, which is the output + messages (list[dict]): a list of message, which is the output of `preprocess()` """ future = asyncio.get_event_loop().run_in_executor(self.executor, self.model.forward, messages, @@ -75,28 +75,30 @@ async def async_infer(self, messages: List[Dict]) -> List[Dict]: async def wrap_for_pytorch( self, - messages: List[Dict], + messages: list[dict], chat_template, tokenizer, sequence_start, - tools: Optional[List[object]] = None, - chat_template_kwargs: Optional[Dict] = None, - ) -> List[Dict]: + tools: list[object] | None = None, + chat_template_kwargs: dict | None = None, + ) -> list[dict]: """ Args: - messages (List[Dict]): a list of message, which is supposed to be + messages (list[dict]): a list of message, which is supposed to be the output of `preprocess` + Returns: - a dict which will be passed to pytorch engine_instance's forward. - The dict is like the following: - Dict( - 'prompt': 'the prompt after applying chat template' - 'input_ids': [], - 'multimodal': { - 'pixel_values': torch.Tensor, - ... - ] - ) + list[dict]: a list of dicts passed to pytorch engine_instance's forward. + Each dict has the following structure:: + + { + 'prompt': 'the prompt after applying chat template', + 'input_ids': [], + 'multimodal': { + 'pixel_values': torch.Tensor, + ... + }, + } """ has_input_ids = self.model.has_input_ids(messages) if not has_input_ids: @@ -110,32 +112,35 @@ async def wrap_for_pytorch( result = self.model.to_pytorch_with_input_ids(messages) # clear data for i, message in enumerate(messages): - if isinstance(message['content'], List): + if isinstance(message['content'], list): messages[i]['preprocess'] = None return result async def wrap_for_turbomind( self, - messages: List[Dict], + messages: list[dict], chat_template, tokenizer, sequence_start, - tools: Optional[List[object]] = None, - chat_template_kwargs: Optional[Dict] = None, - ) -> Dict: + tools: list[object] | None = None, + chat_template_kwargs: dict | None = None, + ) -> dict: """ Args: - messages (List[Dict]): a list of message, which is supposed to be + messages (list[dict]): a list of message, which is supposed to be the output of `async_infer` + Returns: - a dict which will be passed to pytorch engine_instance's forward. - The dict is like the following: - Dict( - 'prompt': 'the prompt after applying chat template' - 'input_ids': [], - 'input_embeddings': list[torch.Tensor], - 'input_embedding_ranges': list[torch.Tensor], - ... + dict: a dict passed to turbomind engine_instance's forward. + The dict has the following structure:: + + { + 'prompt': 'the prompt after applying chat template', + 'input_ids': [], + 'input_embeddings': list[torch.Tensor], + 'input_embedding_ranges': list[torch.Tensor], + ... + } """ result = self.model.to_turbomind(messages, chat_template, @@ -145,7 +150,7 @@ async def wrap_for_turbomind( chat_template_kwargs=chat_template_kwargs) # clear data for i, message in enumerate(messages): - if isinstance(message['content'], List): + if isinstance(message['content'], list): messages[i]['preprocess'] = None messages[i]['forward'] = None return result diff --git a/lmdeploy/vl/model/base.py b/lmdeploy/vl/model/base.py index 521cbc7985..b86c34010e 100644 --- a/lmdeploy/vl/model/base.py +++ b/lmdeploy/vl/model/base.py @@ -1,7 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABC, abstractmethod from itertools import groupby -from typing import Dict, List, Union import numpy as np from mmengine import Registry @@ -14,12 +13,12 @@ class VisionModel(ABC): """Visual model which extract image feature.""" - _arch: Union[str, List[str]] = None + _arch: str | list[str] = None def __init__(self, model_path: str, with_llm: bool = False, - max_memory: Dict[int, int] = None, + max_memory: dict[int, int] = None, hf_config: AutoConfig = None, backend: str = ''): """init.""" @@ -62,7 +61,7 @@ def build_model(self, ): raise NotImplementedError() @abstractmethod - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Preprocess multimodal data in the messages. The derived class, @@ -71,7 +70,7 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]: It can integrate the result into the messages list, or insert it to the individual image item. Args: - message(Dict): multimodal data in a dict, which is as follows: + message(dict): multimodal data in a dict, which is as follows: [ {'role': 'user', 'content': 'user prompt'}, {'role': 'assisant', 'content': 'AI reponse'}, @@ -105,24 +104,24 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]: """ # noqa raise NotImplementedError() - def has_input_ids(self, messages: List[Dict]) -> bool: + def has_input_ids(self, messages: list[dict]) -> bool: """Check whether the messages contain input_ids directly. Args: - messages (List[Dict]): a list of message, which is supposed to be + messages (list[dict]): a list of message, which is supposed to be the output of `preprocess` Returns: bool: whether the messages contain input_ids directly """ users = [x['content'] for x in messages if x['role'] == 'user'] - return len(users) == 1 and isinstance(users[0], List) and isinstance(users[0][0].get('text', ''), List) + return len(users) == 1 and isinstance(users[0], list) and isinstance(users[0][0].get('text', ''), list) - def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]: + def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]: """Extract image feature. ONLY implement it when the backend is turbomind engine. Args: - messages(List[Dict]): the outputs of `preprocess` + messages(list[dict]): the outputs of `preprocess` max_batch_size(int): the max batch size when forwarding vision model Return: @@ -138,7 +137,7 @@ def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, chat_te pytorch engine. Args: - messages(List[Dict]): the output of `preprocess` + messages(list[dict]): the output of `preprocess` chat_template: the chat template defined in `lmdeploy/model.py` tokenzer: the tokenizer model sequence_start: starting flag of a sequence @@ -154,7 +153,7 @@ def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, chat_ turbomind engine. Args: - messages(List[Dict]): the output of `preprocess` + messages(list[dict]): the output of `preprocess` chat_template: the chat template defined in `lmdeploy/model.py` tokenzer: the tokenizer model sequence_start: starting flag of a sequence @@ -171,13 +170,13 @@ def collect_images(messages): to RGB color space. Args: - messages (List[Tuple[Image, Dict]]): a list of images with their + messages (list[tuple[Image, dict]]): a list of images with their corresponding parameters """ # noqa images = [] for message in messages: content = message['content'] - if not isinstance(content, List): + if not isinstance(content, list): continue images.extend([(x['image'], { k: v @@ -191,13 +190,13 @@ def collect_time_series(messages): from the messages and compile them into a single list. Args: - messages (List[Tuple[np.ndarray, Dict]]): a list of time + messages (list[tuple[np.ndarray, dict]]): a list of time series data with their corresponding parameters """ # noqa time_series = [] for message in messages: content = message['content'] - if not isinstance(content, List): + if not isinstance(content, list): continue time_series.extend([(x['time_series'], { k: v @@ -210,7 +209,7 @@ def IMAGE_TOKEN_included(messages): """Check whether the IMAGE_TOKEN is included in the messages. Args: - messages (List[Dict]): a list of message + messages (list[dict]): a list of message Returns: bool: whether the IMAGE_TOKEN is included in the messages """ @@ -220,7 +219,7 @@ def IMAGE_TOKEN_included(messages): continue if isinstance(content, str) and '' in content: return True - elif isinstance(content, List): + elif isinstance(content, list): content = [x['text'] for x in content if x['type'] == 'text'] if any('' in x for x in content): return True @@ -231,7 +230,7 @@ def to_pytorch_with_input_ids(self, messages): required by pytorch engine when input_ids are provided directly. Args: - messages(List[Dict]): the output of `preprocess` + messages(list[dict]): the output of `preprocess` """ # collect all preprocessing result from messages preps = [x['content'] for x in messages if x['role'] == 'preprocess'] @@ -268,7 +267,7 @@ def to_pytorch_aux(self, messages, prompt, IMAGE_TOKEN, tokenizer, sequence_star compatible with what is required by pytorch engine. Args: - messages(List[Dict]): the output of `preprocess` + messages(list[dict]): the output of `preprocess` prompt(str): the prompt after applying chat template IMAGE_TOKEN(str): a placeholder where image tokens will be inserted @@ -303,7 +302,7 @@ def to_turbomind_aux(self, messages, prompt, IMAGE_TOKEN, tokenizer, sequence_st compatible with what is required by turbomind engine. Args: - messages(List[Dict]): the output of `preprocess` + messages(list[dict]): the output of `preprocess` prompt(str): the prompt after applying chat template IMAGE_TOKEN(str): a placeholder where image tokens will be inserted diff --git a/lmdeploy/vl/model/builder.py b/lmdeploy/vl/model/builder.py index 11262483e5..04ac5ab759 100644 --- a/lmdeploy/vl/model/builder.py +++ b/lmdeploy/vl/model/builder.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. import os -from typing import Optional, Union import torch @@ -40,7 +39,7 @@ def load_vl_model(model_path: str, backend: str, with_llm: bool = False, - backend_config: Optional[Union[TurbomindEngineConfig, PytorchEngineConfig]] = None): + backend_config: TurbomindEngineConfig | PytorchEngineConfig | None = None): """Load visual model. Args: diff --git a/lmdeploy/vl/model/cogvlm.py b/lmdeploy/vl/model/cogvlm.py index ca5e41a96f..03f2ef224f 100644 --- a/lmdeploy/vl/model/cogvlm.py +++ b/lmdeploy/vl/model/cogvlm.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, List from lmdeploy.utils import get_logger from lmdeploy.vl.model.base import VISION_MODELS, VisionModel @@ -39,7 +38,7 @@ def build_model(self): else: raise NotImplementedError('turbomind has not supported cogvlm yet') - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Refer to the spec of `super().preprocess`""" images = self.collect_images(messages) outputs = [] diff --git a/lmdeploy/vl/model/deepseek.py b/lmdeploy/vl/model/deepseek.py index 0a1f6c12e9..154eb95c4f 100644 --- a/lmdeploy/vl/model/deepseek.py +++ b/lmdeploy/vl/model/deepseek.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings -from typing import Dict, List import torch from transformers import AutoModelForCausalLM @@ -86,7 +85,7 @@ def build_model(self): self.vision_model = model.vision_model.eval() self.aligner = model.aligner.eval() - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Refers to the spec of `super.preprocess()""" images = self.collect_images(messages) outputs = [] @@ -105,12 +104,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]: return messages @torch.no_grad() - def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]: + def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]: """Extract image feature. ONLY implement it when the backend is turbomind engine. Args: - messages(List[Dict]): the outputs of `preprocess` + messages(list[dict]): the outputs of `preprocess` max_batch_size(int): the max batch size when forwarding vision model Return: diff --git a/lmdeploy/vl/model/deepseek_vl2.py b/lmdeploy/vl/model/deepseek_vl2.py index a2e4b034ca..13fb71c950 100644 --- a/lmdeploy/vl/model/deepseek_vl2.py +++ b/lmdeploy/vl/model/deepseek_vl2.py @@ -1,7 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import os from contextlib import redirect_stdout -from typing import Dict, List import torch from transformers import AutoConfig @@ -67,7 +66,7 @@ def build_model(self): # TODO, implement for tubomind engine raise NotImplementedError() - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Refers to the spec of `super.preprocess()""" images = self.collect_images(messages) @@ -80,8 +79,8 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]: formatted_messages.append(dict(role=message['role'], content=text_content, images=image_content)) # NOTE: DeepseekVLV2Processor inputs - # conversations (List[Dict]): conversations with a list of messages; - # images (List[ImageType]): the list of images; + # conversations (list[dict]): conversations with a list of messages; + # images (list[ImageType]): the list of images; # force_batchify (bool): force batchify the inputs; # inference_mode (bool): if True, then remove the last eos token; prepare = self.image_processor(conversations=formatted_messages, @@ -103,12 +102,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]: return messages @torch.no_grad() - def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]: + def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]: """Extract image feature. ONLY implement it when the backend is turbomind engine. Args: - messages(List[Dict]): the outputs of `preprocess` + messages(list[dict]): the outputs of `preprocess` max_batch_size(int): the max batch size when forwarding vision model Return: diff --git a/lmdeploy/vl/model/gemma3_vl.py b/lmdeploy/vl/model/gemma3_vl.py index c2879a6b83..4a0aa7a45c 100644 --- a/lmdeploy/vl/model/gemma3_vl.py +++ b/lmdeploy/vl/model/gemma3_vl.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, List, Optional import torch from transformers import AutoConfig, AutoProcessor @@ -12,11 +11,11 @@ class Gemma3ImagesKwargs(ImagesKwargs): - do_pan_and_scan: Optional[bool] - pan_and_scan_min_crop_size: Optional[int] - pan_and_scan_max_num_crops: Optional[int] - pan_and_scan_min_ratio_to_activate: Optional[float] - do_convert_rgb: Optional[bool] + do_pan_and_scan: bool | None + pan_and_scan_min_crop_size: int | None + pan_and_scan_max_num_crops: int | None + pan_and_scan_min_ratio_to_activate: float | None + do_convert_rgb: bool | None class Gemma3ProcessorKwargs(ProcessingKwargs, total=False): @@ -43,7 +42,7 @@ class Gemma3VisionModel(VisionModel): def __init__(self, model_path: str, with_llm: bool = False, - max_memory: Dict[int, int] = None, + max_memory: dict[int, int] = None, hf_config: AutoConfig = None, backend: str = ''): super().__init__(model_path, with_llm, max_memory, hf_config, backend) @@ -61,7 +60,7 @@ def build_model(self): # TODO, implement for tubomind engine raise NotImplementedError() - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Refers to `super.preprocess() for spec.""" from transformers.image_utils import make_nested_list_of_images output_kwargs = self.processor._merge_kwargs( @@ -91,12 +90,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]: return messages @torch.no_grad() - def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]: + def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]: """Extract image feature. ONLY implement it when the backend is turbomind engine. Args: - messages(List[Dict]): the outputs of `preprocess` + messages(list[dict]): the outputs of `preprocess` max_batch_size(int): the max batch size when forwarding vision model Return: diff --git a/lmdeploy/vl/model/glm4_1v.py b/lmdeploy/vl/model/glm4_1v.py index 3b4b2ab937..27f5e30eeb 100644 --- a/lmdeploy/vl/model/glm4_1v.py +++ b/lmdeploy/vl/model/glm4_1v.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, List from transformers import AutoConfig @@ -33,7 +32,7 @@ def build_preprocessor(self): def build_model(self): raise NotImplementedError('turbomind has not supported glm4v yet') - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Refer to `super().preprocess()` for spec.""" images = self.collect_images(messages) optional_keys = {'resized_height', 'resized_width', 'min_pixels', 'max_pixels'} diff --git a/lmdeploy/vl/model/glm4_v.py b/lmdeploy/vl/model/glm4_v.py index 81dffbf1ca..1c6af44c6e 100644 --- a/lmdeploy/vl/model/glm4_v.py +++ b/lmdeploy/vl/model/glm4_v.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, List from transformers import AutoConfig @@ -44,11 +43,11 @@ def build_model(self): else: raise NotImplementedError('turbomind has not supported glm4v yet') - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Refers to the spec of `super.preprocess()""" outputs = [] for message in messages: - if not isinstance(message['content'], List): + if not isinstance(message['content'], list): continue images = [x['image'] for x in message['content'] if x['type'] == 'image'] if len(images) > 1: diff --git a/lmdeploy/vl/model/interns1_pro.py b/lmdeploy/vl/model/interns1_pro.py index e11efbb32a..b0daf4fc2e 100644 --- a/lmdeploy/vl/model/interns1_pro.py +++ b/lmdeploy/vl/model/interns1_pro.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, List, Optional +from typing import Any import numpy as np import torch @@ -40,7 +40,7 @@ def build_preprocessor(self): self.ts_token = getattr(self.processor, 'ts_token', None) self.ts_token_id = getattr(self.processor, 'ts_token_id', None) - def get_processor_args(self, mm_processor_kwargs: Optional[Dict[str, Any]] = None): + def get_processor_args(self, mm_processor_kwargs: dict[str, Any] | None = None): min_pixels = self.processor.image_processor.size['shortest_edge'] max_pixels = self.processor.image_processor.size['longest_edge'] @@ -112,7 +112,7 @@ def time_series_processor(self, ts_input, sr): return dict(ts_values=[ts_input], ts_sr=[sr], ts_lens=[ts_len], num_ts_tokens=[num_ts_tokens]) - def preprocess(self, messages: List[Dict], mm_processor_kwargs: Optional[Dict[str, Any]] = None) -> List[Dict]: + def preprocess(self, messages: list[dict], mm_processor_kwargs: dict[str, Any] | None = None) -> list[dict]: """Refer to `super().preprocess()` for spec.""" self.check_time_series_input(messages) @@ -153,7 +153,7 @@ def proc_messages(self, messages, chat_template, sequence_start, - tools: Optional[List[object]] = None, + tools: list[object] | None = None, chat_template_kwargs=None): """Apply chat template to get the prompt.""" chat_template_kwargs = chat_template_kwargs or {} @@ -187,7 +187,7 @@ def ts_to_pytorch_aux(self, messages, prompt, TS_TOKEN, tokenizer, sequence_star compatible with what is required by pytorch engine. Args: - messages(List[Dict]): the output of `preprocess` + messages(list[dict]): the output of `preprocess` prompt(str): the prompt after applying chat template TS_TOKEN(str): a placeholder where time series tokens will be inserted @@ -231,8 +231,8 @@ def to_pytorch(self, chat_template, tokenizer, sequence_start, - tools: Optional[List[object]] = None, - chat_template_kwargs: Optional[Dict] = None, + tools: list[object] | None = None, + chat_template_kwargs: dict | None = None, **kwargs): """Return to the information needed by pytorch engine.""" if self.has_time_series_input: @@ -257,7 +257,7 @@ def build_model(self): pass @torch.no_grad() - def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]: + def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]: # TODO: implement for turbomind pass @@ -266,7 +266,7 @@ def to_turbomind(self, chat_template, tokenizer, sequence_start, - chat_template_kwargs: Optional[Dict] = None, + chat_template_kwargs: dict | None = None, **kwargs): # TODO: implement for turbomind pass diff --git a/lmdeploy/vl/model/internvl.py b/lmdeploy/vl/model/internvl.py index 9036866818..1c170daecb 100644 --- a/lmdeploy/vl/model/internvl.py +++ b/lmdeploy/vl/model/internvl.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, List, Optional import torch from transformers import AutoConfig, AutoModel, AutoTokenizer, CLIPImageProcessor @@ -72,7 +71,7 @@ class InternVLVisionModel(VisionModel): def __init__(self, model_path: str, with_llm: bool = False, - max_memory: Dict[int, int] = None, + max_memory: dict[int, int] = None, hf_config: AutoConfig = None, backend: str = ''): super().__init__(model_path, with_llm, max_memory, hf_config, backend) @@ -190,7 +189,7 @@ def _forward(self, inputs, max_batch_size): outputs.extend([x.squeeze() for x in feats]) return outputs - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Refers to `super.preprocess() for spec.""" images = self.collect_images(messages) outputs = [] @@ -207,12 +206,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]: return messages @torch.no_grad() - def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]: + def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]: """Extract image feature. ONLY implement it when the backend is turbomind engine. Args: - messages(List[Dict]): the outputs of `preprocess` + messages(list[dict]): the outputs of `preprocess` max_batch_size(int): the max batch size when forwarding vision model Return: @@ -229,8 +228,8 @@ def proc_messages( messages, chat_template, sequence_start, - tools: Optional[List[object]] = None, - chat_template_kwargs: Optional[Dict] = None, + tools: list[object] | None = None, + chat_template_kwargs: dict | None = None, ): chat_template_kwargs = chat_template_kwargs or {} """Apply chat template to get the prompt.""" @@ -272,8 +271,8 @@ def to_pytorch(self, chat_template, tokenizer, sequence_start, - tools: Optional[List[object]] = None, - chat_template_kwargs: Optional[Dict] = None, + tools: list[object] | None = None, + chat_template_kwargs: dict | None = None, **kwargs): prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, @@ -287,8 +286,8 @@ def to_turbomind(self, chat_template, tokenizer, sequence_start, - tools: Optional[List[object]] = None, - chat_template_kwargs: Optional[Dict] = None, + tools: list[object] | None = None, + chat_template_kwargs: dict | None = None, **kwargs): prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, diff --git a/lmdeploy/vl/model/internvl3_hf.py b/lmdeploy/vl/model/internvl3_hf.py index 85eb40bbdc..c28d278de6 100644 --- a/lmdeploy/vl/model/internvl3_hf.py +++ b/lmdeploy/vl/model/internvl3_hf.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, List, Optional import torch from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoProcessor @@ -13,9 +12,9 @@ class InternVLImagesKwargs(ImagesKwargs, total=False): - crop_to_patches: Optional[bool] - min_patches: Optional[int] - max_patches: Optional[int] + crop_to_patches: bool | None + min_patches: int | None + max_patches: int | None class InternVLProcessorKwargs(ProcessingKwargs, total=False): @@ -40,7 +39,7 @@ class InternVL3VisionModel(InternVLVisionModel): def __init__(self, model_path: str, with_llm: bool = False, - max_memory: Dict[int, int] = None, + max_memory: dict[int, int] = None, hf_config: AutoConfig = None, backend: str = ''): super().__init__(model_path, with_llm, max_memory, hf_config, backend) @@ -83,7 +82,7 @@ def build_model(self): # avoid randomness in inference. self.model = model.eval() - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Refers to `super.preprocess() for spec.""" from transformers.image_utils import make_flat_list_of_images output_kwargs = self.processor._merge_kwargs( @@ -116,12 +115,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]: return messages @torch.no_grad() - def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]: + def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]: """Extract image feature. ONLY implement it when the backend is turbomind engine. Args: - messages(List[Dict]): the outputs of `preprocess` + messages(list[dict]): the outputs of `preprocess` max_batch_size(int): the max batch size when forwarding vision model Return: diff --git a/lmdeploy/vl/model/internvl_llava.py b/lmdeploy/vl/model/internvl_llava.py index 67cabfa087..d521bab9fb 100644 --- a/lmdeploy/vl/model/internvl_llava.py +++ b/lmdeploy/vl/model/internvl_llava.py @@ -2,7 +2,6 @@ import warnings from contextlib import contextmanager -from typing import Dict, List import torch from transformers import AutoConfig, AutoModelForCausalLM @@ -125,17 +124,17 @@ def build_model(self): self.vision_tower = model.model.vision_tower.eval() self.mm_projector = model.model.mm_projector.eval() - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Refer to `super().preprocess() for spec.""" return super().preprocess(messages) @torch.no_grad() - def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]: + def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]: """Extract image feature. ONLY implement it when the backend is turbomind engine. Args: - messages(List[Dict]): the outputs of `preprocess` + messages(list[dict]): the outputs of `preprocess` max_batch_size(int): the max batch size when forwarding vision model Return: diff --git a/lmdeploy/vl/model/llama4.py b/lmdeploy/vl/model/llama4.py index e0752d7b99..d0e03fd16e 100644 --- a/lmdeploy/vl/model/llama4.py +++ b/lmdeploy/vl/model/llama4.py @@ -1,5 +1,4 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, List import torch from transformers import AutoConfig @@ -58,7 +57,7 @@ def build_model(self): # TODO, implement for tubomind engine raise NotImplementedError() - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Refers to `super.preprocess() for spec.""" images = self.collect_images(messages) outputs = [] @@ -84,12 +83,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]: return messages @torch.no_grad() - def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]: + def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]: """Extract image feature. ONLY implement it when the backend is turbomind engine. Args: - messages(List[Dict]): the outputs of `preprocess` + messages(list[dict]): the outputs of `preprocess` max_batch_size(int): the max batch size when forwarding vision model Return: @@ -123,7 +122,7 @@ def to_pytorch_aux(self, messages, prompt, IMAGE_TOKEN, tokenizer, sequence_star compatible with what is required by pytorch engine. Args: - messages(List[Dict]): the output of `preprocess` + messages(list[dict]): the output of `preprocess` prompt(str): the prompt after applying chat template IMAGE_TOKEN(str): a placeholder where image tokens will be inserted diff --git a/lmdeploy/vl/model/llava.py b/lmdeploy/vl/model/llava.py index 91da486643..f10bfadf32 100644 --- a/lmdeploy/vl/model/llava.py +++ b/lmdeploy/vl/model/llava.py @@ -5,7 +5,6 @@ import math import warnings from contextlib import contextmanager -from typing import Dict, List import torch from PIL import Image @@ -295,7 +294,7 @@ def encode_images(self, images: torch.Tensor) -> torch.Tensor: image_features = self.mm_projector(image_features) return image_features - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Refer to `super().preprocess() for spec.""" images = self.collect_images(messages) outputs = [] @@ -311,12 +310,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]: return messages @torch.no_grad() - def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]: + def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]: """Extract image feature. ONLY implement it when the backend is turbomind engine. Args: - messages(List[Dict]): the outputs of `preprocess` + messages(list[dict]): the outputs of `preprocess` max_batch_size(int): the max batch size when forwarding vision model Return: diff --git a/lmdeploy/vl/model/llava_hf.py b/lmdeploy/vl/model/llava_hf.py index c66f68be68..80476b55fb 100644 --- a/lmdeploy/vl/model/llava_hf.py +++ b/lmdeploy/vl/model/llava_hf.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings -from typing import Dict, List import torch from transformers import AutoProcessor @@ -55,7 +54,7 @@ def build_model(self): model.eval() self.model = model - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Refers to `super.preprocess() for spec.""" images = self.collect_images(messages) outputs = [] @@ -71,12 +70,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]: return messages @torch.no_grad() - def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]: + def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]: """Extract image feature. ONLY implement it when the backend is turbomind engine. Args: - messages(List[Dict]): the outputs of `preprocess` + messages(list[dict]): the outputs of `preprocess` max_batch_size(int): the max batch size when forwarding vision model Return: diff --git a/lmdeploy/vl/model/llava_next.py b/lmdeploy/vl/model/llava_next.py index b705f237b8..dd23572d05 100644 --- a/lmdeploy/vl/model/llava_next.py +++ b/lmdeploy/vl/model/llava_next.py @@ -1,7 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import itertools import warnings -from typing import Dict, List import torch @@ -63,7 +62,7 @@ def build_model(self): dtype=torch.half) self.model.eval() - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Refers to the spec of `super.preprocess()""" from transformers.models.llava_next.modeling_llava_next import image_size_to_num_patches images = self.collect_images(messages) @@ -99,12 +98,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]: return messages @torch.no_grad() - def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]: + def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]: """Extract image feature. ONLY implement it when the backend is turbomind engine. Args: - messages(List[Dict]): the outputs of `preprocess` + messages(list[dict]): the outputs of `preprocess` max_batch_size(int): the max batch size when forwarding vision model Return: diff --git a/lmdeploy/vl/model/minicpmv.py b/lmdeploy/vl/model/minicpmv.py index b746f345ba..09d00fc298 100644 --- a/lmdeploy/vl/model/minicpmv.py +++ b/lmdeploy/vl/model/minicpmv.py @@ -1,7 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import itertools import warnings -from typing import Dict, List import torch from PIL.Image import Image @@ -23,7 +22,7 @@ class MiniCPMVModel(VisionModel): def __init__(self, model_path: str, with_llm: bool = False, - max_memory: Dict[int, int] = None, + max_memory: dict[int, int] = None, hf_config: AutoConfig = None, backend: str = ''): super().__init__(model_path, with_llm, max_memory, hf_config, backend) @@ -94,7 +93,7 @@ def _reshape_by_patch(self, slice_images): tgt_sizes.append(torch.Tensor([H, W]).type(torch.int32)) return patches, tgt_sizes - def _preprocess_v2_5(self, image: Image, params: Dict = None) -> Dict: + def _preprocess_v2_5(self, image: Image, params: dict = None) -> dict: """Image preprocessing for MiniCPM-Llama3-V-2_5.""" slice_images, best_grid = self._get_slice_image(image) # pixel_values, tgt_sizes are list of torch tensors @@ -108,7 +107,7 @@ def _preprocess_v2_5(self, image: Image, params: Dict = None) -> Dict: image_tokens=1, image_token_id=self.image_token_id) - def _preprocess_v2_6(self, image: Image, params: Dict = None) -> Dict: + def _preprocess_v2_6(self, image: Image, params: dict = None) -> dict: """Image preprocessing for MiniCPM-V-2_6.""" max_slice_nums = self.image_processor.max_slice_nums use_image_id = self.image_processor.use_image_id @@ -130,11 +129,11 @@ def _preprocess_v2_6(self, image: Image, params: Dict = None) -> Dict: image_token_id=self.image_token_id, use_image_id=use_image_id) - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Refer to `super().preprocess() for spec.""" outputs = [] for i, message in enumerate(messages): - if message['role'] != 'user' or not isinstance(message['content'], List): + if message['role'] != 'user' or not isinstance(message['content'], list): continue for item in message['content']: if item['type'] == 'image': @@ -146,12 +145,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]: return messages @torch.no_grad() - def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]: + def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]: """Extract image feature. ONLY implement it when the backend is turbomind engine. Args: - messages(List[Dict]): the outputs of `preprocess` + messages(list[dict]): the outputs of `preprocess` max_batch_size(int): the max batch size when forwarding vision model Return: diff --git a/lmdeploy/vl/model/mllama.py b/lmdeploy/vl/model/mllama.py index ab0949fe03..bcf0070ec9 100644 --- a/lmdeploy/vl/model/mllama.py +++ b/lmdeploy/vl/model/mllama.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, List from lmdeploy.vl.model.base import VISION_MODELS, VisionModel @@ -24,7 +23,7 @@ def build_preprocessor(self): self.processor = AutoProcessor.from_pretrained(self.model_path) self.image_token_id = 128256 - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Refer to the spec of `super().preprocess`""" images = self.collect_images(messages) outputs = [] diff --git a/lmdeploy/vl/model/molmo.py b/lmdeploy/vl/model/molmo.py index c2a12e8412..90b8cb932a 100644 --- a/lmdeploy/vl/model/molmo.py +++ b/lmdeploy/vl/model/molmo.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, List import torch from transformers import AutoModelForCausalLM, AutoProcessor @@ -50,10 +49,10 @@ def build_model(self): # avoid randomness in inference. self.model = model.eval() - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Refer to the `super.preprocess() for spec.""" for i, message in enumerate(messages): - if not isinstance(message['content'], List): + if not isinstance(message['content'], list): continue images = [x['image'] for x in message['content'] if x['type'] == 'image'] content = [x.get('text', '') for x in message['content'] if x['type'] == 'text'] @@ -75,12 +74,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]: return messages @torch.no_grad() - def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]: + def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]: """Extract image feature. ONLY implement it when the backend is turbomind engine. Args: - messages(List[Dict]): the outputs of `preprocess` + messages(list[dict]): the outputs of `preprocess` max_batch_size(int): the max batch size when forwarding vision model Return: @@ -131,7 +130,7 @@ def proc_messages(messages): IMAGE_TOKEN = '' for message in messages: role, content = message['role'], message['content'] - if isinstance(content, List): + if isinstance(content, list): n_images = len([1 for x in content if x['type'] == 'image']) content = [x['text'] for x in content if x['type'] == 'text'] prompt.append(' User: ' + (IMAGE_TOKEN + '\n') * n_images + content[0]) @@ -160,7 +159,7 @@ def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, **kwa for i, message in enumerate(messages): prompt = '' role, content = message['role'], message['content'] - if isinstance(content, List): + if isinstance(content, list): forward_result = message.pop('forward') input_ids = forward_result['input_ids'] embeddings = forward_result['embeddings'] diff --git a/lmdeploy/vl/model/phi3_vision.py b/lmdeploy/vl/model/phi3_vision.py index 683220c29c..77736ffbd4 100644 --- a/lmdeploy/vl/model/phi3_vision.py +++ b/lmdeploy/vl/model/phi3_vision.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, List from transformers import AutoProcessor @@ -29,7 +28,7 @@ def build_model(self): else: raise NotImplementedError('turbomind has not supported phi3v yet') - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Refers to `super.preprocess() for spec.""" images = self.collect_images(messages) outputs = [] diff --git a/lmdeploy/vl/model/qwen.py b/lmdeploy/vl/model/qwen.py index db54fb5f57..8dfcd85021 100644 --- a/lmdeploy/vl/model/qwen.py +++ b/lmdeploy/vl/model/qwen.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, List import torch from transformers import AutoModelForCausalLM @@ -69,7 +68,7 @@ def build_model(self): self.model = model.transformer.visual.eval() - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Refers to `super.preprocess() for spec.""" images = self.collect_images(messages) outputs = [] @@ -85,12 +84,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]: return messages @torch.no_grad() - def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]: + def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]: """Extract image feature. ONLY implement it when the backend is turbomind engine. Args: - messages(List[Dict]): the outputs of `preprocess` + messages(list[dict]): the outputs of `preprocess` max_batch_size(int): the max batch size when forwarding vision model Return: diff --git a/lmdeploy/vl/model/qwen3.py b/lmdeploy/vl/model/qwen3.py index 265bf9f729..729cf0ebbd 100644 --- a/lmdeploy/vl/model/qwen3.py +++ b/lmdeploy/vl/model/qwen3.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, Dict, List, Optional +from typing import Any import torch from transformers import AutoProcessor @@ -32,7 +32,7 @@ def build_preprocessor(self): self.image_token_id = tokenizer.encode(self.image_token)[-1] self.mm_processor_kwargs = None - def get_processor_args(self, mm_processor_kwargs: Optional[Dict[str, Any]] = None): + def get_processor_args(self, mm_processor_kwargs: dict[str, Any] | None = None): min_pixels = self.processor.image_processor.size['shortest_edge'] max_pixels = self.processor.image_processor.size['longest_edge'] @@ -68,7 +68,7 @@ def get_processor_args(self, mm_processor_kwargs: Optional[Dict[str, Any]] = Non return min_pixels, max_pixels - def preprocess(self, messages: List[Dict], mm_processor_kwargs: Optional[Dict[str, Any]] = None) -> List[Dict]: + def preprocess(self, messages: list[dict], mm_processor_kwargs: dict[str, Any] | None = None) -> list[dict]: """Refer to `super().preprocess()` for spec.""" min_pixels, max_pixels = self.get_processor_args(mm_processor_kwargs) @@ -118,7 +118,7 @@ def to_pytorch(self, chat_template, tokenizer, sequence_start, - chat_template_kwargs: Optional[Dict] = None, + chat_template_kwargs: dict | None = None, **kwargs): """Return to the information needed by pytorch engine.""" prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start, chat_template_kwargs) @@ -129,7 +129,7 @@ def build_model(self): pass @torch.no_grad() - def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]: + def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]: # TODO: implement for turbomind pass @@ -138,7 +138,7 @@ def to_turbomind(self, chat_template, tokenizer, sequence_start, - chat_template_kwargs: Optional[Dict] = None, + chat_template_kwargs: dict | None = None, **kwargs): # TODO: implement for turbomind pass diff --git a/lmdeploy/vl/model/utils.py b/lmdeploy/vl/model/utils.py index e584c54c71..9791d8a9ba 100644 --- a/lmdeploy/vl/model/utils.py +++ b/lmdeploy/vl/model/utils.py @@ -1,8 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. import inspect +from collections.abc import Callable, MutableSequence from contextlib import contextmanager -from typing import Callable, MutableSequence import torch diff --git a/lmdeploy/vl/model/xcomposer2.py b/lmdeploy/vl/model/xcomposer2.py index cff3b808f5..2117b151dc 100644 --- a/lmdeploy/vl/model/xcomposer2.py +++ b/lmdeploy/vl/model/xcomposer2.py @@ -5,7 +5,7 @@ import sys import warnings from contextlib import contextmanager -from typing import Any, Dict, List, Tuple +from typing import Any import torch from PIL.Image import Image @@ -35,7 +35,7 @@ class ModelType(enum.Enum): XCOMPOSER2D5 = enum.auto() -def get_xcomposer_type(model_path: str) -> Tuple[ModelType, Any]: +def get_xcomposer_type(model_path: str) -> tuple[ModelType, Any]: """Get xcomposer type.""" from transformers.dynamic_module_utils import get_class_from_dynamic_module match_modules = { @@ -90,7 +90,7 @@ class Xcomposer2VisionModel(VisionModel): def __init__(self, model_path: str, with_llm: bool = False, - max_memory: Dict[int, int] = None, + max_memory: dict[int, int] = None, hf_config: AutoConfig = None, backend: str = ''): model_path = model_path.rstrip(os.sep) @@ -180,7 +180,7 @@ def build_model(self): self.model = model.eval() - def _preprocess_2d5(self, image: Image, params: Dict) -> Dict: + def _preprocess_2d5(self, image: Image, params: dict) -> dict: """Image preprocessing for internlm-xcomposer2d5-7b.""" hd_num = params.get('hd_num', 24) image = self.HD_transform(image, hd_num=hd_num) @@ -190,12 +190,12 @@ def _preprocess_2d5(self, image: Image, params: Dict) -> Dict: n_token_per_image = int((h * w + 1) * 400 + 1 + (h + 1) * 20) return pixel_values, n_token_per_image - def _preprocess_7b(self, image: Image, params: Dict) -> Dict: + def _preprocess_7b(self, image: Image, params: dict) -> dict: """Image preprocessing for internlm-xcomposer2-7b.""" pixel_values = self.vis_processor(image).unsqueeze(0).half() return pixel_values, 256 - def _preprocess_4khd_7b(self, image: Image, params: Dict) -> Dict: + def _preprocess_4khd_7b(self, image: Image, params: dict) -> dict: """Image preprocessing for internlm-xcomposer2-4khd-7b.""" image = self.HD_transform(image, hd_num=25) pixel_values = self.vis_processor(image).unsqueeze(0).half() @@ -204,7 +204,7 @@ def _preprocess_4khd_7b(self, image: Image, params: Dict) -> Dict: n_token_per_image = int((h * w + 1) * 144 + 1 + (h + 1) * 12) return pixel_values, n_token_per_image - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Refer to `super().preprocess() for spec.""" images = self.collect_images(messages) outputs = [] @@ -220,12 +220,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]: return messages @torch.no_grad() - def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]: + def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]: """Extract image feature. ONLY implement it when the backend is turbomind engine. Args: - messages(List[Dict]): the outputs of `preprocess` + messages(list[dict]): the outputs of `preprocess` max_batch_size(int): the max batch size when forwarding vision model Return: diff --git a/lmdeploy/vl/model/yi.py b/lmdeploy/vl/model/yi.py index 02dd1c83e5..26e52036fb 100644 --- a/lmdeploy/vl/model/yi.py +++ b/lmdeploy/vl/model/yi.py @@ -3,7 +3,6 @@ import os from contextlib import contextmanager from os import path as osp -from typing import Dict, List import torch.nn as nn from transformers import AutoConfig @@ -117,7 +116,7 @@ def build_model(self): with init_yi_model(), disable_transformers_logging(): super().build_model() - def preprocess(self, messages: List[Dict]) -> List[Dict]: + def preprocess(self, messages: list[dict]) -> list[dict]: """Refer to `super().preprocess() for spec.""" images = self.collect_images(messages) outputs = [] diff --git a/lmdeploy/vl/time_series_utils.py b/lmdeploy/vl/time_series_utils.py index 5651f65697..bc0b2c0136 100644 --- a/lmdeploy/vl/time_series_utils.py +++ b/lmdeploy/vl/time_series_utils.py @@ -136,7 +136,7 @@ def _load_csv(source: bytes | str) -> np.ndarray: if isinstance(source, bytes): text = source.decode('utf-8') else: - with open(source, 'r', newline='') as f: + with open(source, newline='') as f: text = f.read() # Parse CSV diff --git a/lmdeploy/vl/utils.py b/lmdeploy/vl/utils.py index a089d06ad7..6a68ae3c90 100644 --- a/lmdeploy/vl/utils.py +++ b/lmdeploy/vl/utils.py @@ -1,7 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import os from io import BytesIO -from typing import Union import pybase64 import requests @@ -12,7 +11,7 @@ logger = get_logger('lmdeploy') -def encode_image_base64(image: Union[str, Image.Image]) -> str: +def encode_image_base64(image: str | Image.Image) -> str: """Encode raw data to base64 format.""" buffered = BytesIO() FETCH_TIMEOUT = int(os.environ.get('LMDEPLOY_FETCH_TIMEOUT', 10)) @@ -44,12 +43,12 @@ def encode_image_base64(image: Union[str, Image.Image]) -> str: return res -def load_image_from_base64(image: Union[bytes, str]) -> Image.Image: +def load_image_from_base64(image: bytes | str) -> Image.Image: """Load image from base64 format.""" return Image.open(BytesIO(pybase64.b64decode(image))) -def load_image(image_url: Union[str, Image.Image]) -> Image.Image: +def load_image(image_url: str | Image.Image) -> Image.Image: """Load image from url, local path or openai GPT4V.""" FETCH_TIMEOUT = int(os.environ.get('LMDEPLOY_FETCH_TIMEOUT', 10)) headers = { diff --git a/pyproject.toml b/pyproject.toml index d16c624fbb..43b200dd4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,3 +3,20 @@ requires = [ "cmake_build_extension", ] build-backend = "setuptools.build_meta" + +[tool.ruff] +line-length = 120 +target-version = "py310" +extend-exclude = [ + "third_party", + "src/turbomind", +] + +[tool.ruff.lint] +select = [ + "E", "F", "I", "W", + "UP", +] +ignore = [ + "E231", "E741" +] diff --git a/setup.py b/setup.py index e23be7630b..42012da815 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ def readme(): def get_version(): file_path = os.path.join(pwd, version_file) pattern = re.compile(r"\s*__version__\s*=\s*'([0-9A-Za-z.-]+)'") - with open(file_path, 'r') as f: + with open(file_path) as f: for line in f: m = pattern.match(line) if m: @@ -64,7 +64,7 @@ def parse_requirements(fname='requirements.txt', with_version=True): with_version (bool, default=False): if True include version specs Returns: - List[str]: list of requirements items + list[str]: list of requirements items CommandLine: python -c "import setup; print(setup.parse_requirements())" @@ -104,12 +104,11 @@ def parse_line(line): yield info def parse_require_file(fpath): - with open(fpath, 'r') as f: + with open(fpath) as f: for line in f.readlines(): line = line.strip() if line and not line.startswith('#'): - for info in parse_line(line): - yield info + yield from parse_line(line) def gen_packages_items(): if os.path.exists(require_fpath): @@ -130,7 +129,7 @@ def gen_packages_items(): return packages -if get_target_device() == 'cuda' and not os.getenv('DISABLE_TURBOMIND', '').lower() in ('yes', 'true', 'on', 't', '1'): +if get_target_device() == 'cuda' and os.getenv('DISABLE_TURBOMIND', '').lower() not in ('yes', 'true', 'on', 't', '1'): import cmake_build_extension ext_modules = [ diff --git a/tests/pytorch/engine/test_logits_process.py b/tests/pytorch/engine/test_logits_process.py index d635ddcf10..01c6f9777c 100644 --- a/tests/pytorch/engine/test_logits_process.py +++ b/tests/pytorch/engine/test_logits_process.py @@ -1,7 +1,12 @@ # yapf: disable import torch -from transformers.generation.logits_process import (MinPLogitsWarper, RepetitionPenaltyLogitsProcessor, - TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper) +from transformers.generation.logits_process import ( + MinPLogitsWarper, + RepetitionPenaltyLogitsProcessor, + TemperatureLogitsWarper, + TopKLogitsWarper, + TopPLogitsWarper, +) # yapf: enable diff --git a/tests/test_lmdeploy/test_harmony_gpt_oss_parser.py b/tests/test_lmdeploy/test_harmony_gpt_oss_parser.py index e1e5efeb71..7624ff4d17 100644 --- a/tests/test_lmdeploy/test_harmony_gpt_oss_parser.py +++ b/tests/test_lmdeploy/test_harmony_gpt_oss_parser.py @@ -4,7 +4,7 @@ import sys import time import types -from typing import Generator, List +from collections.abc import Generator import pytest import shortuuid @@ -114,11 +114,15 @@ def process(self, token): self.last_content_delta = chr(token) -def _chat_completion_v1(request, token_chunks: List[List[int]]): +def _chat_completion_v1(request, token_chunks: list[list[int]]): from lmdeploy.serve.openai.harmony_utils import GptOssChatParser - from lmdeploy.serve.openai.protocol import (ChatCompletionResponse, ChatCompletionResponseChoice, - ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, - UsageInfo) + from lmdeploy.serve.openai.protocol import ( + ChatCompletionResponse, + ChatCompletionResponseChoice, + ChatCompletionResponseStreamChoice, + ChatCompletionStreamResponse, + UsageInfo, + ) request_id = f'chat-{shortuuid.random()}' created_time = int(time.time()) @@ -147,7 +151,7 @@ def completion_stream_generator() -> Generator['ChatCompletionStreamResponse', N return completion_stream_generator() # Non-stream path: parse all tokens at once using parse_full - tokens: List[int] = [] + tokens: list[int] = [] for c in token_chunks: tokens.extend(c) message = parser.parse_full(tokens) @@ -160,7 +164,7 @@ def completion_stream_generator() -> Generator['ChatCompletionStreamResponse', N usage=UsageInfo()) -def _stream_parse(request, token_chunks: List[List[int]]): +def _stream_parse(request, token_chunks: list[list[int]]): from lmdeploy.serve.openai.protocol import DeltaMessage content = '' @@ -190,7 +194,7 @@ def _stream_parse(request, token_chunks: List[List[int]]): return content, reasoning_content, tool_calls -def _t(s: str) -> List[int]: +def _t(s: str) -> list[int]: return [ord(c) for c in s] @@ -223,7 +227,7 @@ def _t(s: str) -> List[int]: @pytest.mark.parametrize(('token_chunks', 'expects'), [ (TOKENS_SINGLE_CALL_TWO_CHUNKS, [TestExpects('get_weather', 'Paris, France')]), ]) -def test_parser_stream_basic(token_chunks: List[List[int]], expects: List[TestExpects]): +def test_parser_stream_basic(token_chunks: list[list[int]], expects: list[TestExpects]): from lmdeploy.serve.openai.protocol import ChatCompletionRequest _install_openai_harmony_stub() @@ -274,7 +278,7 @@ def test_parser_stream_interleaved_channels(): (TOKENS_TWO_CALLS_SAME_FUNC, [TestExpects('get_weather', 'Tokyo'), TestExpects('get_weather', 'Kyoto')]), ]) -def test_parser_stream_two_calls_same_func(token_chunks: List[List[int]], expects: List[TestExpects]): +def test_parser_stream_two_calls_same_func(token_chunks: list[list[int]], expects: list[TestExpects]): from lmdeploy.serve.openai.protocol import ChatCompletionRequest _install_openai_harmony_stub() @@ -307,7 +311,7 @@ def test_open_tool_call_no_args(): (TOKENS_TWO_CALLS_SAME_FUNC, [TestExpects('get_weather', 'Tokyo'), TestExpects('get_weather', 'Kyoto')]), ]) -def test_parser_nonstream(token_chunks: List[List[int]], expects: List[TestExpects]): +def test_parser_nonstream(token_chunks: list[list[int]], expects: list[TestExpects]): from lmdeploy.serve.openai.protocol import ChatCompletionRequest _install_openai_harmony_stub() diff --git a/tests/test_lmdeploy/test_lite/test_quantization/test_utils/test_cal_qparams.py b/tests/test_lmdeploy/test_lite/test_quantization/test_utils/test_cal_qparams.py index dadc5478e0..51f912b057 100644 --- a/tests/test_lmdeploy/test_lite/test_quantization/test_utils/test_cal_qparams.py +++ b/tests/test_lmdeploy/test_lite/test_quantization/test_utils/test_cal_qparams.py @@ -1,9 +1,14 @@ # yapf: disable import torch -from lmdeploy.lite.utils import (cal_qparams_per_channel_absmax, cal_qparams_per_channel_minmax, - cal_qparams_per_group_absmax, cal_qparams_per_group_minmax, - cal_qparams_per_tensor_absmax, cal_qparams_per_tensor_minmax) +from lmdeploy.lite.utils import ( + cal_qparams_per_channel_absmax, + cal_qparams_per_channel_minmax, + cal_qparams_per_group_absmax, + cal_qparams_per_group_minmax, + cal_qparams_per_tensor_absmax, + cal_qparams_per_tensor_minmax, +) # yapf: enable diff --git a/tests/test_lmdeploy/test_messages.py b/tests/test_lmdeploy/test_messages.py index 1948e80b9f..1fdf73402f 100644 --- a/tests/test_lmdeploy/test_messages.py +++ b/tests/test_lmdeploy/test_messages.py @@ -1,4 +1,3 @@ -from typing import List import pytest @@ -12,7 +11,7 @@ def test_engine_generation_config(): stop_token_ids = tokenizer.encode('', add_bos=False) config.convert_stop_bad_words_to_ids(tokenizer) assert stop_token_ids == config.stop_token_ids - assert isinstance(config.stop_token_ids, List) and \ + assert isinstance(config.stop_token_ids, list) and \ isinstance(config.stop_token_ids[0], int) diff --git a/tests/test_lmdeploy/test_qwen3_parser.py b/tests/test_lmdeploy/test_qwen3_parser.py index 3a837d73a3..b3d52b47b6 100644 --- a/tests/test_lmdeploy/test_qwen3_parser.py +++ b/tests/test_lmdeploy/test_qwen3_parser.py @@ -1,15 +1,23 @@ import collections import json import time -from typing import Generator, List, Tuple, Union +from collections.abc import Generator import pytest import shortuuid from lmdeploy.serve.openai.api_server import VariableInterface -from lmdeploy.serve.openai.protocol import (ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice, - ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, - ChatMessage, DeltaMessage, DeltaToolCall, UsageInfo) +from lmdeploy.serve.openai.protocol import ( + ChatCompletionRequest, + ChatCompletionResponse, + ChatCompletionResponseChoice, + ChatCompletionResponseStreamChoice, + ChatCompletionStreamResponse, + ChatMessage, + DeltaMessage, + DeltaToolCall, + UsageInfo, +) from lmdeploy.serve.openai.reasoning_parser.qwen_qwq_reasoning_parser import QwenQwQReasoningParser from lmdeploy.serve.openai.tool_parser.qwen3_parser import Qwen3ToolParser @@ -18,10 +26,10 @@ class DummyTokenizer: - def decode(self, token_ids: List[int]) -> str: + def decode(self, token_ids: list[int]) -> str: return ' '.join(map(str, token_ids)) - def encode(self, text: str) -> List[int]: + def encode(self, text: str) -> list[int]: return [ord(c) for c in text] @@ -174,7 +182,7 @@ def encode(self, text: str) -> List[int]: def _chat_completion_v1( request: ChatCompletionRequest, - text_sequence: List[str]) -> Union[ChatCompletionResponse, Generator[ChatCompletionStreamResponse, None, None]]: + text_sequence: list[str]) -> ChatCompletionResponse | Generator[ChatCompletionStreamResponse, None, None]: request_id = f'chat-{shortuuid.random()}' created_time = int(time.time()) model_name = request.model @@ -239,7 +247,7 @@ def completion_stream_generator() -> Generator[ChatCompletionStreamResponse, Non if request.tool_choice != 'none' and VariableInterface.tool_parser is not None: tool_call_info = VariableInterface.tool_parser.extract_tool_calls(text, request=request) text, tool_calls = tool_call_info.content, tool_call_info.tool_calls - if isinstance(tool_calls, List) and len(tool_calls): + if isinstance(tool_calls, list) and len(tool_calls): if finish_reason == 'stop': finish_reason = 'tool_calls' @@ -263,7 +271,7 @@ def completion_stream_generator() -> Generator[ChatCompletionStreamResponse, Non ) -def _stream_parse(request: ChatCompletionRequest, text_sequence: List[str]) -> Tuple[str, str, List[DeltaToolCall]]: +def _stream_parse(request: ChatCompletionRequest, text_sequence: list[str]) -> tuple[str, str, list[DeltaToolCall]]: # Call parser.extract_tool_calls_streaming with delta_text specified in `DELTA_TEXT_SEQUENCE`. # `current_text` and `previous_text` init values and update logic # can be found in lmdeploy/serve/openai/api_server.py:455-523. @@ -297,7 +305,7 @@ def _stream_parse(request: ChatCompletionRequest, text_sequence: List[str]) -> T (DELTA_TEXT_SEQUENCE_MULTIPLE_CALLS, [TestExpects('get_weather', '北京'), TestExpects('get_weather', '上海')]), ]) -def test_parser_stream(text_sequence: List[str], expects: List[TestExpects]): +def test_parser_stream(text_sequence: list[str], expects: list[TestExpects]): tokenizer = DummyTokenizer() VariableInterface.tool_parser = Qwen3ToolParser(tokenizer=tokenizer) VariableInterface.reasoning_parser = QwenQwQReasoningParser(tokenizer=tokenizer) @@ -317,7 +325,7 @@ def test_parser_stream(text_sequence: List[str], expects: List[TestExpects]): (DELTA_TEXT_SEQUENCE_MULTIPLE_CALLS, [TestExpects('get_weather', '北京'), TestExpects('get_weather', '上海')]), ]) -def test_parser_nonstream(text_sequence: List[str], expects: List[TestExpects]): +def test_parser_nonstream(text_sequence: list[str], expects: list[TestExpects]): tokenizer = DummyTokenizer() VariableInterface.tool_parser = Qwen3ToolParser(tokenizer=tokenizer) VariableInterface.reasoning_parser = QwenQwQReasoningParser(tokenizer=tokenizer) diff --git a/tests/test_lmdeploy/test_qwen3coder_parser.py b/tests/test_lmdeploy/test_qwen3coder_parser.py index b84735a40c..b2b08d4cbb 100644 --- a/tests/test_lmdeploy/test_qwen3coder_parser.py +++ b/tests/test_lmdeploy/test_qwen3coder_parser.py @@ -1,15 +1,23 @@ import collections import json import time -from typing import Generator, List, Tuple, Union +from collections.abc import Generator import pytest import shortuuid from lmdeploy.serve.openai.api_server import VariableInterface -from lmdeploy.serve.openai.protocol import (ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice, - ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, - ChatMessage, DeltaMessage, DeltaToolCall, UsageInfo) +from lmdeploy.serve.openai.protocol import ( + ChatCompletionRequest, + ChatCompletionResponse, + ChatCompletionResponseChoice, + ChatCompletionResponseStreamChoice, + ChatCompletionStreamResponse, + ChatMessage, + DeltaMessage, + DeltaToolCall, + UsageInfo, +) from lmdeploy.serve.openai.tool_parser.qwen3coder_parser import Qwen3CoderToolParser TestExpects = collections.namedtuple('TestExpects', 'func_name kwargs') @@ -17,10 +25,10 @@ class DummyTokenizer: - def decode(self, token_ids: List[int]) -> str: + def decode(self, token_ids: list[int]) -> str: return ' '.join(map(str, token_ids)) - def encode(self, text: str) -> List[int]: + def encode(self, text: str) -> list[int]: return [ord(c) for c in text] @@ -55,7 +63,7 @@ def encode(self, text: str) -> List[int]: def _chat_completion_v1( request: ChatCompletionRequest, - text_sequence: List[str]) -> Union[ChatCompletionResponse, Generator[ChatCompletionStreamResponse, None, None]]: + text_sequence: list[str]) -> ChatCompletionResponse | Generator[ChatCompletionStreamResponse, None, None]: request_id = f'chat-{shortuuid.random()}' created_time = int(time.time()) model_name = request.model @@ -121,7 +129,7 @@ def completion_stream_generator() -> Generator[ChatCompletionStreamResponse, Non if request.tool_choice != 'none' and has_tool: tool_call_info = VariableInterface.tool_parser.extract_tool_calls(text, request=request) text, tool_calls = tool_call_info.content, tool_call_info.tool_calls - if isinstance(tool_calls, List) and len(tool_calls): + if isinstance(tool_calls, list) and len(tool_calls): if finish_reason == 'stop': finish_reason = 'tool_calls' @@ -146,7 +154,7 @@ def completion_stream_generator() -> Generator[ChatCompletionStreamResponse, Non ) -def _stream_parse(request: ChatCompletionRequest, text_sequence: List[str]) -> Tuple[str, str, List[DeltaToolCall]]: +def _stream_parse(request: ChatCompletionRequest, text_sequence: list[str]) -> tuple[str, str, list[DeltaToolCall]]: content = '' reasoning_content = '' tool_calls = {} @@ -185,7 +193,7 @@ def _stream_parse(request: ChatCompletionRequest, text_sequence: List[str]) -> T TestExpects('get_weather', {'location': '上海'}) ]), ]) -def test_parser_stream(text_sequence: List[str], expects: List[TestExpects]): +def test_parser_stream(text_sequence: list[str], expects: list[TestExpects]): tokenizer = DummyTokenizer() VariableInterface.tool_parser = Qwen3CoderToolParser(tokenizer=tokenizer) VariableInterface.reasoning_parser = None @@ -212,7 +220,7 @@ def test_parser_stream(text_sequence: List[str], expects: List[TestExpects]): TestExpects('get_weather', {'location': '上海'}) ]), ]) -def test_parser_nonstream(text_sequence: List[str], expects: List[TestExpects]): +def test_parser_nonstream(text_sequence: list[str], expects: list[TestExpects]): tokenizer = DummyTokenizer() VariableInterface.tool_parser = Qwen3CoderToolParser(tokenizer=tokenizer) VariableInterface.reasoning_parser = None diff --git a/tests/test_lmdeploy/test_turbomind/test_converter.py b/tests/test_lmdeploy/test_turbomind/test_converter.py index 4cbb5dc1e1..e91e0fefaa 100644 --- a/tests/test_lmdeploy/test_turbomind/test_converter.py +++ b/tests/test_lmdeploy/test_turbomind/test_converter.py @@ -1,8 +1,10 @@ # yapf: disable from lmdeploy import TurbomindEngineConfig from lmdeploy.turbomind import update_parallel_config -from lmdeploy.turbomind.deploy.converter import (get_input_model_registered_name, - get_output_model_registered_name_and_config) +from lmdeploy.turbomind.deploy.converter import ( + get_input_model_registered_name, + get_output_model_registered_name_and_config, +) from lmdeploy.turbomind.deploy.source_model.base import INPUT_MODELS # yapf: enable diff --git a/tests/test_lmdeploy/test_vl/test_nonhf_chat_template.py b/tests/test_lmdeploy/test_vl/test_nonhf_chat_template.py index 7dad2f4e09..1b575e574d 100644 --- a/tests/test_lmdeploy/test_vl/test_nonhf_chat_template.py +++ b/tests/test_lmdeploy/test_vl/test_nonhf_chat_template.py @@ -93,7 +93,8 @@ def mock_messages(self): @pytest.fixture(scope='module') def mock_IMAGE_TOKEN_messages(self): return [ - dict(role='system', content='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。'), + dict(role='system', content='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、' \ + '清华大学及多家合作单位联合开发的多模态大语言模型。'), dict(role='user', content=[ dict(type='text', text='\nDescribe the following images in detail'),