diff --git a/.github/scripts/action_tools.py b/.github/scripts/action_tools.py
index b21ded4bc4..e296c4d760 100644
--- a/.github/scripts/action_tools.py
+++ b/.github/scripts/action_tools.py
@@ -7,14 +7,13 @@
 import subprocess
 import time
 from collections import OrderedDict
-from typing import List
 
 import fire
 import pandas as pd
 from mmengine.config import Config
 
 
-def run_cmd(cmd_lines: List[str], log_path: str, cwd: str = None):
+def run_cmd(cmd_lines: list[str], log_path: str, cwd: str = None):
     """
     Args:
         cmd_lines: (list[str]): A command in multiple line style.
@@ -43,7 +42,7 @@ def run_cmd(cmd_lines: List[str], log_path: str, cwd: str = None):
 
     if return_code != 0:
         logging.error(f'Got shell abnormal return code={return_code}')
-        with open(log_path, 'r') as f:
+        with open(log_path) as f:
             content = f.read()
             logging.error(f'Log error message\n{content}')
     return return_code
@@ -61,7 +60,7 @@ def add_summary(csv_path: str):
     Args:
         csv_path (str): Input csv file.
     """
-    with open(csv_path, 'r') as fr:
+    with open(csv_path) as fr:
         lines = fr.readlines()
         header = lines[0].strip().split(',')
         n_col = len(header)
@@ -75,8 +74,8 @@ def add_summary(csv_path: str):
         _append_summary('\n')
 
 
-def evaluate(models: List[str],
-             datasets: List[str],
+def evaluate(models: list[str],
+             datasets: list[str],
              workspace: str,
              evaluate_type: str,
              max_num_workers: int = 8,
@@ -146,12 +145,12 @@ def evaluate(models: List[str],
         # print csv_txt to screen
         csv_txt = csv_file.replace('.csv', '.txt')
         if os.path.exists(csv_txt):
-            with open(csv_txt, 'r') as f:
+            with open(csv_txt) as f:
                 print(f.read())
 
         # parse evaluation results from csv file
         model_results = OrderedDict()
-        with open(csv_file, 'r') as f:
+        with open(csv_file) as f:
             lines = f.readlines()
             for line in lines[1:]:
                 row = line.strip().split(',')
@@ -160,7 +159,7 @@ def evaluate(models: List[str],
                     model_results[row[0]] = row[-1]
         crows_pairs_json = glob.glob(os.path.join(work_dir, '*/results/*/crows_pairs.json'), recursive=True)
         if len(crows_pairs_json) == 1:
-            with open(crows_pairs_json[0], 'r') as f:
+            with open(crows_pairs_json[0]) as f:
                 acc = json.load(f)['accuracy']
                 acc = f'{float(acc):.2f}'  # noqa E231
                 model_results['crows_pairs'] = acc
@@ -238,9 +237,9 @@ def generate_benchmark_report(report_path: str):
 
                         grouped_df = merged_df.groupby(merged_df.columns[0])
                     if 'generation' not in backend_subfolder:
-                        average_values = grouped_df.pipe((lambda group: {
+                        average_values = grouped_df.pipe(lambda group: {
                             'mean': group.mean(numeric_only=True).round(decimals=3)
-                        }))['mean']
+                        })['mean']
                         average_values.to_csv(average_csv_path, index=True)
                         avg_df = pd.read_csv(average_csv_path)
                         merged_df = pd.concat([merged_df, avg_df], ignore_index=True)
@@ -253,7 +252,7 @@ def generate_benchmark_report(report_path: str):
 
 
 def generate_csv_from_profile_result(file_path: str, out_path: str):
-    with open(file_path, 'r') as f:
+    with open(file_path) as f:
         data = f.readlines()
         data = [json.loads(line) for line in data]
 
diff --git a/.github/scripts/doc_link_checker.py b/.github/scripts/doc_link_checker.py
index 8858b414dc..2b20d00f07 100644
--- a/.github/scripts/doc_link_checker.py
+++ b/.github/scripts/doc_link_checker.py
@@ -17,7 +17,7 @@ def make_parser():
 
 
 def analyze_doc(home, path):
-    print('analyze {}'.format(path))
+    print(f'analyze {path}')
     problem_list = []
     code_block = 0
     with open(path) as f:
diff --git a/.github/scripts/eval_base_config.py b/.github/scripts/eval_base_config.py
index 25e374639d..9b966f3b7c 100644
--- a/.github/scripts/eval_base_config.py
+++ b/.github/scripts/eval_base_config.py
@@ -11,39 +11,51 @@
     from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import cmmlu_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.crowspairs.crowspairs_ppl import crowspairs_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.drop.drop_gen_a2697c import drop_datasets  # noqa: F401, E501
+
     # Corebench v1.7
-    from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \
-        GaokaoBench_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import (
+        GaokaoBench_datasets,  # noqa: F401, E501
+    )
     from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b5a83 import gpqa_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import gsm8k_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \
-        hellaswag_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.humaneval.internal_humaneval_gen_ce6b06 import \
-        humaneval_datasets as humaneval_v2_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.humaneval.internal_humaneval_gen_d2537e import \
-        humaneval_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import (
+        hellaswag_datasets,  # noqa: F401, E501
+    )
+    from opencompass.configs.datasets.humaneval.internal_humaneval_gen_ce6b06 import (
+        humaneval_datasets as humaneval_v2_datasets,  # noqa: F401, E501
+    )
+    from opencompass.configs.datasets.humaneval.internal_humaneval_gen_d2537e import (
+        humaneval_datasets,  # noqa: F401, E501
+    )
     from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import math_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \
-        mathbench_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import (
+        mathbench_datasets,  # noqa: F401, E501
+    )
     from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import sanitized_mbpp_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import mmlu_pro_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.nq.nq_open_1shot_gen_20a989 import nq_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.race.race_few_shot_ppl import race_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_few_shot_ppl import \
-        BoolQ_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_few_shot_ppl import (
+        BoolQ_datasets,  # noqa: F401, E501
+    )
     from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import \
-        triviaqa_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.wikibench.wikibench_few_shot_ppl_c23d79 import \
-        wikibench_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
-        winogrande_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import (
+        triviaqa_datasets,  # noqa: F401, E501
+    )
+    from opencompass.configs.datasets.wikibench.wikibench_few_shot_ppl_c23d79 import (
+        wikibench_datasets,  # noqa: F401, E501
+    )
+    from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import (
+        winogrande_datasets,  # noqa: F401, E501
+    )
+
     # Summary Groups
     from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.GaokaoBench import GaokaoBench_summary_groups  # noqa: F401, E501
-    from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
-        mathbench_2024_summary_groups  # noqa: F401, E501
+    from opencompass.configs.summarizers.groups.mathbench_v1_2024 import (
+        mathbench_2024_summary_groups,  # noqa: F401, E501
+    )
     from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups  # noqa: F401, E501
 
diff --git a/.github/scripts/eval_chat_config.py b/.github/scripts/eval_chat_config.py
index cbf5c51766..26caa0b103 100644
--- a/.github/scripts/eval_chat_config.py
+++ b/.github/scripts/eval_chat_config.py
@@ -10,88 +10,120 @@
     from opencompass.configs.datasets.ceval.ceval_gen_2daf24 import ceval_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.crowspairs.crowspairs_gen_381af0 import crowspairs_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \
-        GaokaoBench_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import (
+        GaokaoBench_datasets,  # noqa: F401, E501
+    )
     from opencompass.configs.datasets.gpqa.gpqa_gen_4baadb import gpqa_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
-        hellaswag_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import (
+        hellaswag_datasets,  # noqa: F401, E501
+    )
     from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_a0fc46 import sanitized_mbpp_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
-        mmlu_pro_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import (
+        mmlu_pro_datasets,  # noqa: F401, E501
+    )
     from opencompass.configs.datasets.nq.nq_open_1shot_gen_01cf41 import nq_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_eaf81e import \
-        triviaqa_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import \
-        winogrande_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_eaf81e import (
+        triviaqa_datasets,  # noqa: F401, E501
+    )
+    from opencompass.configs.datasets.winogrande.winogrande_5shot_gen_b36770 import (
+        winogrande_datasets,  # noqa: F401, E501
+    )
+
     # read models
-    from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import \
-        models as hf_baichuan2_chat_7b  # noqa: F401, E501
+    from opencompass.configs.models.baichuan.hf_baichuan2_7b_chat import (
+        models as hf_baichuan2_chat_7b,  # noqa: F401, E501
+    )
     from opencompass.configs.models.gemma.hf_gemma2_9b_it import models as hf_gemma2_9b_it  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
-        models as hf_internlm2_5_7b_chat  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \
-        models as hf_internlm2_5_20b_chat  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import \
-        models as hf_internlm2_chat_7b  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import \
-        models as hf_internlm2_chat_20b  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
-        models as lmdeploy_internlm2_5_7b_chat  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \
-        models as lmdeploy_internlm2_5_20b_chat  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import \
-        models as lmdeploy_internlm2_chat_7b  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_20b import \
-        models as lmdeploy_internlm2_chat_20b  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \
-        models as lmdeploy_internlm3_8b_instruct  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm_chat_7b import \
-        models as lmdeploy_internlm_chat_7b  # noqa: F401, E501
+    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import (
+        models as hf_internlm2_5_7b_chat,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import (
+        models as hf_internlm2_5_20b_chat,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_7b import (
+        models as hf_internlm2_chat_7b,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_internlm.hf_internlm2_chat_20b import (
+        models as hf_internlm2_chat_20b,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
+        models as lmdeploy_internlm2_5_7b_chat,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import (
+        models as lmdeploy_internlm2_5_20b_chat,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import (
+        models as lmdeploy_internlm2_chat_7b,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_20b import (
+        models as lmdeploy_internlm2_chat_20b,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import (
+        models as lmdeploy_internlm3_8b_instruct,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm_chat_7b import (
+        models as lmdeploy_internlm_chat_7b,  # noqa: F401, E501
+    )
     from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import models as hf_llama2_chat_7b  # noqa: F401, E501
-    from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \
-        models as hf_llama3_1_8b_instruct  # noqa: F401, E501
-    from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
-        models as hf_llama_3_8b_instruct  # noqa: F401, E501
-    from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import \
-        models as lmdeploy_llama2_7b_chat  # noqa: F401, E501
-    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
-        models as lmdeploy_llama3_1_8b_instruct  # noqa: F401, E501
-    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
-        models as lmdeploy_llama3_8b_instruct  # noqa: F401, E501
-    from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_1 import \
-        models as hf_mistral_chat_7b  # noqa: F401, E501
-    from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
-        models as hf_mixtral_chat_8x7b  # noqa: F401, E501
-    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import \
-        models as lmdeploy_qwen2_5_7b_instruct  # noqa: F401, E501
-    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import \
-        models as lmdeploy_qwen2_5_32b_instruct  # noqa: F401, E501
+    from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import (
+        models as hf_llama3_1_8b_instruct,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import (
+        models as hf_llama_3_8b_instruct,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import (
+        models as lmdeploy_llama2_7b_chat,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import (
+        models as lmdeploy_llama3_1_8b_instruct,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import (
+        models as lmdeploy_llama3_8b_instruct,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_1 import (
+        models as hf_mistral_chat_7b,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import (
+        models as hf_mixtral_chat_8x7b,  # noqa: F401, E501
+    )
     from opencompass.configs.models.qwen.hf_qwen1_5_7b_chat import models as hf_qwen1_5_chat_7b  # noqa: F401, E501
-    from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b_chat import \
-        models as hf_qwen1_5_moe_a2_7b_chat  # noqa: F401, E501
+    from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b_chat import (
+        models as hf_qwen1_5_moe_a2_7b_chat,  # noqa: F401, E501
+    )
     from opencompass.configs.models.qwen.hf_qwen2_7b_instruct import models as hf_qwen2_7b_instruct  # noqa: F401, E501
     from opencompass.configs.models.qwen.hf_qwen_7b_chat import models as hf_qwen_chat_7b  # noqa: F401, E501
-    from opencompass.configs.models.qwen.lmdeploy_qwen1_5_7b_chat import \
-        models as lmdeploy_qwen1_5_7b_chat  # noqa: F401, E501
-    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
-        models as lmdeploy_qwen2_7b_instruct  # noqa: F401, E501
-    from opencompass.configs.models.qwen.lmdeploy_qwen_7b_chat import \
-        models as lmdeploy_qwen_7b_chat  # noqa: F401, E501
+    from opencompass.configs.models.qwen.lmdeploy_qwen1_5_7b_chat import (
+        models as lmdeploy_qwen1_5_7b_chat,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
+        models as lmdeploy_qwen2_7b_instruct,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.qwen.lmdeploy_qwen_7b_chat import (
+        models as lmdeploy_qwen_7b_chat,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
+        models as lmdeploy_qwen2_5_7b_instruct,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import (
+        models as lmdeploy_qwen2_5_32b_instruct,  # noqa: F401, E501
+    )
+
     # Summary Groups
     from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.cmmlu import cmmlu_summary_groups  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.ds1000 import ds1000_summary_groups  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.GaokaoBench import GaokaoBench_summary_groups  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.humanevalx import humanevalx_summary_groups  # noqa: F401, E501
-    from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
-        mathbench_2024_summary_groups  # noqa: F401, E501
+    from opencompass.configs.summarizers.groups.mathbench_v1_2024 import (
+        mathbench_2024_summary_groups,  # noqa: F401, E501
+    )
     from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.mmlu_pro import mmlu_pro_summary_groups  # noqa: F401, E501
     from opencompass.configs.summarizers.groups.scicode import scicode_summary_groups  # noqa: F401, E501
diff --git a/.github/scripts/eval_regression_base_models.py b/.github/scripts/eval_regression_base_models.py
index 235ac812a0..26c1cccccd 100644
--- a/.github/scripts/eval_regression_base_models.py
+++ b/.github/scripts/eval_regression_base_models.py
@@ -7,41 +7,57 @@
     from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import gsm8k_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.race.race_ppl import race_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
-        winogrande_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import (
+        winogrande_datasets,  # noqa: F401, E501
+    )
+
     # read hf models - chat models
     from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import models as lmdeploy_glm4_9b_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
-        models as lmdeploy_deepseek_7b_base_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \
-        models as lmdeploy_deepseek_67b_base_model  # noqa: F401, E501
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import (
+        models as lmdeploy_deepseek_7b_base_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import (
+        models as lmdeploy_deepseek_67b_base_model,  # noqa: F401, E501
+    )
     from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2 import lmdeploy_deepseek_v2_model  # noqa: F401, E501
     from opencompass.configs.models.gemma.lmdeploy_gemma_9b import models as pytorch_gemma_9b_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
-        models as lmdeploy_internlm2_1_8b_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
-        models as lmdeploy_internlm2_5_7b_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_20b import \
-        models as lmdeploy_internlm2_20b_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
-        models as lmdeploy_internlm2_base_7b_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
-        models as lmdeploy_llama3_1_8b_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
-        models as lmdeploy_llama3_8b_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b import \
-        models as lmdeploy_llama3_70b_model  # noqa: F401, E501
-    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \
-        models as lmdeploy_qwen2_5_1_5b_model  # noqa: F401, E501
-    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import \
-        models as lmdeploy_qwen2_5_7b_model  # noqa: F401, E501
-    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b import \
-        models as lmdeploy_qwen2_5_32b_model  # noqa: F401, E501
-    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b import \
-        models as lmdeploy_qwen2_5_72b_model  # noqa: F401, E501
-    from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b import \
-        models as lmdeploy_qwen2_1_5b_model  # noqa: F401, E501
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import (
+        models as lmdeploy_internlm2_1_8b_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import (
+        models as lmdeploy_internlm2_5_7b_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_20b import (
+        models as lmdeploy_internlm2_20b_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import (
+        models as lmdeploy_internlm2_base_7b_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import (
+        models as lmdeploy_llama3_1_8b_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import (
+        models as lmdeploy_llama3_8b_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b import (
+        models as lmdeploy_llama3_70b_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b import (
+        models as lmdeploy_qwen2_1_5b_model,  # noqa: F401, E501
+    )
     from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import models as lmdeploy_qwen2_7b_model  # noqa: F401, E501
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import (
+        models as lmdeploy_qwen2_5_1_5b_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import (
+        models as lmdeploy_qwen2_5_7b_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b import (
+        models as lmdeploy_qwen2_5_32b_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b import (
+        models as lmdeploy_qwen2_5_72b_model,  # noqa: F401, E501
+    )
     from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b import models as lmdeploy_yi_1_5_9b_model  # noqa: F401, E501
 
     from .volc import infer as volc_infer  # noqa: F401, E501
diff --git a/.github/scripts/eval_regression_chat_models.py b/.github/scripts/eval_regression_chat_models.py
index de7edb8e2f..9495ca6b4f 100644
--- a/.github/scripts/eval_regression_chat_models.py
+++ b/.github/scripts/eval_regression_chat_models.py
@@ -7,71 +7,104 @@
     from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import ifeval_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.math.math_0shot_gen_11c4b5 import math_datasets  # noqa: F401, E501
+
     # read hf models - chat models
-    from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
-        models as lmdeploy_glm4_9b_chat_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_32b import \
-        models as lmdeploy_deepseek_r1_distill_qwen_32b_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \
-        models as lmdeploy_deepseek_v2_5_1210_model  # noqa: F401, E501
-    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_lite import \
-        models as lmdeploy_deepseek_v2_lite_model  # noqa: F401, E501
-    from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \
-        models as pytorch_gemma_9b_it_model  # noqa: F401, E501
-    from opencompass.configs.models.gemma.lmdeploy_gemma_27b_it import \
-        models as pytorch_gemma_27b_it_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
-        models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \
-        models as lmdeploy_internlm2_5_20b_chat_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \
-        models as lmdeploy_internlm2_chat_1_8b_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \
-        models as lmdeploy_internlm2_chat_1_8b_sft_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import \
-        models as lmdeploy_internlm2_chat_7b_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
-        models as lmdeploy_internlm2_chat_7b_sft_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \
-        models as lmdeploy_internlm3_8b_instruct_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import \
-        models as lmdeploy_llama2_7b_chat_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
-        models as lmdeploy_llama3_1_8b_instruct_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \
-        models as lmdeploy_llama3_2_3b_instruct_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_llama.lmdeploy_llama3_3_70b_instruct import \
-        models as lmdeploy_llama3_3_70b_instruct_model  # noqa: F401, E501
-    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
-        models as lmdeploy_llama3_8b_instruct_model  # noqa: F401, E501
-    from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
-        models as lmdeploy_mistral_large_instruct_2411_model  # noqa: F401, E501
-    from opencompass.configs.models.mistral.lmdeploy_mistral_nemo_instruct_2407 import \
-        models as lmdeploy_mistral_nemo_instruct_2407_model  # noqa: F401, E501
-    from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
-        models as lmdeploy_mistral_small_instruct_2409_model  # noqa: F401, E501
-    from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
-        models as lmdeploy_nemotron_70b_instruct_hf_model  # noqa: F401, E501
-    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_0_5b_instruct import \
-        models as lmdeploy_qwen2_5_0_5b_instruct_model  # noqa: F401, E501
-    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_3b_instruct import \
-        models as lmdeploy_qwen2_5_3b_instruct_model  # noqa: F401, E501
-    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \
-        models as lmdeploy_qwen2_5_14b_instruct_model  # noqa: F401, E501
-    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import \
-        models as lmdeploy_qwen2_5_32b_instruct_model  # noqa: F401, E501
-    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import \
-        models as lmdeploy_qwen2_5_72b_instruct_model  # noqa: F401, E501
-    from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \
-        models as lmdeploy_qwen2_1_5b_instruct_model  # noqa: F401, E501
-    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
-        models as lmdeploy_qwen2_7b_instruct_model  # noqa: F401, E501
-    from opencompass.configs.models.yi.lmdeploy_yi_1_5_6b_chat import \
-        models as lmdeploy_yi_1_5_6b_chat_model  # noqa: F401, E501
-    from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b_chat import \
-        models as lmdeploy_yi_1_5_9b_chat_model  # noqa: F401, E501
-    from opencompass.configs.models.yi.lmdeploy_yi_1_5_34b_chat import \
-        models as lmdeploy_yi_1_5_34b_chat_model  # noqa: F401, E501
+    from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import (
+        models as lmdeploy_glm4_9b_chat_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_32b import (
+        models as lmdeploy_deepseek_r1_distill_qwen_32b_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import (
+        models as lmdeploy_deepseek_v2_5_1210_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_lite import (
+        models as lmdeploy_deepseek_v2_lite_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import (
+        models as pytorch_gemma_9b_it_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.gemma.lmdeploy_gemma_27b_it import (
+        models as pytorch_gemma_27b_it_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
+        models as lmdeploy_internlm2_5_7b_chat_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import (
+        models as lmdeploy_internlm2_5_20b_chat_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import (
+        models as lmdeploy_internlm2_chat_1_8b_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import (
+        models as lmdeploy_internlm2_chat_1_8b_sft_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import (
+        models as lmdeploy_internlm2_chat_7b_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import (
+        models as lmdeploy_internlm2_chat_7b_sft_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import (
+        models as lmdeploy_internlm3_8b_instruct_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import (
+        models as lmdeploy_llama2_7b_chat_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import (
+        models as lmdeploy_llama3_1_8b_instruct_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import (
+        models as lmdeploy_llama3_2_3b_instruct_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_llama.lmdeploy_llama3_3_70b_instruct import (
+        models as lmdeploy_llama3_3_70b_instruct_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import (
+        models as lmdeploy_llama3_8b_instruct_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import (
+        models as lmdeploy_mistral_large_instruct_2411_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.mistral.lmdeploy_mistral_nemo_instruct_2407 import (
+        models as lmdeploy_mistral_nemo_instruct_2407_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import (
+        models as lmdeploy_mistral_small_instruct_2409_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import (
+        models as lmdeploy_nemotron_70b_instruct_hf_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import (
+        models as lmdeploy_qwen2_1_5b_instruct_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import (
+        models as lmdeploy_qwen2_7b_instruct_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_0_5b_instruct import (
+        models as lmdeploy_qwen2_5_0_5b_instruct_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_3b_instruct import (
+        models as lmdeploy_qwen2_5_3b_instruct_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import (
+        models as lmdeploy_qwen2_5_14b_instruct_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import (
+        models as lmdeploy_qwen2_5_32b_instruct_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import (
+        models as lmdeploy_qwen2_5_72b_instruct_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.yi.lmdeploy_yi_1_5_6b_chat import (
+        models as lmdeploy_yi_1_5_6b_chat_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b_chat import (
+        models as lmdeploy_yi_1_5_9b_chat_model,  # noqa: F401, E501
+    )
+    from opencompass.configs.models.yi.lmdeploy_yi_1_5_34b_chat import (
+        models as lmdeploy_yi_1_5_34b_chat_model,  # noqa: F401, E501
+    )
 
     from .volc import infer as volc_infer  # noqa: F401, E501
 
diff --git a/.github/scripts/eval_stable_object_config.py b/.github/scripts/eval_stable_object_config.py
index be20037806..328f25d75f 100644
--- a/.github/scripts/eval_stable_object_config.py
+++ b/.github/scripts/eval_stable_object_config.py
@@ -5,35 +5,43 @@
     # choose a list of datasets
     from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import ARC_c_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.CHARM.charm_reason_cot_only_gen_f7b7d3 import \
-        charm_reason_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.CHARM.charm_reason_cot_only_gen_f7b7d3 import (
+        charm_reason_datasets,  # noqa: F401, E501
+    )
     from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import cmmlu_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import drop_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import ds1000_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import gsm8k_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
-        hellaswag_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import \
-        humaneval_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import (
+        hellaswag_datasets,  # noqa: F401, E501
+    )
+    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_159614 import (
+        humaneval_datasets,  # noqa: F401, E501
+    )
     from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import humanevalx_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.LCBench.lcbench_gen_5ff288 import LCBench_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.math.math_0shot_gen_393424 import math_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import mathbench_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \
-        sanitized_mbpp_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import (
+        sanitized_mbpp_datasets,  # noqa: F401, E501
+    )
     from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import mmlu_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
-        mmlu_pro_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import (
+        mmlu_pro_datasets,  # noqa: F401, E501
+    )
     from opencompass.configs.datasets.race.race_cot_gen_d95929 import race_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.scicode.scicode_gen_085b98 import SciCode_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_cot_gen_1d56df import \
-        BoolQ_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import \
-        teval_datasets as teval_en_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import \
-        teval_datasets as teval_zh_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_cot_gen_1d56df import (
+        BoolQ_datasets,  # noqa: F401, E501
+    )
+    from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import (
+        teval_datasets as teval_en_datasets,  # noqa: F401, E501
+    )
+    from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import (
+        teval_datasets as teval_zh_datasets,  # noqa: F401, E501
+    )
     from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets  # noqa: F401, E501
     from opencompass.configs.datasets.wikibench.wikibench_gen_0978ad import wikibench_datasets  # noqa: F401, E501
 
diff --git a/.github/scripts/eval_stable_subject_config.py b/.github/scripts/eval_stable_subject_config.py
index c868c7b1e3..e829815221 100644
--- a/.github/scripts/eval_stable_subject_config.py
+++ b/.github/scripts/eval_stable_subject_config.py
@@ -6,19 +6,25 @@
 
 with read_base():
     # choose a list of datasets
-    from opencompass.configs.datasets.subjective.alignbench.alignbench_judgeby_critiquellm import \
-        alignbench_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import \
-        alpacav2_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import \
-        arenahard_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import \
-        compassarena_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.alignbench.alignbench_judgeby_critiquellm import (
+        alignbench_datasets,  # noqa: F401, E501
+    )
+    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import (
+        alpacav2_datasets,  # noqa: F401, E501
+    )
+    from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare import (
+        arenahard_datasets,  # noqa: F401, E501
+    )
+    from opencompass.configs.datasets.subjective.compassarena.compassarena_compare import (
+        compassarena_datasets,  # noqa: F401, E501
+    )
     from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge import fofo_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import \
-        mtbench101_datasets  # noqa: F401, E501
-    from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import \
-        wildbench_datasets  # noqa: F401, E501
+    from opencompass.configs.datasets.subjective.multiround.mtbench101_judge import (
+        mtbench101_datasets,  # noqa: F401, E501
+    )
+    from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge import (
+        wildbench_datasets,  # noqa: F401, E501
+    )
 
 datasets = sum((v for k, v in locals().items() if k.endswith('_datasets') and 'wildbench' not in k), [])
 datasets += wildbench_datasets
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d23faca57a..0d4cf15d30 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,21 +1,11 @@
 repos:
-  - repo: https://github.com/PyCQA/flake8
-    rev: 5.0.4
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.15.4
     hooks:
-      - id: flake8
-        args: ['--extend-ignore=E231', "--max-line-length=120"]
-  - repo: https://github.com/PyCQA/isort
-    rev: 5.11.5
-    hooks:
-      - id: isort
-        args: ["--line-length=120"]
-  - repo: https://github.com/google/yapf
-    rev: v0.43.0
-    hooks:
-      - id: yapf
-        args: ['-i', '--style={based_on_style: pep8, column_limit: 120}']
+      - id: ruff-check
+        args: ["--fix", "--exit-non-zero-on-fix"]
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.3.0
+    rev: v6.0.0
     hooks:
       - id: trailing-whitespace
       - id: check-yaml
@@ -23,8 +13,6 @@ repos:
       - id: requirements-txt-fixer
       - id: double-quote-string-fixer
       - id: check-merge-conflict
-      - id: fix-encoding-pragma
-        args: ["--remove"]
       - id: mixed-line-ending
         args: ["--fix=lf"]
 
diff --git a/autotest/interface/pipeline/test_pipeline_func.py b/autotest/interface/pipeline/test_pipeline_func.py
index 7ea918415d..abd7f49d4e 100644
--- a/autotest/interface/pipeline/test_pipeline_func.py
+++ b/autotest/interface/pipeline/test_pipeline_func.py
@@ -3,9 +3,14 @@
 import pydantic
 import pytest
 from utils.config_utils import set_device_env_variable, unset_device_env_variable
-from utils.pipeline_chat import (assert_pipeline_batch_return, assert_pipeline_batch_stream_return,
-                                 assert_pipeline_common_log, assert_pipeline_single_return,
-                                 assert_pipeline_single_stream_return, save_pipeline_common_log)
+from utils.pipeline_chat import (
+    assert_pipeline_batch_return,
+    assert_pipeline_batch_stream_return,
+    assert_pipeline_common_log,
+    assert_pipeline_single_return,
+    assert_pipeline_single_stream_return,
+    save_pipeline_common_log,
+)
 from utils.restful_return_check import has_repeated_fragment
 
 from lmdeploy import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig, pipeline
@@ -206,15 +211,15 @@ def run_pipeline_testcase_special_words_false(config, model, backend, file_name)
     model_path = '/'.join([config.get('model_path'), model])
     backend_config = backend(tp=2)
     pipe = init_pipeline(model_path, backend_config=backend_config)
-    prompt = '<|im_start|>system\n当开启工具以及代码时，根据需求选择合适的工具进行调用\n' + \
-        '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' + \
-        '能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' + \
-        '发送含有 Python >代码的消息时，它将在该环境中执行。这个工具适用于多种场景，' + \
-        '如数据分析或处理（包括数据操作、统计分析、图表绘制），复杂的计算问题（解决数学和物理' + \
-        '难题），编程示例（理解编程概念或特性），文本处理和分析（比如文本解析和自然语言处理），机器学习和数据科学（用于' + \
-        '展示模型训练和数据可视化），以及文件操作和数据导入（处理CSV、JSON等格式的文件）。<|im_end|>\n' + \
-        '<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$，计算曲线积分：$I=\\int_L' + \
-        '{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant'
+    prompt = '<|im_start|>system\n当开启工具以及代码时，根据需求选择合适的工具进行调用\n' \
+        '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' \
+        '能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' \
+        '发送含有 Python >代码的消息时，它将在该环境中执行。这个工具适用于多种场景，' \
+        '如数据分析或处理（包括数据操作、统计分析、图表绘制），复杂的计算问题（解决数学和物理' \
+        '难题），编程示例（理解编程概念或特性），文本处理和分析（比如文本解析和自然语言处理），' \
+        '机器学习和数据科学（用于展示模型训练和数据可视化），以及文件操作和数据导入（处理CSV、' \
+        'JSON等格式的文件）。<|im_end|>\n<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$，' \
+        '计算曲线积分：$I=\\int_L{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant'
     gen_config = GenerationConfig(skip_special_tokens=False)
     response = pipe(prompt, gen_config=gen_config)
     result = '<|action_start|><|interpreter|>' in response.text
@@ -226,15 +231,15 @@ def run_pipeline_testcase_special_words_true(config, model, backend, file_name):
     model_path = '/'.join([config.get('model_path'), model])
     backend_config = backend(tp=2)
     pipe = init_pipeline(model_path, backend_config=backend_config)
-    prompt = '<|im_start|>system\n当开启工具以及代码时，根据需求选择合适的工具进行调用\n' + \
-        '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' + \
-        '能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' + \
-        '发送含有 Python >代码的消息时，它将在该环境中执行。这个工具适用于多种场景，' + \
-        '如数据分析或处理（包括数据操作、统计分析、图表绘制），复杂的计算问题（解决数学和物理' + \
-        '难题），编程示例（理解编程概念或特性），文本处理和分析（比如文本解析和自然语言处理），机器学习和数据科学（用于' + \
-        '展示模型训练和数据可视化），以及文件操作和数据导入（处理CSV、JSON等格式的文件）。<|im_end|>\n' + \
-        '<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$，计算曲线积分：$I=\\int_L' + \
-        '{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant'
+    prompt = '<|im_start|>system\n当开启工具以及代码时，根据需求选择合适的工具进行调用\n' \
+        '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' \
+        '能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' \
+        '发送含有 Python >代码的消息时，它将在该环境中执行。这个工具适用于多种场景，' \
+        '如数据分析或处理（包括数据操作、统计分析、图表绘制），复杂的计算问题（解决数学和物理' \
+        '难题），编程示例（理解编程概念或特性），文本处理和分析（比如文本解析和自然语言处理），' \
+        '机器学习和数据科学（用于展示模型训练和数据可视化），以及文件操作和数据导入（处理CSV、' \
+        'JSON等格式的文件）。<|im_end|>\n<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$，' \
+        '计算曲线积分：$I=\\int_L{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant'
     gen_config = GenerationConfig(skip_special_tokens=True)
     response = pipe(prompt, gen_config=gen_config)
     result = '<|action_start|><|interpreter|>' not in response.text
diff --git a/autotest/interface/restful/test_restful_chat_completions_v1.py b/autotest/interface/restful/test_restful_chat_completions_v1.py
index 464c915a53..67e166a568 100644
--- a/autotest/interface/restful/test_restful_chat_completions_v1.py
+++ b/autotest/interface/restful/test_restful_chat_completions_v1.py
@@ -3,8 +3,11 @@
 import pytest
 from openai import OpenAI
 from utils.constant import BACKEND_LIST, RESTFUL_MODEL_LIST
-from utils.restful_return_check import (assert_chat_completions_batch_return, assert_chat_completions_stream_return,
-                                        has_repeated_fragment)
+from utils.restful_return_check import (
+    assert_chat_completions_batch_return,
+    assert_chat_completions_stream_return,
+    has_repeated_fragment,
+)
 
 from lmdeploy.serve.openai.api_client import APIClient, get_model_list
 
@@ -223,15 +226,15 @@ def test_array_stopwords_streaming(self, backend, model_case):
 
     @pytest.mark.internlm2_5
     def test_special_words(self, backend, model_case):
-        message = '<|im_start|>system\n当开启工具以及代码时，根据需求选择合适的工具进行调用\n' + \
-                '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' + \
-                '能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' + \
-                '发送含有 Python >代码的消息时，它将在该环境中执行。这个工具适用于多种场景，' + \
-                '如数据分析或处理（包括数据操作、统计分析、图表绘制），复杂的计算问题（解决数学和物理' + \
-                '难题），编程示例（理解编程概念或特性），文本处理和分析（比如文本解析和自然语言处理），机器学习和数据科学（用于' + \
-                '展示模型训练和数据可视化），以及文件操作和数据导入（处理CSV、JSON等格式的文件）。<|im_end|>\n' + \
-                '<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$，计算曲线积分：$I=\\int_L' + \
-                '{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant'
+        message = '<|im_start|>system\n当开启工具以及代码时，根据需求选择合适的工具进行调用\n' \
+                '<|im_end|><|im_start|>system name=<|interpreter|>\n你现在已经' \
+                '能够在一个有状态的 Jupyter 笔记本环境中运行 Python 代码。当你向 python ' \
+                '发送含有 Python >代码的消息时，它将在该环境中执行。这个工具适用于多种场景，' \
+                '如数据分析或处理（包括数据操作、统计分析、图表绘制），复杂的计算问题（解决数学和物理' \
+                '难题），编程示例（理解编程概念或特性），文本处理和分析（比如文本解析和自然语言处理），' \
+                '机器学习和数据科学（用于展示模型训练和数据可视化），以及文件操作和数据导入（处理CSV、' \
+                'JSON等格式的文件）。<|im_end|>\n<|im_start|>user\n设 $L$ 为圆周$x^2+y^2=2x$，' \
+                '计算曲线积分：$I=\\int_L{x\\mathrm{d}s}=$<|im_end|>\n<|im_start|>assistant'
         api_client = APIClient(BASE_URL)
         model_name = api_client.available_models[0]
         for output in api_client.chat_completions_v1(model=model_name,
diff --git a/autotest/interface/restful/test_restful_completions_v1.py b/autotest/interface/restful/test_restful_completions_v1.py
index 03316f6679..d3be161dfe 100644
--- a/autotest/interface/restful/test_restful_completions_v1.py
+++ b/autotest/interface/restful/test_restful_completions_v1.py
@@ -178,7 +178,8 @@ def test_batch_prompt_order(self, backend, model_case):
         api_client = APIClient(BASE_URL)
         model_name = api_client.available_models[0]
         for item in api_client.completions_v1(model=model_name,
-                                              prompt=['你好', '今天天气怎么样', '你是谁', '帮我写一首以梅花为主题的五言律诗', '5+2等于多少'],
+                                              prompt=['你好', '今天天气怎么样', '你是谁',
+                                                      '帮我写一首以梅花为主题的五言律诗', '5+2等于多少'],
                                               max_tokens=400,
                                               min_tokens=50):
             print(str(item))
diff --git a/autotest/interface/restful/test_restful_generate.py b/autotest/interface/restful/test_restful_generate.py
index cf4c9a463e..389ee47adb 100644
--- a/autotest/interface/restful/test_restful_generate.py
+++ b/autotest/interface/restful/test_restful_generate.py
@@ -4,7 +4,7 @@
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime
-from typing import Any, Dict, List
+from typing import Any
 
 import pytest
 import requests
@@ -115,8 +115,8 @@ def status_code(self):
             return resp
 
     def _validate_generation_response(self,
-                                      data: Dict[str, Any],
-                                      expected_fields: List[str] = None,
+                                      data: dict[str, Any],
+                                      expected_fields: list[str] = None,
                                       validate_tokens: bool = True,
                                       expect_logprobs: bool = False,
                                       validate_experts: bool = False) -> None:
diff --git a/autotest/tools/chat/test_command_chat_hf_pytorch.py b/autotest/tools/chat/test_command_chat_hf_pytorch.py
index 970eb34469..317a23b3d7 100644
--- a/autotest/tools/chat/test_command_chat_hf_pytorch.py
+++ b/autotest/tools/chat/test_command_chat_hf_pytorch.py
@@ -1,6 +1,11 @@
 import pytest
-from tools.common_case_config import (MODELSCOPE_CONFIG, PYTORCH_LORA_TEST_LLM_GPU1, PYTORCH_LORA_TEST_LLM_GPU2,
-                                      PYTORCH_PR_TEST_LLM_GPU1, PYTORCH_PR_TEST_LLM_GPU2)
+from tools.common_case_config import (
+    MODELSCOPE_CONFIG,
+    PYTORCH_LORA_TEST_LLM_GPU1,
+    PYTORCH_LORA_TEST_LLM_GPU2,
+    PYTORCH_PR_TEST_LLM_GPU1,
+    PYTORCH_PR_TEST_LLM_GPU2,
+)
 from utils.config_utils import get_func_config_list, get_workerid
 from utils.run_client_chat import run_tests
 
diff --git a/autotest/tools/chat/test_command_chat_hf_turbomind.py b/autotest/tools/chat/test_command_chat_hf_turbomind.py
index 46eda4af4b..d8ba6a9472 100644
--- a/autotest/tools/chat/test_command_chat_hf_turbomind.py
+++ b/autotest/tools/chat/test_command_chat_hf_turbomind.py
@@ -1,7 +1,11 @@
 import pytest
-from tools.common_case_config import (MODELSCOPE_CONFIG, TURBOMIND_FALLBACK_TEST_LLM_GPU1,
-                                      TURBOMIND_FALLBACK_TEST_LLM_GPU2, TURBOMIND_PR_TEST_LLM_GPU1,
-                                      TURBOMIND_PR_TEST_LLM_GPU2)
+from tools.common_case_config import (
+    MODELSCOPE_CONFIG,
+    TURBOMIND_FALLBACK_TEST_LLM_GPU1,
+    TURBOMIND_FALLBACK_TEST_LLM_GPU2,
+    TURBOMIND_PR_TEST_LLM_GPU1,
+    TURBOMIND_PR_TEST_LLM_GPU2,
+)
 from utils.config_utils import get_func_config_list, get_workerid
 from utils.run_client_chat import run_tests
 
diff --git a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py
index bc41a8156c..9443be79e1 100644
--- a/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py
+++ b/autotest/tools/pipeline/test_pipeline_chat_pytorch_llm.py
@@ -1,7 +1,12 @@
 import pytest
-from tools.common_case_config import (MODELSCOPE_CONFIG, PYTORCH_LORA_TEST_LLM_GPU1, PYTORCH_LORA_TEST_LLM_GPU2,
-                                      PYTORCH_PR_TEST_LLM_GPU1, PYTORCH_PR_TEST_LLM_GPU2,
-                                      SPECULATIVE_DECODING_PIPELINE_TEST_LLM)
+from tools.common_case_config import (
+    MODELSCOPE_CONFIG,
+    PYTORCH_LORA_TEST_LLM_GPU1,
+    PYTORCH_LORA_TEST_LLM_GPU2,
+    PYTORCH_PR_TEST_LLM_GPU1,
+    PYTORCH_PR_TEST_LLM_GPU2,
+    SPECULATIVE_DECODING_PIPELINE_TEST_LLM,
+)
 from utils.config_utils import get_func_config_list, get_workerid
 from utils.pipeline_chat import run_pipeline_llm_test
 
diff --git a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py
index 42801eabb9..894ac1bb59 100644
--- a/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py
+++ b/autotest/tools/pipeline/test_pipeline_chat_turbomind_llm.py
@@ -1,7 +1,11 @@
 import pytest
-from tools.common_case_config import (MODELSCOPE_CONFIG, TURBOMIND_FALLBACK_TEST_LLM_GPU1,
-                                      TURBOMIND_FALLBACK_TEST_LLM_GPU2, TURBOMIND_PR_TEST_LLM_GPU1,
-                                      TURBOMIND_PR_TEST_LLM_GPU2)
+from tools.common_case_config import (
+    MODELSCOPE_CONFIG,
+    TURBOMIND_FALLBACK_TEST_LLM_GPU1,
+    TURBOMIND_FALLBACK_TEST_LLM_GPU2,
+    TURBOMIND_PR_TEST_LLM_GPU1,
+    TURBOMIND_PR_TEST_LLM_GPU2,
+)
 from utils.config_utils import get_func_config_list, get_workerid
 from utils.pipeline_chat import run_pipeline_llm_test
 
diff --git a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py
index c27822eb47..f0c4d7bf07 100644
--- a/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py
+++ b/autotest/tools/restful/test_restful_chat_hf_pytorch_llm.py
@@ -1,9 +1,16 @@
 import time
 
 import pytest
-from tools.common_case_config import (MODELSCOPE_CONFIG, PYTORCH_LORA_TEST_LLM_GPU1, PYTORCH_LORA_TEST_LLM_GPU2,
-                                      PYTORCH_PR_TEST_LLM_GPU1, PYTORCH_PR_TEST_LLM_GPU2, REASONING_TEST_LLM,
-                                      SPECULATIVE_DECODING_RESTFUL_TEST_LLM, TOOLCALL_TEST_LLM)
+from tools.common_case_config import (
+    MODELSCOPE_CONFIG,
+    PYTORCH_LORA_TEST_LLM_GPU1,
+    PYTORCH_LORA_TEST_LLM_GPU2,
+    PYTORCH_PR_TEST_LLM_GPU1,
+    PYTORCH_PR_TEST_LLM_GPU2,
+    REASONING_TEST_LLM,
+    SPECULATIVE_DECODING_RESTFUL_TEST_LLM,
+    TOOLCALL_TEST_LLM,
+)
 from utils.config_utils import get_case_str_by_config, get_func_config_list, get_workerid
 from utils.constant import PROXY_PORT
 from utils.proxy_distributed_utils import ApiServerPerTest, proxy_worker_node_wait
diff --git a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py
index 8d2cd95c9c..a7460b6e72 100644
--- a/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py
+++ b/autotest/tools/restful/test_restful_chat_hf_turbomind_llm.py
@@ -1,8 +1,14 @@
 import pytest
-from tools.common_case_config import (MODELSCOPE_CONFIG, REASONING_TEST_LLM, TOOLCALL_TEST_LLM,
-                                      TURBOMIND_FALLBACK_TEST_LLM_GPU1, TURBOMIND_FALLBACK_TEST_LLM_GPU2,
-                                      TURBOMIND_LOGPROBS_TEST_LLM_GPU2, TURBOMIND_PR_TEST_LLM_GPU1,
-                                      TURBOMIND_PR_TEST_LLM_GPU2)
+from tools.common_case_config import (
+    MODELSCOPE_CONFIG,
+    REASONING_TEST_LLM,
+    TOOLCALL_TEST_LLM,
+    TURBOMIND_FALLBACK_TEST_LLM_GPU1,
+    TURBOMIND_FALLBACK_TEST_LLM_GPU2,
+    TURBOMIND_LOGPROBS_TEST_LLM_GPU2,
+    TURBOMIND_PR_TEST_LLM_GPU1,
+    TURBOMIND_PR_TEST_LLM_GPU2,
+)
 from utils.config_utils import get_func_config_list, get_workerid
 from utils.run_restful_chat import run_llm_test, run_logprob_test, run_reasoning_case, run_tools_case
 
diff --git a/autotest/utils/common_utils.py b/autotest/utils/common_utils.py
index 3a7fcd473f..f54c3aa489 100644
--- a/autotest/utils/common_utils.py
+++ b/autotest/utils/common_utils.py
@@ -1,14 +1,13 @@
 import os
 import subprocess
 import sys
-from typing import Tuple
 
 
 def execute_command_with_logging(cmd,
                                  log_file_path: str,
                                  timeout: int = 3600,
                                  env=None,
-                                 should_print=True) -> Tuple[bool, str]:
+                                 should_print=True) -> tuple[bool, str]:
     if env is None:
         env = os.environ.copy()
 
diff --git a/autotest/utils/config_utils.py b/autotest/utils/config_utils.py
index 362a97ac67..5e65681546 100644
--- a/autotest/utils/config_utils.py
+++ b/autotest/utils/config_utils.py
@@ -1,7 +1,7 @@
 import copy
 import os
 from collections import OrderedDict
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 import yaml
 
@@ -12,7 +12,7 @@
 SUFFIX_INNER_W8A8 = '-inner-w8a8'
 
 
-def resolve_extra_params(extra_params: Dict[str, Any], model_base_path: str) -> None:
+def resolve_extra_params(extra_params: dict[str, Any], model_base_path: str) -> None:
     """Resolve relative model paths in extra_params to absolute paths.
 
     Centralised helper so that every call-site does not need its own
@@ -37,10 +37,10 @@ def resolve_extra_params(extra_params: Dict[str, Any], model_base_path: str) ->
 
 
 def get_func_config_list(backend: str,
-                         parallel_config: Dict[str, int],
+                         parallel_config: dict[str, int],
                          model_type: str = 'chat_model',
                          func_type: str = 'func',
-                         extra: Optional[Dict[str, Any]] = None) -> List[Dict]:
+                         extra: dict[str, Any] | None = None) -> list[dict]:
     """Generate all valid running config combinations (communicator + quant
     policy + model).
 
@@ -51,7 +51,7 @@ def get_func_config_list(backend: str,
         func_type: Test func type filter, default: func
         extra: extra config to update in each run config dict
     Returns:
-        List[Dict]: All valid run config dicts
+        list[dict]: All valid run config dicts
     """
     config = get_config()
     device = config.get('device', 'cuda')
@@ -127,7 +127,7 @@ def get_func_config_list(backend: str,
     return run_configs
 
 
-def get_cli_common_param(run_config: Dict[str, Any]) -> str:
+def get_cli_common_param(run_config: dict[str, Any]) -> str:
     """Generate cli common params string by run config dict."""
     backend = run_config.get('backend')
     model = run_config.get('model')
@@ -162,7 +162,7 @@ def get_cli_common_param(run_config: Dict[str, Any]) -> str:
     return ' '.join(cli_params).strip()
 
 
-def get_cli_str(config: Dict[str, Any]) -> str:
+def get_cli_str(config: dict[str, Any]) -> str:
     cli_str = []
     # Extra params
     for key, value in config.items():
@@ -181,7 +181,7 @@ def get_cli_str(config: Dict[str, Any]) -> str:
     return ' '.join(cli_str)
 
 
-def get_parallel_config(config: Dict, model_name: str) -> List[Dict[str, int]]:
+def get_parallel_config(config: dict, model_name: str) -> list[dict[str, int]]:
     """Get matched parallel config dict by model name, default tp:1 if no
     match."""
     result = []
@@ -201,23 +201,23 @@ def get_parallel_config(config: Dict, model_name: str) -> List[Dict[str, int]]:
     return result if result else [{'tp': 1}]
 
 
-def _extract_models_from_config(config_value: Any) -> List[str]:
+def _extract_models_from_config(config_value: Any) -> list[str]:
     """Extract flat model name list from config value (dict/list supported)"""
     models = []
-    if isinstance(config_value, Dict):
+    if isinstance(config_value, dict):
         for model_list in config_value.values():
-            if isinstance(model_list, List):
+            if isinstance(model_list, list):
                 models.extend([m for m in model_list if isinstance(m, str)])
-    elif isinstance(config_value, List):
+    elif isinstance(config_value, list):
         models.extend([m for m in config_value if isinstance(m, str)])
     return models
 
 
-def get_model_list(config: Dict,
+def get_model_list(config: dict,
                    backend: str,
-                   parallel_config: Dict[str, int] = None,
+                   parallel_config: dict[str, int] = None,
                    model_type: str = 'chat_model',
-                   func_type: str = 'func') -> List[str]:
+                   func_type: str = 'func') -> list[str]:
     """Get filtered model list with quantization extended models by
     backend/parallel config/model type/func type.
 
@@ -228,7 +228,7 @@ def get_model_list(config: Dict,
         model_type: Model type, default: chat_model
         func_type: Test func type filter, default: func
     Returns:
-        List[str]: Base models + quantization extended models
+        list[str]: Base models + quantization extended models
     """
     model_config_key = f'{backend}_{model_type}'
     all_models = []
@@ -252,7 +252,7 @@ def get_model_list(config: Dict,
     return extended_models
 
 
-def _filter_by_test_func_type(config: Dict, model_list: List[str], func_type: str) -> List[str]:
+def _filter_by_test_func_type(config: dict, model_list: list[str], func_type: str) -> list[str]:
     """Filter model list by test function type, return intersection of two
     model sets."""
     if func_type == 'func':
@@ -292,7 +292,7 @@ def _extend_pytorch_quant_models(quant_config: dict, base_models: list, target_l
             target_list.append(model_name + SUFFIX_INNER_W8A8)
 
 
-def _is_kvint_model(config: Dict, backend: str, model: str, quant_policy: int) -> bool:
+def _is_kvint_model(config: dict, backend: str, model: str, quant_policy: int) -> bool:
     """Check if model supports the kv quantization policy, quant_policy=0
     always return True."""
     if quant_policy == 0:
@@ -308,7 +308,7 @@ def _base_model_name(model: str) -> str:
     return model.replace('-inner-4bits', '').replace('-inner-w8a8', '').replace('-inner-gptq', '')
 
 
-def get_quantization_model_list(type: str) -> List[str]:
+def get_quantization_model_list(type: str) -> list[str]:
     """Get quantization model list by specified quant type(awq/gptq/w8a8)"""
     config = get_config()
     quant_model_list = []
@@ -340,7 +340,7 @@ def get_quantization_model_list(type: str) -> List[str]:
     return quant_model_list
 
 
-def get_config() -> Dict[str, Any]:
+def get_config() -> dict[str, Any]:
     """Load & get yaml config file, auto adapt device env & update log path."""
     # Get device env & match config file path
     env_tag = os.environ.get('TEST_ENV')
@@ -350,7 +350,7 @@ def get_config() -> Dict[str, Any]:
     if env_tag and not os.path.exists(config_path):
         config_path = 'autotest/config.yml'
     # Load yaml config file safely
-    with open(config_path, 'r', encoding='utf-8') as f:
+    with open(config_path, encoding='utf-8') as f:
         config = yaml.load(f.read(), Loader=yaml.SafeLoader)
 
     # Deep copy config to avoid modify raw data, update log path with github run id
@@ -370,7 +370,7 @@ def get_config() -> Dict[str, Any]:
     return config_copy
 
 
-def get_cuda_prefix_by_workerid(worker_id: Optional[str], parallel_config: Dict[str, int] = None) -> Optional[str]:
+def get_cuda_prefix_by_workerid(worker_id: str | None, parallel_config: dict[str, int] = None) -> str | None:
     """Get cuda/ascend visible devices env prefix by worker id & parallel
     config."""
     para_conf = parallel_config or {}
@@ -387,7 +387,7 @@ def get_cuda_prefix_by_workerid(worker_id: Optional[str], parallel_config: Dict[
     return f'ASCEND_RT_VISIBLE_DEVICES={cuda_id}' if device_type == 'ascend' else f'CUDA_VISIBLE_DEVICES={cuda_id}'
 
 
-def get_cuda_id_by_workerid(worker_id: Optional[str], tp_num: int = 1) -> Optional[str]:
+def get_cuda_id_by_workerid(worker_id: str | None, tp_num: int = 1) -> str | None:
     """Get cuda id str by worker id and tp num, return None if invalid worker
     id."""
     if worker_id is None or 'gw' not in worker_id:
@@ -398,7 +398,7 @@ def get_cuda_id_by_workerid(worker_id: Optional[str], tp_num: int = 1) -> Option
     return ','.join([str(cuda_num + i) for i in range(tp_num)])
 
 
-def get_workerid(worker_id: Optional[str]) -> int:
+def get_workerid(worker_id: str | None) -> int:
     """Parse numeric worker id from worker id str, return 0 if invalid worker
     id."""
     if worker_id is None or 'gw' not in worker_id:
@@ -413,7 +413,7 @@ def is_quantization_model(model: str) -> bool:
     return any(key in lower_name for key in ('awq', '4bits', 'w4', 'int4'))
 
 
-def _get_communicator_list(config: Dict, backend: str, parallel_config: Dict[str, int] = None) -> List[str]:
+def _get_communicator_list(config: dict, backend: str, parallel_config: dict[str, int] = None) -> list[str]:
     """Get available communicator list by device and parallel config."""
     device = config.get('device', None)
 
@@ -429,7 +429,7 @@ def _get_communicator_list(config: Dict, backend: str, parallel_config: Dict[str
     return ['nccl', 'cuda-ipc']
 
 
-def set_device_env_variable(worker_id, parallel_config: Dict[str, int] = None):
+def set_device_env_variable(worker_id, parallel_config: dict[str, int] = None):
     """Set device environment variable based on the device type."""
     device = os.environ.get('DEVICE', 'cuda')
 
@@ -460,13 +460,13 @@ def unset_device_env_variable():
             del os.environ['CUDA_VISIBLE_DEVICES']
 
 
-def is_model_in_list(config: Dict, parallel_config: Dict[str, int], model: str) -> bool:
+def is_model_in_list(config: dict, parallel_config: dict[str, int], model: str) -> bool:
     """Check if model matches the target parallel config."""
     model_config = get_parallel_config(config, model)
     return parallel_config in model_config
 
 
-def get_case_str_by_config(run_config: Dict[str, Any], is_simple: bool = True) -> str:
+def get_case_str_by_config(run_config: dict[str, Any], is_simple: bool = True) -> str:
     """Generate case name string by run config dict."""
     model_name = run_config['model']
     backend_type = run_config['backend']
@@ -491,7 +491,7 @@ def get_case_str_by_config(run_config: Dict[str, Any], is_simple: bool = True) -
     return f'{backend_type}_{pure_model_name}_{communicator}_{parallel_str}_{quant_policy}{extra_params_case}'
 
 
-def parse_config_by_case(case_str: str) -> Dict[str, Any]:
+def parse_config_by_case(case_str: str) -> dict[str, Any]:
     """Parse run config dict from case name string (fix split & type convert
     bug)"""
     case_parts = case_str.split('_')
diff --git a/autotest/utils/evaluate_utils.py b/autotest/utils/evaluate_utils.py
index 3c0a0e91b7..8535e805bf 100644
--- a/autotest/utils/evaluate_utils.py
+++ b/autotest/utils/evaluate_utils.py
@@ -83,7 +83,7 @@ def llm_summary(case_name, result, msg, work_dir, result_dir=None):
             if not os.path.exists(csv_file):
                 raise FileNotFoundError('CSV file does not exist')
 
-            with open(csv_file, 'r') as f:
+            with open(csv_file) as f:
                 reader = csv.reader(f)
                 next(reader)
                 for row in reader:
@@ -126,7 +126,7 @@ def mllm_summary(case_name,
         if dataset == 'OCRBench_MINI':
             score_file = f'{latest_dir}/{case_name}_{dataset}_score.json'
             cur_score = 0
-            with open(score_file, 'r') as f:
+            with open(score_file) as f:
                 total_score = json.load(f)
                 cur_score = total_score['Final Score Norm']
             metrics[dataset] = f'{cur_score:.2f}'  # noqa: E231
diff --git a/autotest/utils/mp_log_utils.py b/autotest/utils/mp_log_utils.py
index a80bbaa8ff..fdd7d4f1c9 100644
--- a/autotest/utils/mp_log_utils.py
+++ b/autotest/utils/mp_log_utils.py
@@ -22,7 +22,7 @@ def write_log(config, result, msg, is_new: bool = True, case_path_tag: str = 'de
 def assert_log(config, case_path_tag: str = 'default'):
     log_path = os.path.join(config.get('log_path'), case_path_tag)
 
-    with open(log_path, 'r') as f:
+    with open(log_path) as f:
         lines = f.readlines()
 
         for line in lines:
diff --git a/autotest/utils/pipeline_chat.py b/autotest/utils/pipeline_chat.py
index f3e6694840..77a45aa6df 100644
--- a/autotest/utils/pipeline_chat.py
+++ b/autotest/utils/pipeline_chat.py
@@ -43,7 +43,7 @@ def run_pipeline_llm_test(config, run_config, common_case_config, worker_id: str
     with assume:
         assert result, stderr
 
-    with open(pipeline_log, 'r', encoding='utf-8') as file:
+    with open(pipeline_log, encoding='utf-8') as file:
         output_text = file.read()
 
     with open(pipeline_log, 'a') as file:
@@ -101,7 +101,7 @@ def run_pipeline_mllm_test(config, run_config, worker_id: str = '', is_smoke: bo
     with assume:
         assert result, stderr
 
-    with open(pipeline_log, 'r', encoding='utf-8') as file:
+    with open(pipeline_log, encoding='utf-8') as file:
         output_text = file.read()
 
     with open(pipeline_log, 'a') as file:
@@ -156,7 +156,7 @@ def run_pipeline_mllm_test(config, run_config, worker_id: str = '', is_smoke: bo
             if 'qwen' in model.lower():
                 Qwen_vl_testcase(output_text, file)
 
-    with open(pipeline_log, 'r', encoding='utf-8') as file:
+    with open(pipeline_log, encoding='utf-8') as file:
         output_text = file.read()
     print(output_text)
     allure.attach.file(pipeline_log, name=pipeline_log, attachment_type=allure.attachment_type.TEXT)
@@ -356,7 +356,7 @@ def assert_pipeline_common_log(config, log_name):
 
     msg = 'result is empty, please check again'
     result = False
-    with open(config_log, 'r') as f:
+    with open(config_log) as f:
         lines = f.readlines()
 
         for line in lines:
diff --git a/autotest/utils/proxy_distributed_utils.py b/autotest/utils/proxy_distributed_utils.py
index dc4efdebad..fa8afe7997 100644
--- a/autotest/utils/proxy_distributed_utils.py
+++ b/autotest/utils/proxy_distributed_utils.py
@@ -3,7 +3,7 @@
 import socket
 import subprocess
 import time
-from typing import Any, Dict, Tuple
+from typing import Any
 
 import requests
 from utils.config_utils import get_case_str_by_config, get_cli_common_param, resolve_extra_params
@@ -22,13 +22,13 @@ def is_port_open(host: str, port: int, timeout: float = 1.0) -> bool:
         try:
             s.connect((host, port))
             return True
-        except (socket.timeout, ConnectionRefusedError, OSError):
+        except (TimeoutError, ConnectionRefusedError, OSError):
             return False
 
 
 def check_nodes_status(host: str, proxy_port: int, model_name: str, expected_instances: int, check_count: int,
                        current_time: float, last_progress_print: float,
-                       progress_print_interval: int) -> Tuple[bool, int]:
+                       progress_print_interval: int) -> tuple[bool, int]:
     try:
         nodes_url = f'http://{host}:{proxy_port}/nodes/status'
         resp = requests.get(nodes_url, timeout=10)
@@ -215,7 +215,7 @@ def cleanup(self):
 
 class ApiServerPerTest:
 
-    def __init__(self, proxy_manager: ProxyDistributedManager, config: Dict[str, Any], run_config: Dict[str, Any]):
+    def __init__(self, proxy_manager: ProxyDistributedManager, config: dict[str, Any], run_config: dict[str, Any]):
         self.proxy_manager = proxy_manager
         self.config = config
         self.run_config = run_config
diff --git a/autotest/utils/ray_distributed_utils.py b/autotest/utils/ray_distributed_utils.py
index 2b87a4bb41..6b26c91a7b 100644
--- a/autotest/utils/ray_distributed_utils.py
+++ b/autotest/utils/ray_distributed_utils.py
@@ -4,7 +4,7 @@
 import subprocess
 import time
 from time import time as time_time
-from typing import Any, Dict
+from typing import Any
 
 import requests
 from utils.config_utils import get_case_str_by_config, get_cli_common_param, resolve_extra_params
@@ -252,7 +252,7 @@ def cleanup(self, force: bool = True):
                 print(f'⚠️ Ray stop exception: {e}')
             self._cleaned = True  # Only mark as "fully cleaned" when force=True
 
-    def get_cluster_info(self) -> Dict[str, Any]:
+    def get_cluster_info(self) -> dict[str, Any]:
         return {
             'node_rank': self.node_rank,
             'node_count': self.node_count,
diff --git a/autotest/utils/restful_return_check.py b/autotest/utils/restful_return_check.py
index 3dbbd2902a..b425b809da 100644
--- a/autotest/utils/restful_return_check.py
+++ b/autotest/utils/restful_return_check.py
@@ -46,16 +46,16 @@ def assert_usage(usage):
 def assert_logprobs(logprobs, logprobs_num):
     assert_logprob_element(logprobs)
     assert len(logprobs.get('top_logprobs')) >= 0
-    assert type(logprobs.get('top_logprobs')) == list
+    assert type(logprobs.get('top_logprobs')) is list
     assert len(logprobs.get('top_logprobs')) <= logprobs_num
     for logprob_element in logprobs.get('top_logprobs'):
         assert_logprob_element(logprob_element)
 
 
 def assert_logprob_element(logprob):
-    assert len(logprob.get('token')) > 0 and type(logprob.get('token')) == str
-    assert len(logprob.get('bytes')) > 0 and type(logprob.get('bytes')) == list
-    assert type(logprob.get('logprob')) == float
+    assert len(logprob.get('token')) > 0 and type(logprob.get('token')) is str
+    assert len(logprob.get('bytes')) > 0 and type(logprob.get('bytes')) is list
+    assert type(logprob.get('logprob')) is float
 
 
 def assert_chat_completions_stream_return(output,
diff --git a/autotest/utils/run_restful_chat.py b/autotest/utils/run_restful_chat.py
index 13192d37c5..94a605d176 100644
--- a/autotest/utils/run_restful_chat.py
+++ b/autotest/utils/run_restful_chat.py
@@ -8,8 +8,13 @@
 import requests
 from openai import OpenAI
 from pytest_assume.plugin import assume
-from utils.config_utils import (get_case_str_by_config, get_cli_common_param, get_cuda_prefix_by_workerid, get_workerid,
-                                resolve_extra_params)
+from utils.config_utils import (
+    get_case_str_by_config,
+    get_cli_common_param,
+    get_cuda_prefix_by_workerid,
+    get_workerid,
+    resolve_extra_params,
+)
 from utils.constant import DEFAULT_PORT, DEFAULT_SERVER
 from utils.restful_return_check import assert_chat_completions_batch_return
 from utils.rule_condition_assert import assert_result
@@ -82,7 +87,7 @@ def start_openai_service(config, run_config, worker_id, timeout: int = 1200):
             # Check if process is still running
             return_code = startRes.wait(timeout=1)  # Small timeout to check status
             if return_code != 0:
-                with open(server_log, 'r') as f:
+                with open(server_log) as f:
                     content = f.read()
                     print(content)
                 return 0, content
@@ -576,8 +581,6 @@ def _run_tools_case(log_path, port: int = DEFAULT_PORT):
 
     timestamp = time.strftime('%Y%m%d_%H%M%S')
     restful_log = os.path.join(log_path, f'restful_toolcall_{model}_{str(port)}_{timestamp}.log')
-    file = open(restful_log, 'w')
-
     client = OpenAI(api_key='YOUR_API_KEY', base_url=http_url + '/v1')
     model_name = client.models.list().data[0].id
 
@@ -729,7 +732,7 @@ def start_proxy_server(log_path, port, case_name: str = 'default'):
             # Check if process is still running
             return_code = proxy_process.wait(timeout=1)  # Small timeout to check status
             if return_code != 0:
-                with open(proxy_log, 'r') as f:
+                with open(proxy_log) as f:
                     content = f.read()
                     print(content)
                 return 0, proxy_process
diff --git a/autotest/utils/toolkit.py b/autotest/utils/toolkit.py
index 7341c9d044..606609870f 100644
--- a/autotest/utils/toolkit.py
+++ b/autotest/utils/toolkit.py
@@ -1,5 +1,4 @@
 from functools import lru_cache
-from typing import List
 
 from transformers import AutoTokenizer
 
@@ -31,7 +30,7 @@ def _load_tokenizer_cached(model_path: str):
         raise RuntimeError(f"Failed to load tokenizer from '{model_path}': {e}")
 
 
-def encode_text(model_path: str, text: str) -> List[int]:
+def encode_text(model_path: str, text: str) -> list[int]:
     tokenizer = _load_tokenizer_cached(model_path)
 
     encoded = tokenizer.encode(text)
diff --git a/benchmark/benchmark_decode.py b/benchmark/benchmark_decode.py
index 3dd20a90c0..d74dadb908 100644
--- a/benchmark/benchmark_decode.py
+++ b/benchmark/benchmark_decode.py
@@ -5,9 +5,8 @@
 
 import fire
 import numpy as np
-from transformers import AutoTokenizer
-
 from lmdeploy.pytorch.decode import Engine
+from transformers import AutoTokenizer
 
 
 def benchmark(model_path, share_gpt_path, downsample=100, accel=None, save_to='decode_result'):
@@ -17,7 +16,7 @@ def benchmark(model_path, share_gpt_path, downsample=100, accel=None, save_to='d
     """
 
     start = time.monotonic()
-    content = json.load(open(share_gpt_path, 'r'))
+    content = json.load(open(share_gpt_path))
 
     texts = []
     for c in content:
diff --git a/benchmark/benchmark_pipeline.py b/benchmark/benchmark_pipeline.py
index 64a3deb721..63d2a2bee8 100644
--- a/benchmark/benchmark_pipeline.py
+++ b/benchmark/benchmark_pipeline.py
@@ -1,6 +1,5 @@
 import os
 import subprocess
-from typing import Dict, List
 
 import fire
 import yaml
@@ -48,9 +47,9 @@ def benchmark(model_path, backend, engine_config, data_config):
     tp = engine_config.get('tp', 1)
     output_file = f'benchmark_pipeline_{model_name}_{backend}_bs{bs}_tp{tp}_cache{cach_ratio}.csv'
     try:
-        if isinstance(data_config, Dict):
+        if isinstance(data_config, dict):
             data_config = [data_config]
-        assert isinstance(data_config, List) and all(isinstance(d, Dict) for d in data_config)
+        assert isinstance(data_config, list) and all(isinstance(d, dict) for d in data_config)
         for _data_config in data_config:
             _data_config['csv'] = output_file
             cmd = get_cmd(model_path, backend, engine_config, _data_config)
@@ -61,13 +60,13 @@ def benchmark(model_path, backend, engine_config, data_config):
 
 
 def main(model_path=None, backend=None, config_path=None):
-    with open(config_path, 'r') as f:
+    with open(config_path) as f:
         config = yaml.safe_load(f)
         engine_configs = config['engine']
         data_config = config['data']
-        if isinstance(engine_configs, Dict):
+        if isinstance(engine_configs, dict):
             engine_configs = [engine_configs]
-        assert isinstance(engine_configs, List) and all(isinstance(s, Dict) for s in engine_configs)
+        assert isinstance(engine_configs, list) and all(isinstance(s, dict) for s in engine_configs)
         for engine_config in engine_configs:
             # The model_path provided by the user will override the model_path in the config file.
             model_path = model_path or engine_config.pop('model_path')
diff --git a/benchmark/benchmark_serving.py b/benchmark/benchmark_serving.py
index 2527507d33..be6785cfd5 100644
--- a/benchmark/benchmark_serving.py
+++ b/benchmark/benchmark_serving.py
@@ -1,7 +1,6 @@
 import os
 import subprocess
 import time
-from typing import Dict, List, Optional, Tuple
 
 import fire
 import yaml
@@ -55,7 +54,7 @@ def get_output_file(model_path, backend, server_config):
     return f'benchmark_{model_name}_{backend}_{params_str}.csv'
 
 
-def get_server_ip_port(backend: str, server_config: Dict) -> Tuple[str, int]:
+def get_server_ip_port(backend: str, server_config: dict) -> tuple[str, int]:
     if backend in ['turbomind', 'pytorch']:
         if server_config.get('proxy_url'):
             # If proxy_url is set, we use the proxy server's IP and port
@@ -90,7 +89,7 @@ def wait_server_ready(server_ip: str, server_port: int) -> bool:
             time.sleep(5)
 
 
-def get_client_cmd(backend: str, server_ip: str, server_port: int, client_config: Dict) -> List[str]:
+def get_client_cmd(backend: str, server_ip: str, server_port: int, client_config: dict) -> list[str]:
     """Generate the client benchmark command."""
     current_dir = os.path.dirname(os.path.abspath(__file__))
     if backend in ['turbomind', 'pytorch']:
@@ -112,7 +111,7 @@ def get_client_cmd(backend: str, server_ip: str, server_port: int, client_config
     return cmd
 
 
-def benchmark(model_path: str, backend: str, server_config: Dict, data_config: Dict | List[Dict]):
+def benchmark(model_path: str, backend: str, server_config: dict, data_config: dict | list[dict]):
     """Benchmark the server with the given configuration.
 
     Args:
@@ -121,9 +120,9 @@ def benchmark(model_path: str, backend: str, server_config: Dict, data_config: D
         server_config: Configuration for the server and the inference engine.
         data_config: Configuration for the data.
     """
-    if isinstance(data_config, Dict):
+    if isinstance(data_config, dict):
         data_config = [data_config]
-    if not (isinstance(data_config, List) and all(isinstance(d, Dict) for d in data_config)):
+    if not (isinstance(data_config, list) and all(isinstance(d, dict) for d in data_config)):
         raise ValueError('data_config must be a dict or list of dicts')
 
     server_cmd = get_launching_server_cmd(model_path, backend, server_config)
@@ -166,7 +165,7 @@ def benchmark(model_path: str, backend: str, server_config: Dict, data_config: D
                 proc.kill()
 
 
-def validate_config(config: Dict) -> None:
+def validate_config(config: dict) -> None:
     """Validate the configuration structure.
 
     Args:
@@ -180,14 +179,14 @@ def validate_config(config: Dict) -> None:
         if section not in config:
             raise ValueError(f'Missing required config section: {section}')
 
-    if not isinstance(config['engine'], (Dict, List)):
+    if not isinstance(config['engine'], (dict, list)):
         raise ValueError('engine config must be a dict or list of dicts')
 
-    if not isinstance(config['data'], (Dict, List)):
+    if not isinstance(config['data'], (dict, list)):
         raise ValueError('data config must be a dict or list of dicts')
 
 
-def main(backend: str, config_path: str, model_path: Optional[str] = None):
+def main(backend: str, config_path: str, model_path: str | None = None):
     """Main entry point for the benchmark script.
 
     Args:
@@ -197,14 +196,14 @@ def main(backend: str, config_path: str, model_path: Optional[str] = None):
     Raises:
         BenchmarkConfigError: If required parameters are missing or config is invalid
     """
-    with open(config_path, 'r') as f:
+    with open(config_path) as f:
         config = yaml.safe_load(f)
         server_config = config['server']
         engine_configs = config['engine']
         data_config = config['data']
-        if isinstance(engine_configs, Dict):
+        if isinstance(engine_configs, dict):
             engine_configs = [engine_configs]
-        assert isinstance(engine_configs, List) and all(isinstance(s, Dict) for s in engine_configs)
+        assert isinstance(engine_configs, list) and all(isinstance(s, dict) for s in engine_configs)
         for engine_config in engine_configs:
             server_config = server_config.copy()
             server_config.update(engine_config)  # Merge engine config with server config
diff --git a/benchmark/benchmark_throughput.py b/benchmark/benchmark_throughput.py
index 49747d96c5..e8fc57d8f3 100644
--- a/benchmark/benchmark_throughput.py
+++ b/benchmark/benchmark_throughput.py
@@ -1,6 +1,5 @@
 import os
 import subprocess
-from typing import Dict, List
 
 import fire
 import yaml
@@ -48,9 +47,9 @@ def benchmark(model_path, backend, engine_config, data_config):
     tp = engine_config.get('tp', 1)
     output_file = f'benchmark_throughput_{model_name}_{backend}_bs{bs}_tp{tp}_cache{cach_ratio}.csv'
     try:
-        if isinstance(data_config, Dict):
+        if isinstance(data_config, dict):
             data_config = [data_config]
-        assert isinstance(data_config, List) and all(isinstance(d, Dict) for d in data_config)
+        assert isinstance(data_config, list) and all(isinstance(d, dict) for d in data_config)
         for _data_config in data_config:
             _data_config['csv'] = output_file
             cmd = get_cmd(model_path, backend, engine_config, _data_config)
@@ -61,13 +60,13 @@ def benchmark(model_path, backend, engine_config, data_config):
 
 
 def main(model_path=None, backend=None, config_path=None):
-    with open(config_path, 'r') as f:
+    with open(config_path) as f:
         config = yaml.safe_load(f)
         engine_configs = config['engine']
         data_config = config['data']
-        if isinstance(engine_configs, Dict):
+        if isinstance(engine_configs, dict):
             engine_configs = [engine_configs]
-        assert isinstance(engine_configs, List) and all(isinstance(s, Dict) for s in engine_configs)
+        assert isinstance(engine_configs, list) and all(isinstance(s, dict) for s in engine_configs)
         for engine_config in engine_configs:
             # The model_path provided by the user will override the model_path in the config file.
             model_path = model_path or engine_config.pop('model_path')
diff --git a/benchmark/profile_pipeline_api.py b/benchmark/profile_pipeline_api.py
index 57cef20384..e8a67e1fb8 100644
--- a/benchmark/profile_pipeline_api.py
+++ b/benchmark/profile_pipeline_api.py
@@ -3,7 +3,6 @@
 import json
 import os
 import random
-from typing import List, Optional, Tuple
 
 import numpy as np
 from tqdm import tqdm
@@ -21,8 +20,8 @@ def sample_sharegpt_requests(
     dataset_path: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
-    fixed_output_len: Optional[int] = None,
-) -> List[Tuple[str, int, int]]:
+    fixed_output_len: int | None = None,
+) -> list[tuple[str, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError('output_len too small')
 
@@ -38,7 +37,7 @@ def sample_sharegpt_requests(
     random.shuffle(dataset)
 
     # Filter out sequences that are too long or too short
-    filtered_dataset: List[Tuple[str, int, int]] = []
+    filtered_dataset: list[tuple[str, int, int]] = []
     for i in range(len(dataset)):
         if len(filtered_dataset) == num_requests:
             break
@@ -70,7 +69,7 @@ def sample_random_requests(
     range_ratio: float,
     tokenizer: PreTrainedTokenizerBase,
     dataset_path: str,
-) -> List[Tuple[str, int, int]]:
+) -> list[tuple[str, int, int]]:
 
     input_lens = np.random.randint(
         max(int(input_len * range_ratio), 1),
@@ -101,7 +100,7 @@ def sample_random_requests(
         random.shuffle(dataset)
 
         # Filter out sequences that are too long or too short
-        input_requests: List[Tuple[str, int, int]] = []
+        input_requests: list[tuple[str, int, int]] = []
         for i in range(num_prompts):
             # Tokenize the prompts and completions.
             prompt = dataset[i][0]
@@ -150,7 +149,7 @@ def process_request(self, requests, profiler: Profiler, temperature, top_p, top_
                              max_new_tokens=output_len) for _, _, output_len in requests
         ]
 
-        sess: List[Session] = []
+        sess: list[Session] = []
         for _, input_len, output_len in requests:
             sess.append(profiler.new_session(input_len, output_len))
 
diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index 127e420125..34a8853b52 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -22,9 +22,10 @@
 import traceback
 import warnings
 from argparse import ArgumentParser
+from collections.abc import AsyncGenerator
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
+from typing import Any
 
 import aiohttp
 import numpy as np
@@ -32,8 +33,13 @@
 import requests
 from PIL import Image
 from tqdm.asyncio import tqdm
-from transformers import (AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerBase,
-                          PreTrainedTokenizerFast)
+from transformers import (
+    AutoProcessor,
+    AutoTokenizer,
+    PreTrainedTokenizer,
+    PreTrainedTokenizerBase,
+    PreTrainedTokenizerFast,
+)
 
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=None)
 
@@ -58,8 +64,8 @@ class RequestFuncInput:
     prompt_len: int
     output_len: int
     model: str
-    image_data: Optional[List[str]]
-    extra_request_body: Dict[str, Any]
+    image_data: list[str] | None
+    extra_request_body: dict[str, Any]
 
 
 @dataclass
@@ -68,7 +74,7 @@ class RequestFuncOutput:
     success: bool = False
     latency: float = 0.0
     ttft: float = 0.0  # Time to first token
-    itl: List[float] = field(default_factory=list)  # List of inter-token latencies
+    itl: list[float] = field(default_factory=list)  # List of inter-token latencies
     prompt_len: int = 0
     output_len: int = 0
     error: str = ''
@@ -82,7 +88,7 @@ def remove_prefix(text: str, prefix: str) -> str:
 # https://github.com/triton-inference-server/tensorrtllm_backend/issues/505
 async def async_request_trt_llm(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith('generate_stream')
@@ -152,7 +158,7 @@ async def async_request_trt_llm(
 # set ignore_eos True by default
 async def async_request_openai_completions(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith('completions'), "OpenAI Completions API URL must end with 'completions'."
@@ -230,7 +236,7 @@ async def async_request_openai_completions(
 
 async def async_request_openai_chat_completions(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith('chat/completions'), "OpenAI Chat Completions API URL must end with 'chat/completions'."
@@ -338,7 +344,7 @@ async def async_request_openai_chat_completions(
 
 async def async_request_sglang_generate(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     prompt = request_func_input.prompt
@@ -415,7 +421,7 @@ async def async_request_sglang_generate(
 
 async def async_request_gserver(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     raise NotImplementedError()
 
@@ -435,7 +441,7 @@ def get_model(pretrained_model_name_or_path: str) -> str:
     return pretrained_model_name_or_path
 
 
-def get_tokenizer(pretrained_model_name_or_path: str, ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+def get_tokenizer(pretrained_model_name_or_path: str, ) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
     if pretrained_model_name_or_path.endswith('.json') or pretrained_model_name_or_path.endswith('.model'):
         from sglang.srt.hf_transformers_utils import get_tokenizer
 
@@ -446,7 +452,7 @@ def get_tokenizer(pretrained_model_name_or_path: str, ) -> Union[PreTrainedToken
     return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
 
 
-def get_processor(pretrained_model_name_or_path: str, ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+def get_processor(pretrained_model_name_or_path: str, ) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
     assert (pretrained_model_name_or_path is not None and pretrained_model_name_or_path != '')
     if pretrained_model_name_or_path.endswith('.json') or pretrained_model_name_or_path.endswith('.model'):
         from sglang.srt.utils.hf_transformers_utils import get_processor
@@ -503,7 +509,7 @@ class BenchmarkMetrics:
 SHAREGPT_URL = 'https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json'  # noqa
 
 
-def download_and_cache_file(url: str, filename: Optional[str] = None):
+def download_and_cache_file(url: str, filename: str | None = None):
     """Read and cache a file from a url."""
     if filename is None:
         filename = os.path.join('/tmp', url.split('/')[-1])
@@ -542,9 +548,9 @@ class DatasetRow:
     prompt: str
     prompt_len: int
     output_len: int
-    text_prompt_len: Optional[int] = None
-    vision_prompt_len: Optional[int] = None
-    image_data: Optional[List[str]] = None
+    text_prompt_len: int | None = None
+    vision_prompt_len: int | None = None
+    image_data: list[str] | None = None
 
     def __post_init__(self):
         if self.text_prompt_len is None:
@@ -556,7 +562,7 @@ def __post_init__(self):
 def sample_sharegpt_requests(dataset_path: str,
                              num_requests: int,
                              tokenizer: PreTrainedTokenizerBase,
-                             fixed_output_len: Optional[int] = None) -> List[DatasetRow]:
+                             fixed_output_len: int | None = None) -> list[DatasetRow]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError('output_len too small')
 
@@ -576,7 +582,7 @@ def sample_sharegpt_requests(dataset_path: str,
     random.shuffle(dataset)
 
     # Filter out sequences that are too long or too short
-    filtered_dataset: List[DatasetRow] = []
+    filtered_dataset: list[DatasetRow] = []
     for i in range(len(dataset)):
         if len(filtered_dataset) == num_requests:
             break
@@ -621,7 +627,7 @@ def sample_random_requests(
     range_ratio: float,
     tokenizer: PreTrainedTokenizerBase,
     dataset_path: str,
-) -> List[DatasetRow]:
+) -> list[DatasetRow]:
 
     input_lens = compute_random_lens(
         full_len=input_len,
@@ -655,8 +661,8 @@ def sample_random_requests(
     random.shuffle(dataset)
 
     # Filter out sequences that are too long or too short
-    input_requests: List[DatasetRow] = []
-    origin_output_lens: List[int] = []
+    input_requests: list[DatasetRow] = []
+    origin_output_lens: list[int] = []
     for i in range(num_prompts):
         # Tokenize the prompts and completions.
         prompt = dataset[i][0]
@@ -683,7 +689,7 @@ def sample_random_requests(
     return input_requests
 
 
-def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
+def parse_image_resolution(image_resolution: str) -> tuple[int, int]:
     """Parse image resolution into (width, height).
 
     Supports presets '1080p', '720p', '360p'. And custom 'heightxwidth' format (e.g., '1080x1920' means height=1080,
@@ -802,7 +808,7 @@ def sample_image_requests(
     image_format: str,
     image_resolution: str,
     backend: str,
-) -> List[DatasetRow]:
+) -> list[DatasetRow]:
     """Generate requests with images.
 
     - Each request includes ``image_count`` images.
@@ -836,7 +842,7 @@ def sample_image_requests(
         num=num_requests,
     )
 
-    def _gen_random_image_data_uri(width: int = width, height: int = height) -> Tuple[Image.Image, str, int]:
+    def _gen_random_image_data_uri(width: int = width, height: int = height) -> tuple[Image.Image, str, int]:
         if image_content == 'blank':
             # Generate blank white image
             arr = np.full((height, width, 3), 255, dtype=np.uint8)
@@ -851,7 +857,7 @@ def _gen_random_image_data_uri(width: int = width, height: int = height) -> Tupl
         image_bytes = len(image_data.encode('utf-8'))
         return img, image_data, image_bytes
 
-    dataset: List[DatasetRow] = []
+    dataset: list[DatasetRow] = []
     total_image_bytes = 0
     for i in range(num_requests):
         # Generate text prompt
@@ -885,7 +891,7 @@ def _gen_random_image_data_uri(width: int = width, height: int = height) -> Tupl
 
 
 async def get_request(
-    input_requests: List[DatasetRow],
+    input_requests: list[DatasetRow],
     request_rate: float,
 ) -> AsyncGenerator[DatasetRow, None]:
     input_requests = iter(input_requests)
@@ -903,22 +909,22 @@ async def get_request(
 
 
 def calculate_metrics(
-    input_requests: List[DatasetRow],
-    outputs: List[RequestFuncOutput],
+    input_requests: list[DatasetRow],
+    outputs: list[RequestFuncOutput],
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
     backend: str,
-) -> Tuple[BenchmarkMetrics, List[int]]:
-    output_lens: List[int] = []
-    retokenized_output_lens: List[int] = []
+) -> tuple[BenchmarkMetrics, list[int]]:
+    output_lens: list[int] = []
+    retokenized_output_lens: list[int] = []
     total_input = 0
     total_input_text = 0
     total_input_vision = 0
     completed = 0
-    itls: List[float] = []
-    tpots: List[float] = []
-    ttfts: List[float] = []
-    e2e_latencies: List[float] = []
+    itls: list[float] = []
+    tpots: list[float] = []
+    ttfts: list[float] = []
+    e2e_latencies: list[float] = []
 
     for i in range(len(outputs)):
         if outputs[i].success:
@@ -982,10 +988,10 @@ async def benchmark(
     api_url: str,
     model_id: str,
     tokenizer: PreTrainedTokenizerBase,
-    input_requests: List[DatasetRow],
+    input_requests: list[DatasetRow],
     request_rate: float,
     disable_tqdm: bool,
-    extra_request_body: Dict[str, Any],
+    extra_request_body: dict[str, Any],
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -1018,7 +1024,7 @@ async def benchmark(
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
     benchmark_start_time = time.perf_counter()
-    tasks: List[asyncio.Task] = []
+    tasks: list[asyncio.Task] = []
     async for request in get_request(input_requests, request_rate):
         request_func_input = RequestFuncInput(
             model=model_id,
@@ -1030,7 +1036,7 @@ async def benchmark(
             extra_request_body=extra_request_body,
         )
         tasks.append(asyncio.create_task(request_func(request_func_input=request_func_input, pbar=pbar)))
-    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
+    outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
 
     if pbar is not None:
         pbar.close()
diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py
index 78f545072b..c56b14d5c1 100644
--- a/benchmark/profile_throughput.py
+++ b/benchmark/profile_throughput.py
@@ -5,7 +5,6 @@
 import os
 import random
 from queue import Queue
-from typing import List, Optional, Tuple, Union
 
 import numpy as np
 from tqdm import tqdm
@@ -25,8 +24,8 @@ def sample_sharegpt_requests(
     dataset_path: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
-    fixed_output_len: Optional[int] = None,
-) -> List[Tuple[str, int, int]]:
+    fixed_output_len: int | None = None,
+) -> list[tuple[str, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError('output_len too small')
     # Load the dataset.
@@ -41,7 +40,7 @@ def sample_sharegpt_requests(
     random.shuffle(dataset)
 
     # Filter out sequences that are too long or too short
-    filtered_dataset: List[Tuple[str, int, int]] = []
+    filtered_dataset: list[tuple[str, int, int]] = []
     for i in range(len(dataset)):
         if len(filtered_dataset) == num_requests:
             break
@@ -73,7 +72,7 @@ def sample_random_requests(
     range_ratio: float,
     tokenizer: PreTrainedTokenizerBase,
     dataset_path: str,
-) -> List[Tuple[str, int, int]]:
+) -> list[tuple[str, int, int]]:
 
     input_lens = np.random.randint(
         max(int(input_len * range_ratio), 1),
@@ -104,7 +103,7 @@ def sample_random_requests(
         random.shuffle(dataset)
 
         # Filter out sequences that are too long or too short
-        input_requests: List[Tuple[str, int, int]] = []
+        input_requests: list[tuple[str, int, int]] = []
         for i in range(num_prompts):
             # Tokenize the prompts and completions.
             prompt = dataset[i][0]
@@ -134,7 +133,7 @@ def sample_random_requests(
 
 class Engine:
 
-    def __init__(self, model_path: str, engine_config: Union[PytorchEngineConfig, TurbomindEngineConfig]):
+    def __init__(self, model_path: str, engine_config: PytorchEngineConfig | TurbomindEngineConfig):
         self.tokenizer = Tokenizer(model_path)
         if isinstance(engine_config, TurbomindEngineConfig):
             from lmdeploy.turbomind import TurboMind
diff --git a/docs/en/conf.py b/docs/en/conf.py
index 94ca2a4def..095173d32b 100644
--- a/docs/en/conf.py
+++ b/docs/en/conf.py
@@ -25,7 +25,7 @@
 from lmdeploy.serve.proxy.proxy import app as proxy_server  # noqa: E402
 
 version_file = '../../lmdeploy/version.py'
-with open(version_file, 'r') as f:
+with open(version_file) as f:
     exec(compile(f.read(), version_file, 'exec'))
 __version__ = locals()['__version__']
 
diff --git a/docs/zh_cn/conf.py b/docs/zh_cn/conf.py
index 5db30be50d..202fb138a5 100644
--- a/docs/zh_cn/conf.py
+++ b/docs/zh_cn/conf.py
@@ -25,7 +25,7 @@
 from lmdeploy.serve.proxy.proxy import app as proxy_server  # noqa: E402
 
 version_file = '../../lmdeploy/version.py'
-with open(version_file, 'r') as f:
+with open(version_file) as f:
     exec(compile(f.read(), version_file, 'exec'))
 __version__ = locals()['__version__']
 
diff --git a/eval/eval.py b/eval/eval.py
index 53a2bdb9af..c0b4ea2dd6 100644
--- a/eval/eval.py
+++ b/eval/eval.py
@@ -66,7 +66,7 @@ def read_config():
 
     # Read config file content
     try:
-        with open(config_path, 'r', encoding='utf-8') as f:
+        with open(config_path, encoding='utf-8') as f:
             config_content = f.read()
         return config_content
     except FileNotFoundError:
diff --git a/lmdeploy/api.py b/lmdeploy/api.py
index 4f0ff34315..11f31c1de4 100644
--- a/lmdeploy/api.py
+++ b/lmdeploy/api.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, List, Literal
+from typing import TYPE_CHECKING, Literal
 
 from typing_extensions import deprecated
 
@@ -13,13 +13,14 @@
 
 
 def pipeline(model_path: str,
-             backend_config: 'TurbomindEngineConfig' | 'PytorchEngineConfig' | None = None,
-             chat_template_config: 'ChatTemplateConfig' | None = None,
+             backend_config: TurbomindEngineConfig | PytorchEngineConfig | None = None,
+             chat_template_config: ChatTemplateConfig | None = None,
              log_level: str = 'WARNING',
              max_log_len: int | None = None,
-             speculative_config: 'SpeculativeConfig' | None = None,
+             speculative_config: SpeculativeConfig | None = None,
              **kwargs):
-    """
+    """Create a pipeline for inference.
+
     Args:
         model_path: the path of a model. It could be one of the following options:
 
@@ -34,14 +35,17 @@ def pipeline(model_path: str,
               on huggingface.co, such as ``internlm/internlm-chat-7b``,
               ``Qwen/Qwen-7B-Chat``, ``baichuan-inc/Baichuan2-7B-Chat``
               and so on.
-        backend_config: backend
-            config instance. Default to None.
-        chat_template_config: chat template configuration.
-            Default to None.
+        backend_config: backend config instance. Default to None.
+        chat_template_config: chat template configuration. Default to None.
         log_level: set log level whose value among [``CRITICAL``, ``ERROR``,
             ``WARNING``, ``INFO``, ``DEBUG``]
         max_log_len: Max number of prompt characters or prompt tokens
-            being printed in log
+            being printed in log.
+        speculative_config: speculative decoding configuration.
+        **kwargs: additional keyword arguments passed to the pipeline.
+
+    Returns:
+        Pipeline: a pipeline instance for inference.
 
     Examples:
 
@@ -62,8 +66,7 @@ def pipeline(model_path: str,
             im = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg')
             response = pipe([('describe this image', [im])])
             print(response)
-
-    """ # noqa E501
+    """  # noqa E501
 
     return Pipeline(model_path,
                     backend_config=backend_config,
@@ -78,12 +81,12 @@ def pipeline(model_path: str,
 def serve(model_path: str,
           model_name: str | None = None,
           backend: Literal['turbomind', 'pytorch'] = 'turbomind',
-          backend_config: 'TurbomindEngineConfig' | 'PytorchEngineConfig' | None = None,
-          chat_template_config: 'ChatTemplateConfig' | None = None,
+          backend_config: TurbomindEngineConfig | PytorchEngineConfig | None = None,
+          chat_template_config: ChatTemplateConfig | None = None,
           server_name: str = '0.0.0.0',
           server_port: int = 23333,
           log_level: str = 'ERROR',
-          api_keys: List[str] | str | None = None,
+          api_keys: list[str] | str | None = None,
           ssl: bool = False,
           **kwargs):
     """This function is deprecated and no longer available.
@@ -106,11 +109,13 @@ def client(api_server_url: str = 'http://0.0.0.0:23333', api_key: str | None = N
 
     Args:
         api_server_url: communicating address ``http://<ip>:<port>`` of
-            api_server
+            api_server.
         api_key: api key. Default to None, which means no
             api key will be used.
-    Return:
-        Chatbot for LLaMA series models with turbomind as inference engine.
+
+    Raises:
+        NotImplementedError: This function has been deprecated and removed.
+            Use ``from lmdeploy.serve import APIClient`` instead.
     """
     raise NotImplementedError("The 'client' function is no longer available. This function has been deprecated. "
                               ' Please use "from lmdeploy.serve import APIClient" instead.')
diff --git a/lmdeploy/archs.py b/lmdeploy/archs.py
index a4fe0d2333..80707cd41f 100644
--- a/lmdeploy/archs.py
+++ b/lmdeploy/archs.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import os
-from typing import Dict, List, Literal, Tuple
+from typing import Literal
 
 from transformers import AutoConfig
 
@@ -58,7 +58,7 @@ def autoget_backend(model_path: str) -> Literal['turbomind', 'pytorch']:
 def autoget_backend_config(
     model_path: str,
     backend_config: PytorchEngineConfig | TurbomindEngineConfig | None = None
-) -> Tuple[Literal['turbomind', 'pytorch'], PytorchEngineConfig | TurbomindEngineConfig]:
+) -> tuple[Literal['turbomind', 'pytorch'], PytorchEngineConfig | TurbomindEngineConfig]:
     """Get backend config automatically.
 
     Args:
@@ -78,7 +78,7 @@ def autoget_backend_config(
     backend = autoget_backend(model_path)
     config = PytorchEngineConfig() if backend == 'pytorch' else TurbomindEngineConfig()
     if backend_config is not None:
-        if type(backend_config) == type(config):
+        if type(backend_config) is type(config):
             config = backend_config
         else:
             data = asdict(backend_config)
@@ -176,15 +176,15 @@ def get_model_arch(model_path: str):
 def search_nested_config(config, key):
     """Recursively searches for the value associated with the given key in a
     nested configuration of a model."""
-    if isinstance(config, Dict):
+    if isinstance(config, dict):
         for k, v in config.items():
             if k == key:
                 return v
-            if isinstance(v, (Dict, List)):
+            if isinstance(v, (dict, list)):
                 result = search_nested_config(v, key)
                 if result is not None:
                     return result
-    elif isinstance(config, List):
+    elif isinstance(config, list):
         for item in config:
             result = search_nested_config(item, key)
             if result is not None:
diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py
index 76adaca501..e0702aeeaf 100644
--- a/lmdeploy/cli/cli.py
+++ b/lmdeploy/cli/cli.py
@@ -3,11 +3,16 @@
 import os
 
 from ..version import __version__
-from .utils import (ArgumentHelper, DefaultsAndTypesHelpFormatter, FlexibleArgumentParser, convert_args,
-                    get_speculative_config)
+from .utils import (
+    ArgumentHelper,
+    DefaultsAndTypesHelpFormatter,
+    FlexibleArgumentParser,
+    convert_args,
+    get_speculative_config,
+)
 
 
-class CLI(object):
+class CLI:
     _desc = 'The CLI provides a unified API for converting, ' \
             'compressing and deploying large language models.'
     parser = FlexibleArgumentParser(prog='lmdeploy', description=_desc, add_help=True)
@@ -124,8 +129,7 @@ def get_gpu_topo():
             if sys.platform.startswith('linux'):
                 try:
                     res = subprocess.run(['nvidia-smi', 'topo', '-m'],
-                                         stdout=subprocess.PIPE,
-                                         stderr=subprocess.PIPE,
+                                         capture_output=True,
                                          text=True,
                                          check=True)
                     if res.returncode == 0:
diff --git a/lmdeploy/cli/lite.py b/lmdeploy/cli/lite.py
index 768ef47544..0143453aab 100644
--- a/lmdeploy/cli/lite.py
+++ b/lmdeploy/cli/lite.py
@@ -3,7 +3,7 @@
 from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args
 
 
-class SubCliLite(object):
+class SubCliLite:
     """CLI for compressing LLMs."""
     _help = 'Compressing and accelerating LLMs with lmdeploy.lite module'
     _desc = _help
diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py
index 3488281f42..155392f4a7 100644
--- a/lmdeploy/cli/serve.py
+++ b/lmdeploy/cli/serve.py
@@ -3,8 +3,14 @@
 from lmdeploy.utils import get_max_batch_size
 
 from .cli import CLI
-from .utils import (ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args, get_chat_template, get_lora_adapters,
-                    get_speculative_config)
+from .utils import (
+    ArgumentHelper,
+    DefaultsAndTypesHelpFormatter,
+    convert_args,
+    get_chat_template,
+    get_lora_adapters,
+    get_speculative_config,
+)
 
 
 class SubCliServe:
diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py
index 2ccc2c6f2a..6dfeb61aa3 100644
--- a/lmdeploy/cli/utils.py
+++ b/lmdeploy/cli/utils.py
@@ -5,7 +5,7 @@
 import re
 import sys
 from collections import defaultdict
-from typing import Any, List
+from typing import Any
 
 from lmdeploy.utils import get_logger
 
@@ -39,14 +39,14 @@ def convert_args(args):
     return kwargs
 
 
-def get_lora_adapters(adapters: List[str]):
+def get_lora_adapters(adapters: list[str]):
     """Parse lora adapers from cli input.
 
     Args:
-        adapters (List[str]): CLI input string of lora adapter path(s).
+        adapters (list[str]): CLI input string of lora adapter path(s).
 
     Returns:
-        Dict[str,str] or None: Parsed lora adapter path(s).
+        dict[str, str] | None: Parsed lora adapter path(s).
     """
     if not adapters:
         return None
@@ -435,7 +435,7 @@ def calib_search_scale(parser):
         )
 
     @staticmethod
-    def device(parser, default: str = 'cuda', choices: List[str] = ['cuda', 'ascend', 'maca', 'camb']):
+    def device(parser, default: str = 'cuda', choices: list[str] = ['cuda', 'ascend', 'maca', 'camb']):
         """Add argument device to parser."""
 
         return parser.add_argument('--device',
diff --git a/lmdeploy/lite/apis/calibrate.py b/lmdeploy/lite/apis/calibrate.py
index e8dd1fca23..b5553df10b 100644
--- a/lmdeploy/lite/apis/calibrate.py
+++ b/lmdeploy/lite/apis/calibrate.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from pathlib import Path
-from typing import Literal, Union
+from typing import Literal
 
 import torch
 from torch import nn
@@ -76,7 +76,7 @@
 
 
 def _prepare_for_calibrate(model: nn.Module,
-                           layer_type: Union[str, type],
+                           layer_type: str | type,
                            head_name: str = 'lm_head',
                            device: str = 'cuda',
                            prefix: str = '') -> None:
@@ -95,7 +95,7 @@ def _prepare_for_calibrate(model: nn.Module,
     ----------
     model : nn.Module
         The PyTorch model to prepare for calibration.
-    layer_type : Union[str, Type]
+    layer_type : str | type
         The type of the layer to be moved to CPU. Can be either a string of
         class name or the class type itself.
     head_name : str, optional
diff --git a/lmdeploy/lite/apis/get_small_sharded_hf.py b/lmdeploy/lite/apis/get_small_sharded_hf.py
index 7c5ce8eba3..2d1bebaac1 100644
--- a/lmdeploy/lite/apis/get_small_sharded_hf.py
+++ b/lmdeploy/lite/apis/get_small_sharded_hf.py
@@ -41,7 +41,7 @@ def main():
         state_dict = torch.load(os.path.join(args.src_dir, ckpt), map_location='cuda', weights_only=True)
         keys = sorted(list(state_dict.keys()))
         for k in keys:
-            new_state_dict_name = 'pytorch_model-{:05d}-of-{:05d}.bin'.format(cnt, n_shard)
+            new_state_dict_name = f'pytorch_model-{cnt:05d}-of-{n_shard:05d}.bin'
             new_index['weight_map'][k] = new_state_dict_name
             new_state_dict = {k: state_dict[k]}
             torch.save(new_state_dict, os.path.join(args.dst_dir, new_state_dict_name))
diff --git a/lmdeploy/lite/quantization/awq.py b/lmdeploy/lite/quantization/awq.py
index c1acafe601..8ee3990409 100644
--- a/lmdeploy/lite/quantization/awq.py
+++ b/lmdeploy/lite/quantization/awq.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import List
 
 import torch
 
@@ -151,7 +150,7 @@ def get_weight_scale(weight, q_group_size=-1):
 
 @torch.no_grad()
 def smooth_ln_fcs(ln: torch.nn.Module,
-                  fcs: List[torch.nn.Module],
+                  fcs: list[torch.nn.Module],
                   act_scales: torch.Tensor,
                   group_size: int = -1,
                   alpha: float = 0.5) -> torch.Tensor:
@@ -204,7 +203,7 @@ def smooth_ln_fcs(ln: torch.nn.Module,
 
 @torch.no_grad()
 def smooth_fc_fcs(pre_fc: torch.nn.Module,
-                  fcs: List[torch.nn.Module],
+                  fcs: list[torch.nn.Module],
                   act_scales: torch.Tensor,
                   group_size: int = -1,
                   alpha: float = 0.5) -> torch.Tensor:
diff --git a/lmdeploy/lite/quantization/calibration.py b/lmdeploy/lite/quantization/calibration.py
index de83f29d87..44575d9c7b 100644
--- a/lmdeploy/lite/quantization/calibration.py
+++ b/lmdeploy/lite/quantization/calibration.py
@@ -1,7 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
 from functools import partial
-from typing import Union
 
 import torch
 from torch import nn
@@ -9,11 +8,15 @@
 
 from lmdeploy.lite.quantization.activation import ActivationObserver
 from lmdeploy.lite.quantization.awq import FC_FCS_MAP, NORM_FCS_MAP
-from lmdeploy.lite.utils import (bimap_name_mod, collect_target_modules, concat_decoder_layer_outputs,
-                                 split_decoder_layer_inputs)
+from lmdeploy.lite.utils import (
+    bimap_name_mod,
+    collect_target_modules,
+    concat_decoder_layer_outputs,
+    split_decoder_layer_inputs,
+)
 
 
-class CalibrationContext():
+class CalibrationContext:
     """Calibration context manager for model quantization.
 
     Parameters:
@@ -30,8 +33,8 @@ class CalibrationContext():
     def __init__(self,
                  model: nn.Module,
                  tokenizer: PreTrainedTokenizer,
-                 layer_type: Union[str, type],
-                 norm_type: Union[str, type],
+                 layer_type: str | type,
+                 norm_type: str | type,
                  batch_size: int = 1,
                  device: str = 'cuda',
                  **kwargs) -> None:
@@ -40,8 +43,8 @@ def __init__(self,
         Args:
             model (nn.Module): Model to be calibrated.
             tokenizer (PreTrainedTokenizer): Tokenizer of the given model.
-            layer_type (Union[str, type]): Type of the layers to be observed.
-            norm_type (Union[str, type]): Norm type used in the model.
+            layer_type (str | type): Type of the layers to be observed.
+            norm_type (str | type): Norm type used in the model.
             batch_size (int): The batch size for running the calib samples.
                 Low GPU mem requires small batch_size. Large batch_size
                 reduces the calibration time while costs more VRAM.
@@ -201,7 +204,7 @@ def export(self, out_dir):
         to specified directory.
 
         Args:
-            out_dir (Union[str, Path]): The directory path where the stats
+            out_dir (str | Path): The directory path where the stats
                 will be saved.
         """
 
@@ -339,8 +342,8 @@ class CalibrationContextV2(CalibrationContext):
     def __init__(self,
                  model: nn.Module,
                  tokenizer: PreTrainedTokenizer,
-                 layer_type: Union[str, type],
-                 norm_type: Union[str, type],
+                 layer_type: str | type,
+                 norm_type: str | type,
                  batch_size: int = 1,
                  device: str = 'cuda',
                  search_scale: bool = True,
@@ -374,7 +377,7 @@ def export(self, out_dir):
         to specified directory.
 
         Args:
-            out_dir (Union[str, Path]): The directory path where the stats
+            out_dir (str | Path): The directory path where the stats
                 will be saved.
         """
         inputs_stats = {
diff --git a/lmdeploy/lite/quantization/modules/linear.py b/lmdeploy/lite/quantization/modules/linear.py
index 854d4cc51f..8041020201 100644
--- a/lmdeploy/lite/quantization/modules/linear.py
+++ b/lmdeploy/lite/quantization/modules/linear.py
@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Optional, Type, TypeVar
+from typing import TypeVar
 
 import torch
 from torch import nn
@@ -22,14 +22,14 @@ class WeightOnlyQLinear(nn.Module):
         group_size (int): size of the quantization group.
         in_features (int): size of each input sample.
         out_features (int): size of each output sample.
-        bias (Tensor, optional): Defaults to None.
+        bias (bool): Defaults to True.
     """
 
     def __init__(
         self,
         in_features: int,
         out_features: int,
-        bias: Optional[torch.Tensor] = True,
+        bias: bool = True,
         w_bit: int = 4,
         symmetry: bool = False,
         group_size: int = 128,
@@ -71,11 +71,11 @@ def __init__(
             self.qzeros = None
 
     @classmethod
-    def from_linear(cls: Type['WeightOnlyQLinear'],
+    def from_linear(cls: type['WeightOnlyQLinear'],
                     linear: nn.Linear,
                     quantizer: TypeVar('Quantizer'),
                     awq_layout: bool = True,
-                    qparams: Optional[QParams] = None) -> 'WeightOnlyQLinear':
+                    qparams: QParams | None = None) -> 'WeightOnlyQLinear':
         """Create a WeightOnlyQLinear object from a PyTorch Linear object.
 
         Args:
diff --git a/lmdeploy/lite/quantization/weight/quant_utils.py b/lmdeploy/lite/quantization/weight/quant_utils.py
index 934a569578..1d873b6ed3 100644
--- a/lmdeploy/lite/quantization/weight/quant_utils.py
+++ b/lmdeploy/lite/quantization/weight/quant_utils.py
@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Optional, Sequence, Union
+from collections.abc import Sequence
 
 import torch
 
@@ -29,8 +29,8 @@ def fast_round_scale_torch(amax: torch.Tensor, fp8_max: torch.Tensor) -> torch.T
 
 def _get_quant_scaling(weight: torch.Tensor,
                        fp8_dtype: torch.dtype,
-                       dim: Union[int, Sequence[int]],
-                       scale_fmt: Optional[str] = None):
+                       dim: int | Sequence[int],
+                       scale_fmt: str | None = None):
     """Get the scaling factor for FP8 quantization."""
     finfo = torch.finfo(fp8_dtype)
     fmax = finfo.max
@@ -47,7 +47,7 @@ def _get_quant_scaling(weight: torch.Tensor,
 def quant_blocked_fp8(weight: torch.Tensor,
                       fp8_dtype: torch.dtype,
                       block_size: int = 128,
-                      scale_fmt: Optional[str] = None):
+                      scale_fmt: str | None = None):
     """Quantize the weight tensor to blocked FP8 format."""
     assert scale_fmt in (None, 'ue8m0'), f'Unsupported scale_fmt: {scale_fmt}'
 
diff --git a/lmdeploy/lite/quantization/weight/quantizer.py b/lmdeploy/lite/quantization/weight/quantizer.py
index 0e492ad413..2bbc3fd122 100644
--- a/lmdeploy/lite/quantization/weight/quantizer.py
+++ b/lmdeploy/lite/quantization/weight/quantizer.py
@@ -1,12 +1,19 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Callable, Dict, Optional
+from collections.abc import Callable
 
 import torch
 
-from lmdeploy.lite.utils import (QParams, cal_qparams_per_channel_absmax, cal_qparams_per_channel_minmax,
-                                 cal_qparams_per_group_absmax, cal_qparams_per_group_minmax,
-                                 cal_qparams_per_tensor_absmax, cal_qparams_per_tensor_minmax, precise_round)
+from lmdeploy.lite.utils import (
+    QParams,
+    cal_qparams_per_channel_absmax,
+    cal_qparams_per_channel_minmax,
+    cal_qparams_per_group_absmax,
+    cal_qparams_per_group_minmax,
+    cal_qparams_per_tensor_absmax,
+    cal_qparams_per_tensor_minmax,
+    precise_round,
+)
 from lmdeploy.lite.utils.global_avail import GlobalAvailMixin
 
 
@@ -24,7 +31,7 @@ class WeightQuantizer(GlobalAvailMixin):
             use min-max scaling.
         granularity (str): The granularity of quantization. Available options
             are 'per_channel', 'per_tensor', and 'per_group'.
-        group_size (Optional[int]): If using 'per_group' quantization, this is
+        group_size (int | None): If using 'per_group' quantization, this is
             the number of channels in each group.
 
     Example:
@@ -41,7 +48,7 @@ class WeightQuantizer(GlobalAvailMixin):
         quantized_weights = quantizer.fake_quant(weights, qparams)
     """
 
-    CAL_FUNC_MAP: Dict[str, Dict[str, Callable]] = {
+    CAL_FUNC_MAP: dict[str, dict[str, Callable]] = {
         'per_group': {
             'absmax': cal_qparams_per_group_absmax,
             'minmax': cal_qparams_per_group_minmax,
@@ -56,7 +63,7 @@ class WeightQuantizer(GlobalAvailMixin):
         },
     }
 
-    def __init__(self, bits: int, symmetry: bool, granularity: str, group_size: Optional[int] = -1):
+    def __init__(self, bits: int, symmetry: bool, granularity: str, group_size: int | None = -1):
 
         assert bits in [4, 8], "The 'bits' argument must be either 4 or 8."
         self.bits = bits
@@ -95,13 +102,13 @@ def calculate_qparams(self, weight: torch.Tensor) -> QParams:
         else:
             return cal_func(weight, self.bits)
 
-    def quant(self, weight: torch.Tensor, qparams: Optional[QParams] = None, real: bool = False) -> torch.Tensor:
+    def quant(self, weight: torch.Tensor, qparams: QParams | None = None, real: bool = False) -> torch.Tensor:
         """Perform fake quantization on the given weight tensor.
 
         Args:
             weight (torch.Tensor): The weight tensor with shape
                 (out_features, in_features).
-            qparams (Optional[QParams]): A namedtuple containing 'scales'
+            qparams (QParams | None): A namedtuple containing 'scales'
                 and 'zero_points'.
             real (bool): If True, return the tensor with quantized type.
 
diff --git a/lmdeploy/lite/utils/__init__.py b/lmdeploy/lite/utils/__init__.py
index 846964fb22..d801a1f447 100644
--- a/lmdeploy/lite/utils/__init__.py
+++ b/lmdeploy/lite/utils/__init__.py
@@ -1,9 +1,16 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
 from .batch_split import concat_decoder_layer_outputs, split_decoder_layer_inputs
-from .cal_qparams import (QParams, cal_qparams_per_channel_absmax, cal_qparams_per_channel_minmax,
-                          cal_qparams_per_group_absmax, cal_qparams_per_group_minmax, cal_qparams_per_tensor_absmax,
-                          cal_qparams_per_tensor_minmax, precise_round)
+from .cal_qparams import (
+                          QParams,
+                          cal_qparams_per_channel_absmax,
+                          cal_qparams_per_channel_minmax,
+                          cal_qparams_per_group_absmax,
+                          cal_qparams_per_group_minmax,
+                          cal_qparams_per_tensor_absmax,
+                          cal_qparams_per_tensor_minmax,
+                          precise_round,
+)
 from .calib_dataloader import get_calib_loaders
 from .collect import bimap_name_mod, collect_target_modules, collect_target_weights
 from .global_avail import GlobalAvailMixin
diff --git a/lmdeploy/lite/utils/batch_split.py b/lmdeploy/lite/utils/batch_split.py
index 5390a7f7d9..f06efaee62 100644
--- a/lmdeploy/lite/utils/batch_split.py
+++ b/lmdeploy/lite/utils/batch_split.py
@@ -1,22 +1,22 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Any, Dict, List, Tuple, Union
+from typing import Any
 
 import torch
 
 
-def split_decoder_layer_inputs(batch_size, *args: Union[torch.Tensor, Any],
-                               **kwargs: Union[torch.Tensor, Any]) -> Tuple[List[List[Any]], List[Dict[str, Any]]]:
+def split_decoder_layer_inputs(batch_size, *args: torch.Tensor | Any,
+                               **kwargs: torch.Tensor | Any) -> tuple[list[list[Any]], list[dict[str, Any]]]:
     """This function splits batched decoder layer inputs into individual
     elements.
 
     Args:
-        *args (Union[torch.Tensor, Any]): Positional arguments which could
+        *args (torch.Tensor | Any): Positional arguments which could
             be a mix of tensors and other types.
-        **kwargs (Union[torch.Tensor, Any]): Keyword arguments which could
+        **kwargs (torch.Tensor | Any): Keyword arguments which could
             be a mix of tensors and other types.
 
     Returns:
-        Tuple[List[List[Any]], List[Dict[str, Any]]]: A tuple containing two
+        tuple[list[list[Any]], list[dict[str, Any]]]: A tuple containing two
             lists, one for positional arguments, one for keyword arguments.
             Each list contains individual elements from the batch.
     """
@@ -46,7 +46,7 @@ def split_decoder_layer_inputs(batch_size, *args: Union[torch.Tensor, Any],
                 new_kwargs[name] = val[i:i + batch_size]
             elif isinstance(val, torch.Tensor) and len(val.shape) > 1 and val.size(1) == bs:  # qwen2-vl
                 new_kwargs[name] = val[:, i:i + batch_size]
-            elif name == 'position_embeddings' and isinstance(val, Tuple) and len(
+            elif name == 'position_embeddings' and isinstance(val, tuple) and len(
                     val[0].shape) > 1 and val[0].size(1) == bs:  # qwen2-vl
                 new_kwargs[name] = (val[0][:, i:i + batch_size], val[1][:, i:i + batch_size])
             else:
@@ -58,12 +58,12 @@ def split_decoder_layer_inputs(batch_size, *args: Union[torch.Tensor, Any],
     return batch_args, batch_kwargs
 
 
-def concat_decoder_layer_outputs(batch_outputs: List[Any]) -> Any:
+def concat_decoder_layer_outputs(batch_outputs: list[Any]) -> Any:
     """This function concatenates individual decoder layer outputs into a
     batched output.
 
     Args:
-        batch_outputs (List[Any]): A list, where each tuple
+        batch_outputs (list[Any]): A list, where each tuple
             represents the output from an individual element in the batch.
 
     Returns:
diff --git a/lmdeploy/lite/utils/cal_qparams.py b/lmdeploy/lite/utils/cal_qparams.py
index 38a21b8dd0..33326e13a1 100644
--- a/lmdeploy/lite/utils/cal_qparams.py
+++ b/lmdeploy/lite/utils/cal_qparams.py
@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import NamedTuple, Optional
+from typing import NamedTuple
 
 import torch
 
@@ -8,7 +8,7 @@ class QParams(NamedTuple):
     """A class to hold the quantization parameters."""
 
     scales: torch.Tensor
-    zero_points: Optional[torch.Tensor]
+    zero_points: torch.Tensor | None
 
 
 @torch.no_grad()
diff --git a/lmdeploy/lite/utils/collect.py b/lmdeploy/lite/utils/collect.py
index 3351bfb5a0..8d421049c1 100644
--- a/lmdeploy/lite/utils/collect.py
+++ b/lmdeploy/lite/utils/collect.py
@@ -1,13 +1,12 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict, List, Tuple, Union
 
 from torch import nn
 
 
 def collect_target_modules(model: nn.Module,
-                           target: Union[str, type],
-                           skip_names: List[str] = [],
-                           prefix: str = '') -> Dict[str, nn.Module]:
+                           target: str | type,
+                           skip_names: list[str] = [],
+                           prefix: str = '') -> dict[str, nn.Module]:
     """Collects the specific target modules from the model.
 
     Args:
@@ -38,7 +37,7 @@ def _is_target(n, m):
     return name2mod
 
 
-def collect_target_weights(model: nn.Module, target: Union[str, type], skip_names: List[str]) -> Dict[str, nn.Module]:
+def collect_target_weights(model: nn.Module, target: str | type, skip_names: list[str]) -> dict[str, nn.Module]:
     """Collects weights of the specific target modules from the model.
 
     Args:
@@ -61,7 +60,7 @@ def collect_target_weights(model: nn.Module, target: Union[str, type], skip_name
     return mod2weight
 
 
-def bimap_name_mod(name2mod_mappings: List[Dict[str, nn.Module]]) -> Tuple[Dict[str, nn.Module], Dict[nn.Module, str]]:
+def bimap_name_mod(name2mod_mappings: list[dict[str, nn.Module]]) -> tuple[dict[str, nn.Module], dict[nn.Module, str]]:
     """Generates bidirectional maps from module names to module instances and
     vice versa.
 
diff --git a/lmdeploy/lite/utils/global_avail.py b/lmdeploy/lite/utils/global_avail.py
index 3b608afa23..462125a676 100644
--- a/lmdeploy/lite/utils/global_avail.py
+++ b/lmdeploy/lite/utils/global_avail.py
@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict, Union
+from typing import Union
 
 from torch import nn
 
@@ -7,13 +7,13 @@
 class GlobalAvailMixin:
     """Mixin class to make instances globally available."""
 
-    _instances: Dict[str, Dict[Union[str, nn.Module], 'GlobalAvailMixin']] = {'default': {}}
+    _instances: dict[str, dict[str | nn.Module, 'GlobalAvailMixin']] = {'default': {}}
 
-    def global_available(self, key: Union[str, nn.Module] = 'default', group: str = 'default') -> None:
+    def global_available(self, key: str | nn.Module = 'default', group: str = 'default') -> None:
         """Make the instance globally available.
 
         Args:
-            key (Union[str, nn.Module], optional): Key to save the instance.
+            key (str | nn.Module, optional): Key to save the instance.
                 Defaults to 'default'.
             group (str, optional): Group to save the instance.
                 Defaults to 'default'.
@@ -23,13 +23,13 @@ def global_available(self, key: Union[str, nn.Module] = 'default', group: str =
     @classmethod
     def _save_instance(cls,
                        instance: 'GlobalAvailMixin',
-                       key: Union[str, nn.Module] = 'default',
+                       key: str | nn.Module = 'default',
                        group: str = 'default') -> None:
         """Save the instance.
 
         Args:
             instance (GlobalAvailMixin): Instance to save.
-            key (Union[str, nn.Module], optional): Key to save the instance.
+            key (str | nn.Module, optional): Key to save the instance.
                 Defaults to 'default'.
             group (str, optional): Group to save the instance.
                 Defaults to 'default'.
@@ -41,35 +41,35 @@ def _save_instance(cls,
         cls._instances[group][key] = instance
 
     @classmethod
-    def find(cls, key: Union[str, nn.Module] = 'default', group: str = 'default') -> Union[None, 'GlobalAvailMixin']:
+    def find(cls, key: str | nn.Module = 'default', group: str = 'default') -> Union[None, 'GlobalAvailMixin']:
         """Find an instance by its key and group.
 
         Args:
-            key (Union[str, nn.Module], optional): Key of the instance.
+            key (str | nn.Module, optional): Key of the instance.
                 Defaults to 'default'.
             group (str, optional): Group of the instance.
                 Defaults to 'default'.
 
         Returns:
-            Union[None, GlobalAvailMixin]: The found instance, or None if
+            None | GlobalAvailMixin: The found instance, or None if
                 it does not exist.
         """
         return cls._instances.get(group, {}).get(key)
 
     @classmethod
-    def find_group(cls, group: str) -> Dict[Union[str, nn.Module], 'GlobalAvailMixin']:
+    def find_group(cls, group: str) -> dict[str | nn.Module, 'GlobalAvailMixin']:
         """Find all instances in a group.
 
         Args:
             group (str): Group of the instances.
 
         Returns:
-            Dict[Union[str, nn.Module], GlobalAvailMixin]: All instances in
+            dict[str | nn.Module, GlobalAvailMixin]: All instances in
                 the group.
         """
         return cls._instances.get(group, {})
 
     @classmethod
-    def instances(cls) -> Dict[str, Dict[Union[str, nn.Module], 'GlobalAvailMixin']]:
+    def instances(cls) -> dict[str, dict[str | nn.Module, 'GlobalAvailMixin']]:
         """Get all instances."""
         return cls._instances
diff --git a/lmdeploy/lite/utils/memory_efficient.py b/lmdeploy/lite/utils/memory_efficient.py
index ae201f82e2..5c6431bcc4 100644
--- a/lmdeploy/lite/utils/memory_efficient.py
+++ b/lmdeploy/lite/utils/memory_efficient.py
@@ -4,7 +4,6 @@
 import warnings
 from contextlib import contextmanager
 from functools import partial
-from typing import List
 
 import torch
 from torch import nn
@@ -12,7 +11,7 @@
 from lmdeploy.lite.defaults import KV_CACHE_SIGNATURE, OFFLOAD_MOD
 
 
-def extract_return_values(module: nn.Module) -> List[str]:
+def extract_return_values(module: nn.Module) -> list[str]:
     """Extracts return values from given module's forward method.
 
     Args:
@@ -43,7 +42,7 @@ def find_kv_cache_idx(module: nn.Module) -> int:
     return signatures.index(KV_CACHE_SIGNATURE)
 
 
-def find_modules_by_return_value(model: nn.Module, value: str) -> List[nn.Module]:
+def find_modules_by_return_value(model: nn.Module, value: str) -> list[nn.Module]:
     """Finds modules in model that return given value.
 
     Args:
diff --git a/lmdeploy/logger.py b/lmdeploy/logger.py
index b52e586590..d758a69faa 100644
--- a/lmdeploy/logger.py
+++ b/lmdeploy/logger.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 # modify from https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/logger.py  # noqa
-from typing import List, Optional
 
 from .messages import GenerationConfig
 from .utils import get_logger
@@ -13,11 +12,11 @@ class RequestLogger:
     exceed a specified maximum length.
 
     Args:
-        max_log_len (Optional[int]): The maximum length of the log entries.
+        max_log_len (int | None): The maximum length of the log entries.
             If None, no maximum length is enforced.
     """
 
-    def __init__(self, max_log_len: Optional[int]) -> None:
+    def __init__(self, max_log_len: int | None) -> None:
         self.max_log_len = max_log_len
 
     def log_prompt(self, session_id: int, prompt: str) -> None:
@@ -31,7 +30,7 @@ def log_prompt(self, session_id: int, prompt: str) -> None:
         logger.info(f'session={session_id}, '
                     f'prompt={prompt!r}')
 
-    def log_inputs(self, session_id: int, prompt: Optional[str], prompt_token_ids: Optional[List[int]],
+    def log_inputs(self, session_id: int, prompt: str | None, prompt_token_ids: list[int] | None,
                    gen_config: GenerationConfig, adapter_name: str) -> None:
         max_log_len = self.max_log_len
         input_tokens = len(prompt_token_ids)
diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py
index b029d98c26..6dd34c8a6b 100644
--- a/lmdeploy/messages.py
+++ b/lmdeploy/messages.py
@@ -1,8 +1,9 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import enum
 import time
+from collections.abc import Callable
 from dataclasses import dataclass, field
-from typing import Any, Callable, Dict, List, Literal, Optional
+from typing import Any, Literal
 
 import torch
 from pydantic.dataclasses import dataclass as pydantic_dataclass
@@ -50,10 +51,10 @@ class GenerationConfig:
         random_seed: Seed used when sampling a token
         stop_words: Words that stop generating further tokens
         bad_words: Words that the engine will never generate
-        stop_token_ids: List of tokens that stop the generation
+        stop_token_ids: list of tokens that stop the generation
             when they are generated. The returned output will not contain
             the stop tokens.
-        bad_token_ids: List of tokens that the engine will never
+        bad_token_ids: list of tokens that the engine will never
             generate.
         min_new_tokens: The minimum numbers of tokens to generate,
             ignoring the number of tokens in the prompt.
@@ -109,16 +110,16 @@ class GenerationConfig:
     repetition_penalty: float = 1.0
     ignore_eos: bool = False
     random_seed: int = None
-    stop_words: List[str] = None
-    bad_words: List[str] = None
-    stop_token_ids: List[int] = None
-    bad_token_ids: List[int] = None
+    stop_words: list[str] = None
+    bad_words: list[str] = None
+    stop_token_ids: list[int] = None
+    bad_token_ids: list[int] = None
     min_new_tokens: int = None
     skip_special_tokens: bool = True
     spaces_between_special_tokens: bool = True
     logprobs: int = None
-    response_format: Optional[Dict] = None
-    logits_processors: Optional[List[LogitsProcessor]] = None
+    response_format: dict | None = None
+    logits_processors: list[LogitsProcessor] | None = None
     output_logits: Literal['all', 'generation'] = None
     output_last_hidden_state: Literal['all', 'generation'] = None
     include_stop_str_in_output: bool = False
@@ -126,7 +127,7 @@ class GenerationConfig:
     # for disaggregation
     with_cache: bool = False
     preserve_cache: bool = False
-    migration_request: Optional[MigrationRequest] = None
+    migration_request: MigrationRequest | None = None
 
     # router replay
     return_routed_experts: bool = False
@@ -141,7 +142,7 @@ def convert_stop_bad_words_to_ids(self, tokenizer: Tokenizer):
 
         def special_word_token_ids(words):
             if words is not None:
-                assert isinstance(words, List) and \
+                assert isinstance(words, list) and \
                     all(isinstance(elem, str) for elem in words), \
                     f'stop_words must be a list of str but got {type(words)}'
                 indexes = []
@@ -178,7 +179,7 @@ def update_from_hf_gen_cfg(self, generation_config, tokenizer_eos_token_id):
 
     def __post_init__(self):
         """Check input validation."""
-        assert type(self.n) == int and self.n > 0, 'n is not a positive integer'
+        assert type(self.n) is int and self.n > 0, 'n is not a positive integer'
         assert self.top_p >= 0 and self.top_p <= 1  # [0, 1]
         assert self.top_k >= 0, 'top_k can not be a negative integer'
         assert self.temperature >= 0 and self.temperature <= 2  # [0,2]
@@ -251,7 +252,7 @@ class TurbomindEngineConfig:
     """
 
     dtype: str = 'auto'
-    model_format: Optional[str] = None
+    model_format: str | None = None
     tp: int = 1
     dp: int = 1
     cp: int = 1
@@ -264,9 +265,9 @@ class TurbomindEngineConfig:
     outer_dp_size: int = None
     nnodes: int = 1
     node_rank: int = 0
-    dist_init_addr: Optional[str] = None
-    devices: List[int] = None
-    session_len: Optional[int] = None
+    dist_init_addr: str | None = None
+    devices: list[int] = None
+    session_len: int | None = None
     max_batch_size: int = None
     cache_max_entry_count: float = 0.8
     cache_chunk_size: int = -1
@@ -275,16 +276,16 @@ class TurbomindEngineConfig:
     quant_policy: int = 0
     rope_scaling_factor: float = 0.0
     use_logn_attn: bool = False
-    download_dir: Optional[str] = None
-    revision: Optional[str] = None
+    download_dir: str | None = None
+    revision: str | None = None
     max_prefill_token_num: int = 8192
     num_tokens_per_iter: int = 0
     max_prefill_iters: int = 1
     async_: int = 1
-    devices: Optional[List[int]] = None
+    devices: list[int] | None = None
     empty_init: bool = False
     communicator: str = 'nccl'
-    hf_overrides: Optional[Dict[str, Any]] = None
+    hf_overrides: dict[str, Any] | None = None
     enable_metrics: bool = True
 
     def __post_init__(self):
@@ -388,13 +389,13 @@ class PytorchEngineConfig:
     block_size: int = 64
     num_cpu_blocks: int = 0
     num_gpu_blocks: int = 0
-    adapters: Dict[str, str] = None
+    adapters: dict[str, str] = None
     max_prefill_token_num: int = 4096
     thread_safe: bool = False
     enable_prefix_caching: bool = False
     device_type: str = 'cuda'
     eager_mode: bool = False
-    custom_module_map: Dict[str, str] = None
+    custom_module_map: dict[str, str] = None
     download_dir: str = None
     revision: str = None
     quant_policy: Literal[0, 4, 8] = 0
@@ -406,7 +407,7 @@ class PytorchEngineConfig:
     mp_engine_backend: str = 'mp'
     model_format: str = None
     enable_metrics: bool = True
-    hf_overrides: Optional[Dict[str, Any]] = None
+    hf_overrides: dict[str, Any] | None = None
     disable_vision_encoder: bool = False
     logprobs_mode: str = None
     # router replay
@@ -474,23 +475,20 @@ class Response:
         generate_token_len: the response token length.
         input_token_len: the input prompt token length. Note that it may
             contains chat template part.
-        session_id: the id for running the session.
         finish_reason: the reason the model stopped
             generating tokens. This will be 'stop' if the model hit a natural
             stop point or a provided stop sequence, 'length' if the maximum
             number of tokens specified in the request was reached.
-        token_ids:: the output token ids.
-        logprobs:: the top logprobs for each output
-            position.
-        index: it refers to the position index of the input request
-            batch
+        token_ids: the output token ids.
+        logprobs: the top logprobs for each output position.
+        index: it refers to the position index of the input request batch.
     """
     text: str
     generate_token_len: int
     input_token_len: int
-    finish_reason: Optional[Literal['stop', 'length']] = None
-    token_ids: List[int] = field(default_factory=list)
-    logprobs: List[Dict[int, float]] = None
+    finish_reason: Literal['stop', 'length'] | None = None
+    token_ids: list[int] = field(default_factory=list)
+    logprobs: list[dict[int, float]] = None
     logits: torch.Tensor = None
     last_hidden_state: torch.Tensor = None
     index: int = 0
@@ -511,7 +509,7 @@ def _format_none_text_fields(self):
         fields.append(f'logprobs={self.logprobs}')
 
         # Helper function to format tensor information
-        def _format_tensor(name: str, tensor: Optional[torch.Tensor]) -> List[str]:
+        def _format_tensor(name: str, tensor: torch.Tensor | None) -> list[str]:
             if tensor is None:
                 return [f'{name}=None']
             try:
@@ -580,7 +578,7 @@ class EngineEvent:
     timestamp: float
 
     @classmethod
-    def new_event(cls, event_type: EventType, timestamp: Optional[float] = None) -> 'EngineEvent':
+    def new_event(cls, event_type: EventType, timestamp: float | None = None) -> 'EngineEvent':
         # Timestamps MUST use wall-clock time (time.time()) to maintain consistency
         # between csrc(std::chrono::system_clock) and python
         timestamp = time.time() if timestamp is None else timestamp
@@ -604,11 +602,11 @@ class RequestMetrics:
 
     Attributes:
         token_timestamp: A wall-clock time when a token is generated.
-        engine_events: List of engine events during inference.
+        engine_events: list of engine events during inference.
     """
     token_timestamp: float = 0.0
-    engine_events: List[EngineEvent] = field(default_factory=list)
-    spec_info: Optional[Dict[str, Any]] = None
+    engine_events: list[EngineEvent] = field(default_factory=list)
+    spec_info: dict[str, Any] | None = None
 
 
 @dataclass
@@ -625,12 +623,12 @@ class EngineOutput:
         req_metrics: request metrics information
     """
     status: ResponseType
-    token_ids: List[int]
-    logprobs: List[Dict[int, float]] = None
+    token_ids: list[int]
+    logprobs: list[dict[int, float]] = None
     logits: torch.Tensor = None
     last_hidden_state: torch.Tensor = None
-    cache_block_ids: Optional[List[int]] = None
-    req_metrics: Optional[RequestMetrics] = None
+    cache_block_ids: list[int] | None = None
+    req_metrics: RequestMetrics | None = None
     routed_experts: torch.Tensor = None
 
 
diff --git a/lmdeploy/metrics/loggers.py b/lmdeploy/metrics/loggers.py
index ac63cab1c6..29411c7cd5 100644
--- a/lmdeploy/metrics/loggers.py
+++ b/lmdeploy/metrics/loggers.py
@@ -4,7 +4,6 @@
 import time
 from abc import ABC, abstractmethod
 from datetime import datetime
-from typing import List
 
 import numpy as np
 
@@ -346,11 +345,11 @@ def record_specdecode(self, stats: SpeculativeDecodingStats) -> None:
         pass
 
 
-def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
+def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]:
     """Builds a list of buckets with increasing powers of 10 multiplied by
     mantissa values until the value exceeds the specified maximum."""
     exponent = 0
-    buckets: List[int] = []
+    buckets: list[int] = []
     while True:
         for m in mantissa_lst:
             value = m * 10**exponent
@@ -361,7 +360,7 @@ def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]:
         exponent += 1
 
 
-def build_1_2_5_buckets(max_value: int) -> List[int]:
+def build_1_2_5_buckets(max_value: int) -> list[int]:
     """
     Example:
     >>> build_1_2_5_buckets(100)
diff --git a/lmdeploy/metrics/metrics_processor.py b/lmdeploy/metrics/metrics_processor.py
index dd8eaeb0c6..9059ca4a1d 100644
--- a/lmdeploy/metrics/metrics_processor.py
+++ b/lmdeploy/metrics/metrics_processor.py
@@ -11,7 +11,7 @@
 
 
 @singleton
-class MetricsProcessor():
+class MetricsProcessor:
     """Metrics processor."""
 
     def __init__(self):
diff --git a/lmdeploy/metrics/stats.py b/lmdeploy/metrics/stats.py
index 2b9367a94d..bd98bb14a0 100644
--- a/lmdeploy/metrics/stats.py
+++ b/lmdeploy/metrics/stats.py
@@ -3,7 +3,6 @@
 
 import time
 from dataclasses import dataclass
-from typing import List, Optional
 
 import numpy as np
 
@@ -108,7 +107,7 @@ def __repr__(self):
                 f'  latest_token_time={self.lastest_token_time:.6f},\n'
                 ')')
 
-    def update_from_events(self, engine_events: List[EngineEvent]):
+    def update_from_events(self, engine_events: list[EngineEvent]):
         # avoid circular dependency
         from lmdeploy.messages import EventType
 
@@ -174,9 +173,9 @@ def __init__(self):
         self.iteration_timestamp = time.time()
         self.new_generation_tokens = 0
         self.prompt_tokens = 0
-        self.ttft: Optional[float] = None
-        self.tpot: Optional[float] = None
-        self.itl: Optional[float] = None
+        self.ttft: float | None = None
+        self.tpot: float | None = None
+        self.itl: float | None = None
 
     def __repr__(self):
         return ('IterationStats(\n'
diff --git a/lmdeploy/model.py b/lmdeploy/model.py
index 981e4b80b1..dbd8939ecf 100644
--- a/lmdeploy/model.py
+++ b/lmdeploy/model.py
@@ -2,7 +2,7 @@
 import dataclasses
 import json
 import uuid
-from typing import List, Literal, Optional, Union
+from typing import Literal
 
 from mmengine import Registry
 
@@ -18,7 +18,7 @@ def random_uuid() -> str:
     return str(uuid.uuid4().hex)
 
 
-def get_text(content: Union[str, List[dict]]):
+def get_text(content: str | list[dict]):
     """Within the OpenAI API, the content field may be specified as either a
     string or a list of ChatCompletionContentPartTextParam (defined in openai).
 
@@ -36,34 +36,37 @@ class ChatTemplateConfig:
     """Parameters for chat template.
 
     Args:
-        model_name (str): the name of the deployed model. Determine which chat template will be applied.
-            All the chat template names: `lmdeploy list`
-        system (str | None): begin of the system prompt
-        meta_instruction (str | None): system prompt
-        eosys (str | None): end of the system prompt
-        user (str | None): begin of the user prompt
-        eoh (str | None): end of the user prompt
-        assistant (str | None): begin of the assistant prompt
-        eoa (str | None): end of the assistant prompt
-        tool (str | None): begin of the tool prompt
-        eotool (str | None): end of the tool prompt
-        capability: ('completion' | 'infilling' | 'chat' | 'python') = None
-    """  # noqa: E501
+        model_name: the name of the deployed model. Determines which chat template will be applied.
+            All the chat template names: ``lmdeploy list``
+        system: begin of the system prompt.
+        meta_instruction: system prompt.
+        eosys: end of the system prompt.
+        user: begin of the user prompt.
+        eoh: end of the user prompt.
+        assistant: begin of the assistant prompt.
+        eoa: end of the assistant prompt.
+        tool: begin of the tool prompt.
+        eotool: end of the tool prompt.
+        capability: the capability of the model, one of
+            ``'completion'``, ``'infilling'``, ``'chat'``, ``'python'``.
+            Default to None.
+        stop_words: list of stop words. Default to None.
+    """
 
     model_name: str
-    model_path: Optional[str] = None
-    system: Optional[str] = None
-    meta_instruction: Optional[str] = None
-    eosys: Optional[str] = None
-    user: Optional[str] = None
-    eoh: Optional[str] = None
-    assistant: Optional[str] = None
-    eoa: Optional[str] = None
-    tool: Optional[str] = None
-    eotool: Optional[str] = None
-    separator: Optional[str] = None
-    capability: Optional[Literal['completion', 'infilling', 'chat', 'python']] = None
-    stop_words: Optional[List[str]] = None
+    model_path: str | None = None
+    system: str | None = None
+    meta_instruction: str | None = None
+    eosys: str | None = None
+    user: str | None = None
+    eoh: str | None = None
+    assistant: str | None = None
+    eoa: str | None = None
+    tool: str | None = None
+    eotool: str | None = None
+    separator: str | None = None
+    capability: Literal['completion', 'infilling', 'chat', 'python'] | None = None
+    stop_words: list[str] | None = None
 
     @property
     def chat_template(self):
@@ -91,12 +94,12 @@ def from_json(cls, file_or_string):
         """Construct a dataclass instance from a JSON file or JSON string."""
         try:
             # Try to open the input_data as a file path
-            with open(file_or_string, 'r', encoding='utf-8') as file:
+            with open(file_or_string, encoding='utf-8') as file:
                 json_data = file.read()
         except FileNotFoundError:
             # If it's not a file path, assume it's a JSON string
             json_data = file_or_string
-        except IOError:
+        except OSError:
             # If it's not a file path and not a valid JSON string, raise error
             raise ValueError('Invalid input. Must be a file path or a valid JSON string.')
         json_data = json.loads(json_data)
@@ -169,7 +172,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs):
         chat template.
 
         Args:
-            messages (str | List): user's input prompt
+            messages (str | list): user's input prompt
         Returns:
             str: the concatenated prompt
         """
@@ -191,7 +194,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs):
         return ret
 
     @classmethod
-    def match(cls, model_path: str) -> Optional[str]:
+    def match(cls, model_path: str) -> str | None:
         """Return the model_name that was registered to MODELS.
 
         Args:
@@ -225,7 +228,7 @@ def __init__(self,
                          **kwargs)
 
     @classmethod
-    def match(cls, model_path: str) -> Optional[str]:
+    def match(cls, model_path: str) -> str | None:
         """Return the model_name that was registered to MODELS.
 
         Args:
@@ -270,7 +273,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs):
         return super().messages2prompt(messages, sequence_start, **kwargs)[:-1]
 
     @classmethod
-    def match(cls, model_path: str) -> Optional[str]:
+    def match(cls, model_path: str) -> str | None:
         """Return the model_name that was registered to MODELS.
 
         Args:
@@ -294,7 +297,7 @@ def __init__(
         super().__init__(meta_instruction=meta_instruction, **kwargs)
 
     @classmethod
-    def match(cls, model_path: str) -> Optional[str]:
+    def match(cls, model_path: str) -> str | None:
         """Return the model_name that was registered to MODELS.
 
         Args:
@@ -339,7 +342,7 @@ def __init__(
                          **kwargs)
 
     @classmethod
-    def match(cls, model_path: str) -> Optional[str]:
+    def match(cls, model_path: str) -> str | None:
         """Return the model_name that was registered to MODELS.
 
         Args:
@@ -360,7 +363,7 @@ def __init__(self, user='<reserved_106>', assistant='<reserved_107>', **kwargs):
         super().__init__(user=user, assistant=assistant, **kwargs)
 
     @classmethod
-    def match(cls, model_path: str) -> Optional[str]:
+    def match(cls, model_path: str) -> str | None:
         """Return the model_name that was registered to MODELS.
 
         Args:
@@ -398,7 +401,7 @@ def __init__(
                          **kwargs)
 
     @classmethod
-    def match(cls, model_path: str) -> Optional[str]:
+    def match(cls, model_path: str) -> str | None:
         """Return the model_name that was registered to MODELS.
 
         Args:
@@ -443,7 +446,7 @@ def _infill_prompt(self, prompt):
         return prompt
 
     @classmethod
-    def match(cls, model_path: str) -> Optional[str]:
+    def match(cls, model_path: str) -> str | None:
         """Return the model_name that was registered to MODELS.
 
         Args:
@@ -494,7 +497,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs):
         return ret
 
     @classmethod
-    def match(cls, model_path: str) -> Optional[str]:
+    def match(cls, model_path: str) -> str | None:
         """Return the model_name that was registered to MODELS.
 
         Args:
@@ -517,7 +520,7 @@ def __init__(self, user='[INST] ', eoh=' [/INST]', eoa='</s>', **kwargs):
         super().__init__(user=user, eoh=eoh, eoa=eoa, **kwargs)
 
     @classmethod
-    def match(cls, model_path: str) -> Optional[str]:
+    def match(cls, model_path: str) -> str | None:
         """Return the model_name that was registered to MODELS.
 
         Args:
@@ -548,7 +551,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs):
         return super().messages2prompt(messages, sequence_start, **kwargs)[:-1]
 
     @classmethod
-    def match(cls, model_path: str) -> Optional[str]:
+    def match(cls, model_path: str) -> str | None:
         """Return the model_name that was registered to MODELS.
 
         Args:
@@ -590,7 +593,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs):
         return super().messages2prompt(messages, sequence_start, **kwargs)[:-1]
 
     @classmethod
-    def match(cls, model_path: str) -> Optional[str]:
+    def match(cls, model_path: str) -> str | None:
         """Return the model_name that was registered to MODELS.
 
         Args:
@@ -629,7 +632,7 @@ def messages2prompt(self, messages, sequence_start=True, **kwargs):
         return super().messages2prompt(messages, sequence_start, **kwargs)[:-1]
 
     @classmethod
-    def match(cls, model_path: str) -> Optional[str]:
+    def match(cls, model_path: str) -> str | None:
         """Return the model_name that was registered to MODELS.
 
         Args:
@@ -664,7 +667,7 @@ def __init__(self,
                          **kwargs)
 
     @classmethod
-    def match(cls, model_path: str) -> Optional[str]:
+    def match(cls, model_path: str) -> str | None:
         """Return the model_name that was registered to MODELS.
 
         Args:
@@ -787,7 +790,7 @@ def _system_instruction(self):
             return None, None, [], self.tokenizer.bos_token or ''
 
     @classmethod
-    def match(cls, model_path: str) -> Optional[str]:
+    def match(cls, model_path: str) -> str | None:
         try:
             cls(model_path)
         except Exception:
@@ -795,12 +798,12 @@ def match(cls, model_path: str) -> Optional[str]:
         return True
 
 
-def get_chat_template(model_path: str, config: Optional[ChatTemplateConfig] = None) -> BaseChatTemplate:
+def get_chat_template(model_path: str, config: ChatTemplateConfig | None = None) -> BaseChatTemplate:
     """Get the chat template for the model.
 
     Args:
         model_path (str): the model path.
-        config (Optional[ChatTemplateConfig]): the chat template config.
+        config (ChatTemplateConfig | None): the chat template config.
     Returns:
         BaseChatTemplate: the chat template.
     """
diff --git a/lmdeploy/pipeline.py b/lmdeploy/pipeline.py
index ab1902e134..40238f5653 100644
--- a/lmdeploy/pipeline.py
+++ b/lmdeploy/pipeline.py
@@ -1,13 +1,16 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+from __future__ import annotations
+
 import asyncio
 import atexit
 import concurrent.futures
 import os
+from collections.abc import Iterator
 from contextlib import closing
 from functools import partial
 from queue import Queue
 from threading import Thread
-from typing import TYPE_CHECKING, Dict, Iterator, List, Tuple
+from typing import TYPE_CHECKING
 
 import torch
 import tqdm
@@ -81,8 +84,8 @@ def __init__(self,
         self.async_engine.start_loop(self.internal_thread.loop, use_async_api=False)
 
     def infer(self,
-              prompts: List[str] | str | List[Dict] | List[List[Dict]] | Tuple | List[Tuple],
-              gen_config: GenerationConfig | List[GenerationConfig] | None = None,
+              prompts: list[str] | str | list[dict] | list[list[dict]] | tuple | list[tuple],
+              gen_config: GenerationConfig | list[GenerationConfig] | None = None,
               do_preprocess: bool = True,
               adapter_name: str | None = None,
               use_tqdm: bool = False,
@@ -90,13 +93,16 @@ def infer(self,
         """Inference prompts.
 
         Args:
-            prompts: Prompts to inference. It can be a single prompt, a list of prompts, a list of tuples, or a tuple.
-                Tuple can be (prompt, image or [images]) or (image or [images], prompt).
-            gen_config(GenerationConfig | List[GenerationConfig] | None): Generation configuration(s).
-            do_preprocess(bool): Whether to pre-process messages.
-            adapter_name(str | None): Adapter name.
-            use_tqdm(bool): Whether to use progress bar.
-            **kwargs(dict): Additional keyword arguments.
+            prompts: Prompts for inference. It can be a single prompt, a list of prompts, a list of tuples, or a tuple.
+                A tuple can be (prompt, image or [images]) or (image or [images], prompt).
+            gen_config: Generation configuration(s).
+            do_preprocess: Whether to pre-process messages.
+            adapter_name: Adapter name.
+            use_tqdm: Whether to use progress bar.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            Response | list[Response]: A single response or a list of responses.
         """
         is_single = self._is_single(prompts)
         # format prompts to openai message format, which is a list of dicts
@@ -126,9 +132,9 @@ def batch_infer(self, *args, **kwargs):
         return self.infer(*args, **kwargs)
 
     def stream_infer(self,
-                     prompts: List[str] | str | List[Dict] | List[List[Dict]] | Tuple | List[Tuple],
-                     sessions: 'Session' | List['Session'] | None = None,
-                     gen_config: GenerationConfig | List[GenerationConfig] | None = None,
+                     prompts: list[str] | str | list[dict] | list[list[dict]] | tuple | list[tuple],
+                     sessions: Session | list[Session] | None = None,
+                     gen_config: GenerationConfig | list[GenerationConfig] | None = None,
                      do_preprocess: bool = True,
                      adapter_name: str | None = None,
                      stream_response: bool = True,
@@ -136,20 +142,19 @@ def stream_infer(self,
         """Stream inference.
 
         Args:
-            prompts(List[str] | str | List[Dict] | List[List[Dict]] | Tuple | List[Tuple]): Prompts to inference.
-                It can be a single prompt, a list of prompts, a list of tuples, or a tuple.
-                Tuple can be (prompt, image or [images]) or (image or [images], prompt).
-            sessions(Session | List[Session] | None): Sessions. Each of which corresponds to a prompt.
-            gen_config(GenerationConfig | List[GenerationConfig] | None): Generation configuration(s).
-            do_preprocess(bool): Whether to pre-process messages.
-            adapter_name(str | None): Adapter name.
-            stream_response(bool): Whether to stream the response. If True, the generator will stream the response.
+            prompts: Prompts for inference. It can be a single prompt, a list of prompts, a list of tuples, or a tuple.
+                A tuple can be (prompt, image or [images]) or (image or [images], prompt).
+            sessions: Sessions. Each of which corresponds to a prompt.
+            gen_config: Generation configuration(s).
+            do_preprocess: Whether to pre-process messages.
+            adapter_name: Adapter name.
+            stream_response: Whether to stream the response. If True, the generator will stream the response.
                 Otherwise, the generator will run until finish and return the final response. This argument
                 is introduced to support the streaming and non-streaming modes of Pipeline.chat.
-            **kwargs(dict): Additional keyword arguments.
+            **kwargs: Additional keyword arguments.
 
         Returns:
-            Generator: A generator that yields the output (i.e. instance of class `Response`) of the inference.
+            Iterator: A generator that yields the output (i.e. instance of class ``Response``) of the inference.
         """
         prompts = MultimodalProcessor.format_prompts(prompts)
         requests = self._request_generator(prompts,
@@ -167,22 +172,24 @@ def close(self):
         self.async_engine.close()
 
     def chat(self,
-             prompt: str | Tuple[str, 'Image' | List['Image']],
+             prompt: str | tuple[str, Image | list[Image]],
              session=None,
              gen_config: GenerationConfig | None = None,
              stream_response=False,
              adapter_name=None,
-             **kwargs) -> 'Session' | Iterator:
+             **kwargs) -> Session | Iterator:
         """Chat.
 
         Args:
-            prompt (str): prompt
-            session (Session): the chat session
-            gen_config (GenerationConfig | None): a instance of
-                GenerationConfig. Default to None.
-            stream_response (bool): whether to stream the response.
-            adapter_name (str): adapter name.
-            **kwargs (dict): additional keyword arguments.
+            prompt: prompt string or a tuple of (prompt, image or [images]).
+            session: the chat session.
+            gen_config: an instance of GenerationConfig. Default to None.
+            stream_response: whether to stream the response.
+            adapter_name: adapter name.
+            **kwargs: additional keyword arguments.
+
+        Returns:
+            Session | Iterator: the updated session, or a streaming iterator if stream_response is True.
         """
         if session is None:
             session = self.session_mgr.get()
@@ -227,25 +234,26 @@ def _gen():
 
         return session
 
-    def session(self) -> 'Session':
+    def session(self) -> Session:
         """Create a new session."""
         return self.session_mgr.get()
 
-    def get_reward_score(self, input_ids: List) -> List[float]:
+    def get_reward_score(self, input_ids: list) -> list[float]:
         """Get reward score.
 
         Args:
-            input_ids(List): a list of token_id or a list of token_id list or token_id tensor
-        Return:
-            reward score in a list. If the input_ids is a list of token_id, the return value
-            is still a list with length 1.
+            input_ids: a list of token_id or a list of token_id list or token_id tensor.
+
+        Returns:
+            list[float]: reward score in a list. If the input_ids is a list of token_id,
+                the return value is still a list with length 1.
         """
         supported_reward_models = ['InternLM2ForRewardModel', 'Qwen2ForRewardModel']
         arch = self.async_engine.arch
         if arch not in supported_reward_models:
             raise ValueError(f'{arch} is not in reward model list: {supported_reward_models}')
-        assert isinstance(input_ids, List)
-        assert all(isinstance(x, int) for x in input_ids) or all(isinstance(x, List) for x in input_ids)
+        assert isinstance(input_ids, list)
+        assert all(isinstance(x, int) for x in input_ids) or all(isinstance(x, list) for x in input_ids)
         # Make input_ids a list of token_id list
         input_ids = [input_ids] if isinstance(input_ids[0], int) else input_ids
         logits = self._run(coro=self.async_engine.async_get_logits(input_ids=input_ids)).result()
@@ -253,17 +261,17 @@ def get_reward_score(self, input_ids: List) -> List[float]:
         scores = [x[-1].cpu().item() for x in logits]
         return scores
 
-    def get_ppl(self, input_ids: List[int] | List[List[int]]) -> List[float]:
+    def get_ppl(self, input_ids: list[int] | list[list[int]]) -> list[float]:
         """Get perplexity scores given a list of input tokens that have to be
         of the same length.
 
         Args:
-            input_ids (List[int] | List[List[int]]): the batch of input token ids
+            input_ids: the batch of input token ids.
 
         Returns:
-            List[float]: A list of perplexity scores.
+            list[float]: A list of perplexity scores.
         """
-        assert isinstance(input_ids, List)
+        assert isinstance(input_ids, list)
         if isinstance(input_ids[0], int):
             input_ids = [input_ids]
         assert all(len(_) > 1 for _ in input_ids)
@@ -304,8 +312,8 @@ def get_ppl(self, input_ids: List[int] | List[List[int]]) -> List[float]:
         return output
 
     def __call__(self,
-                 prompts: List[str] | str | List[Dict] | List[List[Dict]],
-                 gen_config: GenerationConfig | List[GenerationConfig] | None = None,
+                 prompts: list[str] | str | list[dict] | list[list[dict]],
+                 gen_config: GenerationConfig | list[GenerationConfig] | None = None,
                  **kwargs):
         return self.infer(prompts, gen_config=gen_config, **kwargs)
 
@@ -328,12 +336,12 @@ async def generate(self, *args, **kwargs):
     def _is_single(prompts):
         """Check if prompts is a single prompt."""
         return (isinstance(prompts, str) or (isinstance(prompts, tuple) and len(prompts) == 2)
-                or (isinstance(prompts, list) and len(prompts) > 0 and isinstance(prompts[0], Dict)))
+                or (isinstance(prompts, list) and len(prompts) > 0 and isinstance(prompts[0], dict)))
 
     def _request_generator(self,
-                           prompts: List[str] | str | List[Dict] | List[List[Dict]],
-                           sessions: List['Session'] | 'Session' | None = None,
-                           gen_config: GenerationConfig | List[GenerationConfig] | None = None,
+                           prompts: list[str] | str | list[dict] | list[list[dict]],
+                           sessions: list[Session] | Session | None = None,
+                           gen_config: GenerationConfig | list[GenerationConfig] | None = None,
                            **kwargs):
         """Generate requests."""
         is_single = self._is_single(prompts)
@@ -372,7 +380,7 @@ def _get_limiter(self):
             self.limiter = asyncio.Semaphore(self.backend_config.max_batch_size)
         return self.limiter
 
-    def _infer(self, requests: Iterator[Dict], multiplex: bool, pbar=None, loop=None) -> Iterator[Iterator[Response]]:
+    def _infer(self, requests: Iterator[dict], multiplex: bool, pbar=None, loop=None) -> Iterator[Iterator[Response]]:
 
         async def _sync_resp(g, que: Queue, idx: int, sem: asyncio.Semaphore):
             async for out in g:
@@ -470,13 +478,13 @@ def _get_long_text_ppl(self, session, input_ids, max_input_len):
         return loss_sum / target_count
 
     def _get_ppl(self,
-                 sessions: List['Session'],
-                 input_ids: List[List[int]],
+                 sessions: list[Session],
+                 input_ids: list[list[int]],
                  max_input_len: int,
                  target_ids=None,
                  sequence_start: bool = True,
                  sequence_end: bool = True):
-        assert (isinstance(input_ids, List) and all(isinstance(_, List) for _ in input_ids))
+        assert (isinstance(input_ids, list) and all(isinstance(_, list) for _ in input_ids))
         assert target_ids is None or len(target_ids) == len(input_ids)
         assert len(sessions) == len(input_ids)
 
diff --git a/lmdeploy/profiler.py b/lmdeploy/profiler.py
index 51895d8ddf..d687d5a485 100644
--- a/lmdeploy/profiler.py
+++ b/lmdeploy/profiler.py
@@ -2,7 +2,6 @@
 import csv
 import os
 import time
-from typing import List
 
 import numpy as np
 
@@ -30,8 +29,8 @@ def finish(self, status):
 
 class Profiler:
 
-    def __init__(self, stream_output: bool, percentages: List[int]):
-        self.sessions: List[Session] = []
+    def __init__(self, stream_output: bool, percentages: list[int]):
+        self.sessions: list[Session] = []
         self.stream_output = stream_output
         self.percentages = percentages
 
@@ -47,11 +46,11 @@ def finish(self):
         self.elapsed_time = time.perf_counter() - self.t_start
 
     def compute_metrics(self):
-        self.ttfts: List[float] = []
-        self.tpots: List[float] = []
-        self.e2es: List[float] = []
-        self.itls: List[float] = []
-        self.tpts: List[int] = []
+        self.ttfts: list[float] = []
+        self.tpots: list[float] = []
+        self.e2es: list[float] = []
+        self.itls: list[float] = []
+        self.tpts: list[int] = []
         self.total_output = 0
         self.total_input = 0
         self.success = 0
@@ -103,7 +102,7 @@ def compute_metrics(self):
 
         self.rps = self.success / self.elapsed_time
 
-    def summarize(self, title: str, hyperparams: List = None, header=40, digits=10):
+    def summarize(self, title: str, hyperparams: list = None, header=40, digits=10):
 
         width = header + digits * (1 + len(self.percentages))
 
diff --git a/lmdeploy/pytorch/adapter/adapter.py b/lmdeploy/pytorch/adapter/adapter.py
index 8eb102ba9a..ca93b5fc89 100644
--- a/lmdeploy/pytorch/adapter/adapter.py
+++ b/lmdeploy/pytorch/adapter/adapter.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
 import re
-from typing import Dict, Iterable, List, Tuple
+from collections.abc import Iterable
 
 import torch
 from torch import nn
@@ -70,7 +70,7 @@ def _get_reverse_pack_map(model: nn.Module):
     return reverse_map
 
 
-def _get_key_map(reverse_map: Dict[str, str]):
+def _get_key_map(reverse_map: dict[str, str]):
     """Get key map."""
     key_map = dict()
     for name, pack_name in reverse_map.items():
@@ -81,7 +81,7 @@ def _get_key_map(reverse_map: Dict[str, str]):
     return key_map
 
 
-def load_lora_weights(model: nn.Module, weights: Iterable[Tuple[str, torch.Tensor]], adapter_id: int):
+def load_lora_weights(model: nn.Module, weights: Iterable[tuple[str, torch.Tensor]], adapter_id: int):
     """Load lora weights."""
     from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
     prefix_len = len('base_model.model.')
@@ -111,7 +111,7 @@ def load_lora_weights(model: nn.Module, weights: Iterable[Tuple[str, torch.Tenso
 class AdapterManager:
     """Adapter manager."""
 
-    def __init__(self, adapters: Dict[str, str]):
+    def __init__(self, adapters: dict[str, str]):
         if adapters is None:
             adapters = dict()
 
@@ -122,7 +122,7 @@ def __init__(self, adapters: Dict[str, str]):
         adapter_id_map = dict(zip(adapter_names, range(len(adapter_names))))
         self.adapter_id_map = adapter_id_map
 
-    def get_adapter_ids(self, names: List[str]):
+    def get_adapter_ids(self, names: list[str]):
         return [self.adapter_id_map[name] for name in names]
 
     def num_adapters(self):
diff --git a/lmdeploy/pytorch/backends/awq_modules.py b/lmdeploy/pytorch/backends/awq_modules.py
index 1a9815c423..02bdcb0069 100644
--- a/lmdeploy/pytorch/backends/awq_modules.py
+++ b/lmdeploy/pytorch/backends/awq_modules.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from abc import ABC, abstractmethod
-from typing import Optional
 
 import torch
 
@@ -12,7 +11,7 @@ def update_weights(self,
                        qweight: torch.Tensor,
                        scales: torch.Tensor,
                        qzeros: torch.Tensor,
-                       bias: Optional[torch.Tensor] = None):
+                       bias: torch.Tensor | None = None):
         """Update weights."""
         return qweight, scales, qzeros, bias
 
@@ -20,9 +19,9 @@ def update_weights(self,
     def forward(self,
                 x,
                 weight: torch.Tensor,
-                bias: Optional[torch.Tensor] = None,
+                bias: torch.Tensor | None = None,
                 all_reduce: bool = False,
-                group: Optional[torch.distributed.ProcessGroup] = None):
+                group: torch.distributed.ProcessGroup | None = None):
         """forward."""
         raise NotImplementedError
 
diff --git a/lmdeploy/pytorch/backends/base.py b/lmdeploy/pytorch/backends/base.py
index 603448c6c8..590727ff53 100644
--- a/lmdeploy/pytorch/backends/base.py
+++ b/lmdeploy/pytorch/backends/base.py
@@ -3,7 +3,6 @@
 # https://github.com/vllm-project/vllm/blob/main/vllm/attention/backends/abstract.py
 from abc import ABC, abstractmethod
 from enum import Enum, auto
-from typing import Tuple
 
 import torch
 
@@ -70,7 +69,7 @@ def get_k_block_shape(
         num_heads: int,
         head_size: int,
         dtype: torch.dtype,
-    ) -> Tuple[int, ...]:
+    ) -> tuple[int, ...]:
         """Get block shape of k."""
         raise NotImplementedError
 
@@ -81,7 +80,7 @@ def get_v_block_shape(
         num_heads: int,
         head_size: int,
         dtype: torch.dtype,
-    ) -> Tuple[int, ...]:
+    ) -> tuple[int, ...]:
         """Get block shape of v."""
         raise NotImplementedError
 
diff --git a/lmdeploy/pytorch/backends/blockedf8_modules.py b/lmdeploy/pytorch/backends/blockedf8_modules.py
index 0d7a5e422a..dd3f360f80 100644
--- a/lmdeploy/pytorch/backends/blockedf8_modules.py
+++ b/lmdeploy/pytorch/backends/blockedf8_modules.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from abc import ABC, abstractmethod
-from typing import List, Optional
 
 import torch
 import torch.distributed as dist
@@ -10,13 +9,13 @@ class LinearBlockedF8Impl(ABC):
     """Linear BlockedF8 implementation api."""
 
     def __init__(self):
-        self.scale_fmt: Optional[str] = None
+        self.scale_fmt: str | None = None
 
-    def update_weights(self, weight: torch.Tensor, scale: torch.Tensor, bias: Optional[torch.Tensor] = None):
+    def update_weights(self, weight: torch.Tensor, scale: torch.Tensor, bias: torch.Tensor | None = None):
         """Update weights."""
         return weight, scale, bias
 
-    def set_scale_fmt(self, scale_fmt: Optional[str]):
+    def set_scale_fmt(self, scale_fmt: str | None):
         """Set scale fmt."""
         self.scale_fmt = scale_fmt
 
@@ -25,11 +24,11 @@ def forward(self,
                 x,
                 weight: torch.Tensor,
                 scale: torch.Tensor,
-                bias: Optional[torch.Tensor] = None,
+                bias: torch.Tensor | None = None,
                 all_reduce: bool = False,
-                group: Optional[dist.ProcessGroup] = None,
+                group: dist.ProcessGroup | None = None,
                 rank: int = 0,
-                scatter_size: List[int] = None):
+                scatter_size: list[int] = None):
         """forward."""
         raise NotImplementedError
 
diff --git a/lmdeploy/pytorch/backends/cuda/attention/default.py b/lmdeploy/pytorch/backends/cuda/attention/default.py
index aca4510e5b..26886b5ce5 100644
--- a/lmdeploy/pytorch/backends/cuda/attention/default.py
+++ b/lmdeploy/pytorch/backends/cuda/attention/default.py
@@ -98,8 +98,12 @@ def __init__(
         self.logit_softcapping = -1 if self.logit_softcapping <= 0.0 else self.logit_softcapping
         assert not (alibi and not causal)
 
-        from lmdeploy.pytorch.kernels.cuda import (fill_kv_cache, flash_attn_varlen_func, flash_attn_with_kvcache,
-                                                   flatten_kv_cache)
+        from lmdeploy.pytorch.kernels.cuda import (
+            fill_kv_cache,
+            flash_attn_varlen_func,
+            flash_attn_with_kvcache,
+            flatten_kv_cache,
+        )
 
         self.fill_kv_cache = fill_kv_cache
         self.paged_attention_fwd = flash_attn_with_kvcache
diff --git a/lmdeploy/pytorch/backends/cuda/attention/mla.py b/lmdeploy/pytorch/backends/cuda/attention/mla.py
index 624e8f169f..6381e843ef 100644
--- a/lmdeploy/pytorch/backends/cuda/attention/mla.py
+++ b/lmdeploy/pytorch/backends/cuda/attention/mla.py
@@ -74,7 +74,7 @@ def update_prefill(self, nsa_indices: torch.Tensor, q_seqlens: torch.Tensor, cu_
         return self._update_prefill_func(nsa_indices, q_seqlens, cu_seqlens_k)
 
     @staticmethod
-    @functools.lru_cache(maxsize=None)
+    @functools.cache
     def build():
         return NSAIndicesUpdater()
 
diff --git a/lmdeploy/pytorch/backends/cuda/awq_modules.py b/lmdeploy/pytorch/backends/cuda/awq_modules.py
index 01516a8aca..667ff77f6c 100644
--- a/lmdeploy/pytorch/backends/cuda/awq_modules.py
+++ b/lmdeploy/pytorch/backends/cuda/awq_modules.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Optional
 
 import torch
 
@@ -54,9 +53,9 @@ def forward(self,
                 qweight: torch.Tensor,
                 scales: torch.Tensor,
                 qzeros: torch.Tensor,
-                bias: Optional[torch.Tensor] = None,
+                bias: torch.Tensor | None = None,
                 all_reduce: bool = False,
-                group: Optional[torch.distributed.ProcessGroup] = None):
+                group: torch.distributed.ProcessGroup | None = None):
         """forward."""
         out_features = scales.size(1)
         out = wq_gemm_forward(x, qweight, qzeros, scales, self.w_bit, self.group_size, bias, out_features)
diff --git a/lmdeploy/pytorch/backends/cuda/blockedf8_modules.py b/lmdeploy/pytorch/backends/cuda/blockedf8_modules.py
index ed8715edca..463d26bb69 100644
--- a/lmdeploy/pytorch/backends/cuda/blockedf8_modules.py
+++ b/lmdeploy/pytorch/backends/cuda/blockedf8_modules.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import List, Optional
 
 import torch
 
@@ -27,11 +26,11 @@ def forward(self,
                 x,
                 weight: torch.Tensor,
                 scale: torch.Tensor,
-                bias: Optional[torch.Tensor] = None,
+                bias: torch.Tensor | None = None,
                 all_reduce: bool = False,
-                group: Optional[dist.ProcessGroup] = None,
+                group: dist.ProcessGroup | None = None,
                 rank: int = 0,
-                scatter_size: List[int] = None):
+                scatter_size: list[int] = None):
         """forward."""
         x_shape = x.shape
         x = x.flatten(0, -2)
@@ -113,11 +112,11 @@ def forward(self,
                 x,
                 weight: torch.Tensor,
                 scale: torch.Tensor,
-                bias: Optional[torch.Tensor] = None,
+                bias: torch.Tensor | None = None,
                 all_reduce: bool = False,
-                group: Optional[dist.ProcessGroup] = None,
+                group: dist.ProcessGroup | None = None,
                 rank: int = 0,
-                scatter_size: List[int] = None):
+                scatter_size: list[int] = None):
         """forward."""
         x_shape = x.shape
         x = x.flatten(0, -2)
diff --git a/lmdeploy/pytorch/backends/cuda/graph_runner.py b/lmdeploy/pytorch/backends/cuda/graph_runner.py
index 58d093cf9b..9e11444887 100644
--- a/lmdeploy/pytorch/backends/cuda/graph_runner.py
+++ b/lmdeploy/pytorch/backends/cuda/graph_runner.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import functools
-from typing import Any, Dict, List, Tuple
+from typing import Any
 
 import torch
 from torch.profiler import record_function
@@ -66,7 +66,7 @@ def __init__(
         max_tokens: int,
         num_blocks: int,
         is_decoding: bool,
-        pool: Tuple[int, int],
+        pool: tuple[int, int],
         model_config: ModelConfig,
         device: torch.device,
         decode_query_len: int = 1,
@@ -153,7 +153,7 @@ def __init__(self, model: torch.nn.Module, model_config: ModelConfig, cache_conf
         self.enable_graph = self.check_enable_graph()
 
         self.graph_pool_handle = torch.cuda.graph_pool_handle()
-        self._runner_map: Dict[Any, CUDASingleGraphRunner] = dict()
+        self._runner_map: dict[Any, CUDASingleGraphRunner] = dict()
         self.has_try_compile_model: bool = False
 
         # strategy factory
@@ -187,7 +187,7 @@ def _get_capture_tokens(self, batch_size: int):
                 return size
         assert False, f'Unsupported batch_size={batch_size}'
 
-    def get_graph_key(self, input_ids: torch.Tensor, position_ids: torch.Tensor, past_key_values: List,
+    def get_graph_key(self, input_ids: torch.Tensor, position_ids: torch.Tensor, past_key_values: list,
                       attn_metadata: TritonAttentionMetadata, inputs_embeds: torch.Tensor, **kwargs):
         """Get graph key."""
         context = self.ctx_mgr.current_context()
@@ -261,7 +261,7 @@ def __call__(self, **kwargs):
     @record_function('prepare_inputs_for_generation')
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         inputs_embeds: torch.Tensor = None,
         context: StepContext = None,
     ):
@@ -303,6 +303,6 @@ def update_inputs(self, inputs):
             dp_meta.sync_tp_size(tp_size)
         return inputs
 
-    def get_capture_batch_sizes(self) -> List[int]:
+    def get_capture_batch_sizes(self) -> list[int]:
         """Capture batch sizes."""
         return _get_capture_batch_size_impl(self.cache_config.max_batches)
diff --git a/lmdeploy/pytorch/backends/cuda/moe/blocked_fp8.py b/lmdeploy/pytorch/backends/cuda/moe/blocked_fp8.py
index 8810f57a7e..997cb286d1 100644
--- a/lmdeploy/pytorch/backends/cuda/moe/blocked_fp8.py
+++ b/lmdeploy/pytorch/backends/cuda/moe/blocked_fp8.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Callable, List
+from collections.abc import Callable
 
 import torch
 import torch.distributed as dist
@@ -53,7 +53,7 @@ def forward(self,
                 down_scale: torch.Tensor,
                 gate_up_bias: torch.Tensor = None,
                 down_bias: torch.Tensor = None,
-                expert_list: List[int] = None,
+                expert_list: list[int] = None,
                 act_func: Callable = None):
         """forward."""
         input_size = hidden_states.shape
@@ -148,7 +148,7 @@ def forward(self,
                 down_scale: torch.Tensor,
                 gate_up_bias: torch.Tensor = None,
                 down_bias: torch.Tensor = None,
-                expert_list: List[int] = None,
+                expert_list: list[int] = None,
                 act_func: Callable = None,
                 **kwargs):
         """forward."""
diff --git a/lmdeploy/pytorch/backends/cuda/moe/default.py b/lmdeploy/pytorch/backends/cuda/moe/default.py
index ceef74a2b1..c47ba6edf8 100644
--- a/lmdeploy/pytorch/backends/cuda/moe/default.py
+++ b/lmdeploy/pytorch/backends/cuda/moe/default.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Callable, List, Optional
+from collections.abc import Callable
 
 import torch
 
@@ -47,7 +47,7 @@ def forward(self,
                 down_weights: torch.Tensor,
                 gate_up_bias: torch.Tensor = None,
                 down_bias: torch.Tensor = None,
-                expert_list: List[int] = None,
+                expert_list: list[int] = None,
                 act_func: Callable = None):
         """forward."""
         expert_offset = 0
@@ -103,7 +103,7 @@ def forward(
         topk_ids: torch.LongTensor,
         up_weights: torch.Tensor,
         down_weights: torch.Tensor,
-        expert_list: List[int] = None,
+        expert_list: list[int] = None,
     ):
         """forward."""
         from lmdeploy.pytorch.kernels.cuda.fused_moe_ep import fused_moe_v3
@@ -129,7 +129,7 @@ def dispatch_async(self,
                        x: torch.Tensor,
                        topk_idx: torch.Tensor,
                        topk_weights: torch.Tensor,
-                       num_experts: Optional[int] = None,
+                       num_experts: int | None = None,
                        previous_event=None,
                        async_finish=True):
         return self.token_dispatcher.dispatch_normal_async(x, topk_idx, topk_weights, num_experts, previous_event,
@@ -201,7 +201,7 @@ def dispatch_async_ll(
     self,
     hidden_states: torch.Tensor,
     topk_idx: torch.Tensor,
-    num_experts: Optional[int] = None,
+    num_experts: int | None = None,
     use_fp8: bool = True,
     async_finish: bool = True,
 ):
@@ -282,7 +282,7 @@ def forward(self,
                 topk_ids: torch.LongTensor,
                 up_weights: torch.Tensor,
                 down_weights: torch.Tensor,
-                expert_list: List[int] = None):
+                expert_list: list[int] = None):
         """forward."""
         recv_hidden_states, topk_idx, topk_weights, masked_m, expected_m = dispatch_ll(
             self.token_dispatcher,
@@ -304,7 +304,7 @@ def dispatch_async(
         self,
         hidden_states: torch.Tensor,
         topk_idx: torch.Tensor,
-        num_experts: Optional[int] = None,
+        num_experts: int | None = None,
         use_fp8: bool = False,
         async_finish: bool = True,
     ):
@@ -406,7 +406,7 @@ def forward(self,
                 down_weights: torch.Tensor,
                 gate_up_bias: torch.Tensor = None,
                 down_bias: torch.Tensor = None,
-                expert_list: List[int] = None,
+                expert_list: list[int] = None,
                 act_func: Callable = None):
         """forward."""
         assert act_func is None, 'Activation function is not supported in DeepEP MoE.'
diff --git a/lmdeploy/pytorch/backends/cuda/moe/ep_utils.py b/lmdeploy/pytorch/backends/cuda/moe/ep_utils.py
index f4c596a99c..68a2889b09 100644
--- a/lmdeploy/pytorch/backends/cuda/moe/ep_utils.py
+++ b/lmdeploy/pytorch/backends/cuda/moe/ep_utils.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import List
 
 import torch
 from torch import distributed as dist
@@ -34,7 +33,7 @@ def split_inputs_by_attn_tp(
     return hidden_states, topk_weights, topk_ids, split_size
 
 
-def gather_outputs_by_attn_tp(out_states: torch.Tensor, split_size: List[int]):
+def gather_outputs_by_attn_tp(out_states: torch.Tensor, split_size: list[int]):
     """Gather output by attn tp."""
     if split_size is None:
         return out_states
diff --git a/lmdeploy/pytorch/backends/cuda/moe/w8a8.py b/lmdeploy/pytorch/backends/cuda/moe/w8a8.py
index 19358f9751..d103d5f270 100644
--- a/lmdeploy/pytorch/backends/cuda/moe/w8a8.py
+++ b/lmdeploy/pytorch/backends/cuda/moe/w8a8.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import List
 
 import torch
 
@@ -43,7 +42,7 @@ def forward(self,
                 gate_up_scale: torch.Tensor,
                 down_weights: torch.Tensor,
                 down_scale: torch.Tensor,
-                expert_list: List[int] = None):
+                expert_list: list[int] = None):
         """forward."""
 
         if isinstance(hidden_states, torch.Tensor):
diff --git a/lmdeploy/pytorch/backends/cuda/moe_router.py b/lmdeploy/pytorch/backends/cuda/moe_router.py
index c0fbcd2a1a..a56f7d9df2 100644
--- a/lmdeploy/pytorch/backends/cuda/moe_router.py
+++ b/lmdeploy/pytorch/backends/cuda/moe_router.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Tuple
 
 import torch
 
@@ -57,7 +56,7 @@ def should_enable_custom_kernel(self) -> bool:
 
         return True
 
-    def forward(self, logits: torch.Tensor, bias: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    def forward(self, logits: torch.Tensor, bias: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
         """Router forward."""
         if self.enable_custom_kernel:
             return fused_noaux_tc_routing(
diff --git a/lmdeploy/pytorch/backends/cuda/op_backend.py b/lmdeploy/pytorch/backends/cuda/op_backend.py
index ff3075f255..853db099d9 100644
--- a/lmdeploy/pytorch/backends/cuda/op_backend.py
+++ b/lmdeploy/pytorch/backends/cuda/op_backend.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Tuple
 
 import torch
 
@@ -93,7 +92,7 @@ def get_k_block_shape(
         num_heads: int,
         head_size: int,
         dtype: torch.dtype,
-    ) -> Tuple[int, ...]:
+    ) -> tuple[int, ...]:
         """Get k block shape."""
         return (
             block_size,
@@ -107,7 +106,7 @@ def get_v_block_shape(
         num_heads: int,
         head_size: int,
         dtype: torch.dtype,
-    ) -> Tuple[int, ...]:
+    ) -> tuple[int, ...]:
         """Get v block shape."""
         return (
             block_size,
diff --git a/lmdeploy/pytorch/backends/cuda/qmodules.py b/lmdeploy/pytorch/backends/cuda/qmodules.py
index dc61787731..96f73eb28c 100644
--- a/lmdeploy/pytorch/backends/cuda/qmodules.py
+++ b/lmdeploy/pytorch/backends/cuda/qmodules.py
@@ -1,11 +1,13 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Optional
 
 import torch
 
 import lmdeploy.pytorch.distributed as dist
-from lmdeploy.pytorch.kernels.cuda.w8a8_triton_kernels import (matmul_kernel_dynamic_quant, per_token_quant_int8,
-                                                               rms_norm_dynamic_quant)
+from lmdeploy.pytorch.kernels.cuda.w8a8_triton_kernels import (
+    matmul_kernel_dynamic_quant,
+    per_token_quant_int8,
+    rms_norm_dynamic_quant,
+)
 from lmdeploy.pytorch.models.q_modules import QTensor
 
 from ..qmodules import LinearW8A8Builder, LinearW8A8Impl, RMSNormW8A8Builder, RMSNormW8A8Impl
@@ -62,9 +64,9 @@ def forward(self,
                 x,
                 weight: torch.Tensor,
                 scale: torch.Tensor,
-                bias: Optional[torch.Tensor] = None,
+                bias: torch.Tensor | None = None,
                 all_reduce: bool = False,
-                group: Optional[torch.distributed.ProcessGroup] = None):
+                group: torch.distributed.ProcessGroup | None = None):
         """forward."""
         if isinstance(x, torch.Tensor):
             input_quant, input_scale = per_token_quant_int8(x, 1e-7, quant_dtype=self.quant_dtype)
diff --git a/lmdeploy/pytorch/backends/cuda/token_dispatcher.py b/lmdeploy/pytorch/backends/cuda/token_dispatcher.py
index ce4c9307d6..15176d14e2 100644
--- a/lmdeploy/pytorch/backends/cuda/token_dispatcher.py
+++ b/lmdeploy/pytorch/backends/cuda/token_dispatcher.py
@@ -9,7 +9,6 @@
 except ImportError:
     use_deepep = False
 
-from typing import List, Optional, Tuple
 
 import torch
 import torch.distributed as dist
@@ -137,9 +136,9 @@ def dispatch(
         hidden_states: torch.Tensor,
         topk_idx: torch.Tensor,
         topk_weights: torch.Tensor,
-        expert_list: List[int] = None,
+        expert_list: list[int] = None,
         previous_event=None,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         self.hidden_shape = hidden_states.shape
         topk_idx = topk_idx.to(torch.int64)
         (
@@ -218,7 +217,7 @@ def dispatch_normal_async(self,
                               x: torch.Tensor,
                               topk_idx: torch.Tensor,
                               topk_weights: torch.Tensor,
-                              num_experts: Optional[int] = None,
+                              num_experts: int | None = None,
                               previous_event=None,
                               async_finish=True):
         (
@@ -271,7 +270,7 @@ def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
         self.handle = None
         return hidden_states.view(self.hidden_shape)
 
-    def combine_normal(self, x: torch.Tensor, handle: Tuple, previous_event=None):
+    def combine_normal(self, x: torch.Tensor, handle: tuple, previous_event=None):
         combined_x, _, event = self.buffer_normal.combine(
             x,
             handle,
@@ -281,7 +280,7 @@ def combine_normal(self, x: torch.Tensor, handle: Tuple, previous_event=None):
         )
         return combined_x, event
 
-    def combine_normal_async(self, x: torch.Tensor, handle: Tuple, previous_event=None, async_finish=True):
+    def combine_normal_async(self, x: torch.Tensor, handle: tuple, previous_event=None, async_finish=True):
         combined_x, _, event = self.buffer_normal.combine(
             x,
             handle,
@@ -307,9 +306,9 @@ def get_number_of_tokens_per_expert(self) -> torch.Tensor:
 
     def get_permuted_hidden_states_by_experts(self,
                                               hidden_states: torch.Tensor,
-                                              topk_idx: Optional[torch.Tensor] = None,
-                                              topk_weights: Optional[torch.Tensor] = None,
-                                              num_experts: Optional[int] = None) -> torch.Tensor:
+                                              topk_idx: torch.Tensor | None = None,
+                                              topk_weights: torch.Tensor | None = None,
+                                              num_experts: int | None = None) -> torch.Tensor:
         (dispatched_routing_map,
          topk_weights) = super().indices_to_multihot(self.topk_idx if topk_idx is None else topk_idx,
                                                      self.topk_weights if topk_weights is None else topk_weights,
@@ -328,10 +327,10 @@ def get_permuted_hidden_states_by_experts(self,
     def get_restored_hidden_states_by_experts(
         self,
         hidden_states: torch.Tensor,
-        reversed_mapping_for_combine: Optional[torch.Tensor] = None,
-        hidden_shape_before_permute: Optional[torch.Size] = None,
-        dispatched_routing_map: Optional[torch.Tensor] = None,
-        topk_weights: Optional[torch.Tensor] = None,
+        reversed_mapping_for_combine: torch.Tensor | None = None,
+        hidden_shape_before_permute: torch.Size | None = None,
+        dispatched_routing_map: torch.Tensor | None = None,
+        topk_weights: torch.Tensor | None = None,
     ) -> torch.Tensor:
         input_dtype = hidden_states.dtype
         assert (self.topk_weights.dtype == torch.float32), 'DeepEP only supports float32 probs'
@@ -381,7 +380,7 @@ def dispatch(
         topk_idx: torch.Tensor,
         topk_weights: torch.Tensor,
         num_experts: int,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         topk_idx = topk_idx.to(torch.int64)
         expected_m = (hidden_states.shape[0] * self.buffer_low_latency.group_size * topk_idx.shape[1] +
                       num_experts) // num_experts
@@ -408,7 +407,7 @@ def dispatch_async(
         self,
         hidden_states: torch.Tensor,
         topk_idx: torch.Tensor,
-        num_experts: Optional[int] = None,
+        num_experts: int | None = None,
         use_fp8: bool = True,
         async_finish: bool = True,
     ):
@@ -429,7 +428,7 @@ def combine(
         hidden_states: torch.Tensor,
         topk_idx: torch.Tensor,
         topk_weights: torch.Tensor,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         combined_hidden_states, event, hook = (self.buffer_low_latency.low_latency_combine(
             hidden_states,
             topk_idx,
@@ -446,9 +445,9 @@ def combine_async(
         hidden_states: torch.Tensor,
         topk_idx: torch.Tensor,
         topk_weights: torch.Tensor,
-        handle: Tuple,
+        handle: tuple,
         async_finish: bool,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
         assert topk_idx.dtype == torch.int64
         assert topk_weights.dtype == torch.float32
         combined_hidden_states, event, hook = self.buffer_low_latency.low_latency_combine(
diff --git a/lmdeploy/pytorch/backends/default/awq_modules.py b/lmdeploy/pytorch/backends/default/awq_modules.py
index d2253920fa..1837f6d65e 100644
--- a/lmdeploy/pytorch/backends/default/awq_modules.py
+++ b/lmdeploy/pytorch/backends/default/awq_modules.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from functools import lru_cache
-from typing import Optional
 
 import torch
 
@@ -61,9 +60,9 @@ def forward(self,
                 qweight: torch.Tensor,
                 scales: torch.Tensor,
                 qzeros: torch.Tensor,
-                bias: Optional[torch.Tensor] = None,
+                bias: torch.Tensor | None = None,
                 all_reduce: bool = False,
-                group: Optional[torch.distributed.ProcessGroup] = None):
+                group: torch.distributed.ProcessGroup | None = None):
         """forward."""
         out_shape = x.shape[:-1] + (self.out_features, )
         input_dtype = x.dtype
diff --git a/lmdeploy/pytorch/backends/default/linear.py b/lmdeploy/pytorch/backends/default/linear.py
index f766123fff..7823d26566 100644
--- a/lmdeploy/pytorch/backends/default/linear.py
+++ b/lmdeploy/pytorch/backends/default/linear.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import List, Optional
 
 import torch
 import torch.distributed as dist
@@ -14,11 +13,11 @@ class DefaultLinearImpl(LinearImpl):
     def forward(self,
                 x,
                 weight: torch.Tensor,
-                bias: Optional[torch.Tensor] = None,
+                bias: torch.Tensor | None = None,
                 all_reduce: bool = False,
                 group: dist.ProcessGroup = None,
                 rank: int = 0,
-                scatter_size: List[int] = None):
+                scatter_size: list[int] = None):
         """forward."""
         out = F.linear(x, weight, bias)
         if all_reduce:
diff --git a/lmdeploy/pytorch/backends/default/moe_router.py b/lmdeploy/pytorch/backends/default/moe_router.py
index 34d982a7b0..7ff818965a 100644
--- a/lmdeploy/pytorch/backends/default/moe_router.py
+++ b/lmdeploy/pytorch/backends/default/moe_router.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import functools
-from typing import Tuple
 
 import torch
 
@@ -52,7 +51,7 @@ def __init__(
         # n_group
         self.router_n_groups = router_n_groups
 
-    def _forward_router_n_groups(self, scores_for_choice: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    def _forward_router_n_groups(self, scores_for_choice: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
         assert scores_for_choice.shape[-1] % self.router_n_groups == 0, \
             f'{scores_for_choice.shape[-1]} cannot be divided by {self.router_n_groups}'
         per_group_top_k = self.top_k // self.router_n_groups
@@ -65,7 +64,7 @@ def _forward_router_n_groups(self, scores_for_choice: torch.Tensor) -> Tuple[tor
         return topk_weight, topk_idx
 
     def _forward_default(self, scores: torch.Tensor, scores_for_choice: torch.Tensor,
-                         sequence_length: int) -> Tuple[torch.Tensor, torch.Tensor]:
+                         sequence_length: int) -> tuple[torch.Tensor, torch.Tensor]:
         group_scores = (scores_for_choice.view(sequence_length, self.n_group,
                                                -1).topk(2, dim=-1)[0].sum(dim=-1))  # [n, n_group]
         group_idx = torch.topk(group_scores, k=self.topk_group, dim=-1, sorted=False)[1]  # [n, top_k_group]
@@ -90,7 +89,7 @@ def renorm(self, topk_weight: torch.Tensor) -> torch.Tensor:
         topk_weight = topk_weight * self.routed_scaling_factor
         return topk_weight
 
-    def forward(self, logits: torch.Tensor, bias: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    def forward(self, logits: torch.Tensor, bias: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
         """Router forward."""
         sequence_length = logits.shape[0]
 
diff --git a/lmdeploy/pytorch/backends/default/op_backend.py b/lmdeploy/pytorch/backends/default/op_backend.py
index 84badaa9e3..6bfb9e5934 100644
--- a/lmdeploy/pytorch/backends/default/op_backend.py
+++ b/lmdeploy/pytorch/backends/default/op_backend.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Tuple
 
 import torch
 
@@ -60,7 +59,7 @@ def get_k_block_shape(
         num_heads: int,
         head_size: int,
         dtype: torch.dtype,
-    ) -> Tuple[int, ...]:
+    ) -> tuple[int, ...]:
         """Get block shape of k."""
         return (
             block_size,
@@ -74,7 +73,7 @@ def get_v_block_shape(
         num_heads: int,
         head_size: int,
         dtype: torch.dtype,
-    ) -> Tuple[int, ...]:
+    ) -> tuple[int, ...]:
         """Get block shape of v."""
         return (
             block_size,
diff --git a/lmdeploy/pytorch/backends/default/rotary_embedding.py b/lmdeploy/pytorch/backends/default/rotary_embedding.py
index e37caa52a7..a20d9ea7dc 100644
--- a/lmdeploy/pytorch/backends/default/rotary_embedding.py
+++ b/lmdeploy/pytorch/backends/default/rotary_embedding.py
@@ -7,8 +7,15 @@
 import torch.nn.functional as F
 from torch import nn
 
-from ..rotary_embedding import (FopeParameters, Llama3Parameters, LongRoPEScalingParameters, RopeType,
-                                RotaryEmbeddingBuilder, RotaryEmbeddingImpl, YarnParameters)
+from ..rotary_embedding import (
+    FopeParameters,
+    Llama3Parameters,
+    LongRoPEScalingParameters,
+    RopeType,
+    RotaryEmbeddingBuilder,
+    RotaryEmbeddingImpl,
+    YarnParameters,
+)
 
 
 def safe_torch_compile(**compile_kwargs):
diff --git a/lmdeploy/pytorch/backends/default/token_dispatcher.py b/lmdeploy/pytorch/backends/default/token_dispatcher.py
index f8436f3838..256fe5707c 100644
--- a/lmdeploy/pytorch/backends/default/token_dispatcher.py
+++ b/lmdeploy/pytorch/backends/default/token_dispatcher.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Tuple
 
 import torch
 
@@ -80,7 +79,7 @@ def preprocess(self, routing_map: torch.Tensor, local_expert_indices) -> torch.T
         return num_tokens_per_local_expert
 
     def dispatch(self, hidden_states: torch.Tensor, topk_ids: torch.Tensor, probs: torch.Tensor,
-                 local_expert_indices) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+                 local_expert_indices) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         self.hidden_shape = hidden_states.shape
         self.topk_ids = topk_ids
         self.routing_map, self.topk_weights = super().indices_to_multihot(topk_ids, probs, self.num_experts)
diff --git a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py
index 484cbd1b72..169da9c150 100644
--- a/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py
+++ b/lmdeploy/pytorch/backends/dlinfer/ascend/op_backend.py
@@ -6,7 +6,6 @@
 from dataclasses import dataclass
 from functools import lru_cache
 from pathlib import Path
-from typing import Dict, Tuple
 
 import torch
 import torch.distributed as dist
@@ -72,11 +71,11 @@ class DistMeta:
 
 class AscendKVQuantMeta:
     has_set_value: bool = False
-    quant_meta: Dict = {}
+    quant_meta: dict = {}
 
     @classmethod
     def set_value(cls, device: str, dtype: torch.dtype, record_file: str, total_layers: int):
-        with open(record_file, 'r') as file:
+        with open(record_file) as file:
             data = file.read()
         scale_offset_pairs = re.findall(r'scale:\s*([\d\.\-]+)\s*offset:\s*(-?\d+)', data)
         scale_offset_pairs = [(float(scale), float(offset)) for scale, offset in scale_offset_pairs]
@@ -133,7 +132,7 @@ def get_k_block_shape(
         num_heads: int,
         head_size: int,
         dtype: torch.dtype,
-    ) -> Tuple[int, ...]:
+    ) -> tuple[int, ...]:
         if SocVersion.is_Ascend910():
             return (block_size, num_heads, head_size)
         else:
@@ -145,7 +144,7 @@ def get_v_block_shape(
         num_heads: int,
         head_size: int,
         dtype: torch.dtype,
-    ) -> Tuple[int, ...]:
+    ) -> tuple[int, ...]:
         if SocVersion.is_Ascend910():
             return (block_size, num_heads, head_size)
         else:
diff --git a/lmdeploy/pytorch/backends/dlinfer/attention.py b/lmdeploy/pytorch/backends/dlinfer/attention.py
index 8566187021..78afe49040 100644
--- a/lmdeploy/pytorch/backends/dlinfer/attention.py
+++ b/lmdeploy/pytorch/backends/dlinfer/attention.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
+from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Dict, Optional, Sequence
 
 from torch import Tensor
 
@@ -10,14 +10,14 @@
 
 @dataclass
 class DlinferAttentionMetadata(AttentionMetadata):
-    kv_start_indices: Optional[Tensor] = None
+    kv_start_indices: Tensor | None = None
     block_size: int = 64
     attention_mask: Sequence[Tensor] = tuple()
-    is_unpaged_prefill: Optional[bool] = None
+    is_unpaged_prefill: bool | None = None
     max_q_seq_len: int = 1
     max_kv_seq_len: int = 1
-    quant_meta: Dict = None
-    cu_seq_lens_kv: Optional[Tensor] = None
+    quant_meta: dict = None
+    cu_seq_lens_kv: Tensor | None = None
 
 
 class DlinferAttentionImpl(AttentionImpl[DlinferAttentionMetadata]):
diff --git a/lmdeploy/pytorch/backends/dlinfer/awq_modules.py b/lmdeploy/pytorch/backends/dlinfer/awq_modules.py
index 1ec8bf0072..c9dcc381ec 100644
--- a/lmdeploy/pytorch/backends/dlinfer/awq_modules.py
+++ b/lmdeploy/pytorch/backends/dlinfer/awq_modules.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Optional
 
 import torch
 
@@ -22,9 +21,9 @@ def forward(self,
                 qweight: torch.Tensor,
                 scales: torch.Tensor,
                 qzeros: torch.Tensor,
-                bias: Optional[torch.Tensor] = None,
+                bias: torch.Tensor | None = None,
                 all_reduce: bool = False,
-                group: Optional[torch.distributed.ProcessGroup] = None):
+                group: torch.distributed.ProcessGroup | None = None):
         """forward."""
         out = awq_linear(x, qweight, scales, qzeros, bias, all_reduce, self.group_size)
         return out
diff --git a/lmdeploy/pytorch/backends/dlinfer/camb/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/camb/op_backend.py
index a4ddc6fc93..18f04de73b 100644
--- a/lmdeploy/pytorch/backends/dlinfer/camb/op_backend.py
+++ b/lmdeploy/pytorch/backends/dlinfer/camb/op_backend.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Tuple
 
 import torch
 
@@ -26,7 +25,7 @@ def get_k_block_shape(
         num_heads: int,
         head_size: int,
         dtype: torch.dtype,
-    ) -> Tuple[int, ...]:
+    ) -> tuple[int, ...]:
         return (
             num_heads,
             block_size,
@@ -39,7 +38,7 @@ def get_v_block_shape(
         num_heads: int,
         head_size: int,
         dtype: torch.dtype,
-    ) -> Tuple[int, ...]:
+    ) -> tuple[int, ...]:
         return (
             num_heads,
             block_size,
diff --git a/lmdeploy/pytorch/backends/dlinfer/linear.py b/lmdeploy/pytorch/backends/dlinfer/linear.py
index fbe717f5c2..e4e22d7f57 100644
--- a/lmdeploy/pytorch/backends/dlinfer/linear.py
+++ b/lmdeploy/pytorch/backends/dlinfer/linear.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import os
-from typing import List, Optional
 
 import torch
 import torch.distributed as dist
@@ -13,7 +12,7 @@
 class DlinferLinearImpl(LinearImpl):
     """Dlinfer linear implementation api."""
 
-    def update_weights(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = None):
+    def update_weights(self, weight: torch.Tensor, bias: torch.Tensor | None = None):
         """Update weights."""
         if os.getenv('DLINFER_LINEAR_USE_NN_LAYOUT', '0') == '1':
             weight = weight.data.t().contiguous()
@@ -22,11 +21,11 @@ def update_weights(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = No
     def forward(self,
                 x,
                 weight: torch.Tensor,
-                bias: Optional[torch.Tensor] = None,
+                bias: torch.Tensor | None = None,
                 all_reduce: bool = False,
                 group: dist.ProcessGroup = None,
                 rank: int = 0,
-                scatter_size: List[int] = None):
+                scatter_size: list[int] = None):
         """forward."""
         out = linear(x, weight, bias, False)
         if all_reduce:
diff --git a/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py
index 3be4ab6f24..8420b159dc 100644
--- a/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py
+++ b/lmdeploy/pytorch/backends/dlinfer/maca/op_backend.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Tuple
 
 import torch
 
@@ -26,7 +25,7 @@ def get_k_block_shape(
         num_heads: int,
         head_size: int,
         dtype: torch.dtype,
-    ) -> Tuple[int, ...]:
+    ) -> tuple[int, ...]:
         return (block_size, num_heads, head_size)
 
     @staticmethod
@@ -35,7 +34,7 @@ def get_v_block_shape(
         num_heads: int,
         head_size: int,
         dtype: torch.dtype,
-    ) -> Tuple[int, ...]:
+    ) -> tuple[int, ...]:
         return (block_size, num_heads, head_size)
 
     @classmethod
diff --git a/lmdeploy/pytorch/backends/dlinfer/moe.py b/lmdeploy/pytorch/backends/dlinfer/moe.py
index d9c83031e8..e0e5ccb04c 100644
--- a/lmdeploy/pytorch/backends/dlinfer/moe.py
+++ b/lmdeploy/pytorch/backends/dlinfer/moe.py
@@ -1,12 +1,15 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import os
-from typing import Callable, List
+from collections.abc import Callable
 
 import torch
 
-from lmdeploy.pytorch.kernels.dlinfer import DlinferMoECommType  # noqa: F401
-from lmdeploy.pytorch.kernels.dlinfer import DlinferMoeMetadata  # noqa: F401
-from lmdeploy.pytorch.kernels.dlinfer import fused_moe, moe_gating_topk_softmax
+from lmdeploy.pytorch.kernels.dlinfer import (
+    DlinferMoECommType,  # noqa: F401
+    DlinferMoeMetadata,  # noqa: F401
+    fused_moe,
+    moe_gating_topk_softmax,
+)
 from lmdeploy.pytorch.model_inputs import get_step_ctx_manager
 
 from ..moe import FusedMoEBuilder, FusedMoEImpl, SoftmaxTopKBuilder, SoftmaxTopKImpl
@@ -85,7 +88,7 @@ def forward(self,
                 down_weights: torch.Tensor,
                 gate_up_bias: torch.Tensor = None,
                 down_bias: torch.Tensor = None,
-                expert_list: List[int] = None,
+                expert_list: list[int] = None,
                 act_func: Callable = None):
         """forward."""
         assert gate_up_bias is None
diff --git a/lmdeploy/pytorch/backends/dlinfer/op_backend.py b/lmdeploy/pytorch/backends/dlinfer/op_backend.py
index 16eb604ccd..b01cc12596 100644
--- a/lmdeploy/pytorch/backends/dlinfer/op_backend.py
+++ b/lmdeploy/pytorch/backends/dlinfer/op_backend.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Tuple
 
 import torch
 
@@ -73,7 +72,7 @@ def get_k_block_shape(
         num_heads: int,
         head_size: int,
         dtype: torch.dtype,
-    ) -> Tuple[int, ...]:
+    ) -> tuple[int, ...]:
         return (
             block_size,
             num_heads,
@@ -86,7 +85,7 @@ def get_v_block_shape(
         num_heads: int,
         head_size: int,
         dtype: torch.dtype,
-    ) -> Tuple[int, ...]:
+    ) -> tuple[int, ...]:
         return (
             block_size,
             num_heads,
diff --git a/lmdeploy/pytorch/backends/dlinfer/qmodules.py b/lmdeploy/pytorch/backends/dlinfer/qmodules.py
index fe52dd5f35..af5594245d 100644
--- a/lmdeploy/pytorch/backends/dlinfer/qmodules.py
+++ b/lmdeploy/pytorch/backends/dlinfer/qmodules.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import os
-from typing import Optional
 
 import torch
 import torch.distributed as dist
@@ -24,7 +23,7 @@ def __init__(self,
         self.out_dtype = out_dtype
         self.quant_dtype = quant_dtype
 
-    def update_weights(self, weight: torch.Tensor, scale: torch.Tensor, bias: Optional[torch.Tensor] = None):
+    def update_weights(self, weight: torch.Tensor, scale: torch.Tensor, bias: torch.Tensor | None = None):
         """Update weights."""
         if os.getenv('DLINFER_LINEAR_USE_NN_LAYOUT', '0') == '1':
             weight = weight.data.t().contiguous()
@@ -35,9 +34,9 @@ def forward(self,
                 x,
                 weight: torch.Tensor,
                 scale: torch.Tensor,
-                bias: Optional[torch.Tensor] = None,
+                bias: torch.Tensor | None = None,
                 all_reduce: bool = False,
-                group: Optional[torch.distributed.ProcessGroup] = None):
+                group: torch.distributed.ProcessGroup | None = None):
         """forward."""
         if isinstance(x, torch.Tensor):
             input_quant, input_scale = dynamic_quant(x, self.quant_dtype)
diff --git a/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py b/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py
index bfad3a89a7..677bbdd86c 100644
--- a/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py
+++ b/lmdeploy/pytorch/backends/dlinfer/rotary_embedding.py
@@ -5,10 +5,20 @@
 import torch
 from torch import nn
 
-from ..default.rotary_embedding import (FopeRotaryEmbeddingImpl, LlamaDynamicNTKScalingRotaryEmbedding,
-                                        YarnRotaryEmbeddingImpl)
-from ..rotary_embedding import (FopeParameters, Llama3Parameters, LongRoPEScalingParameters, RopeType,
-                                RotaryEmbeddingBuilder, RotaryEmbeddingImpl, YarnParameters)
+from ..default.rotary_embedding import (
+    FopeRotaryEmbeddingImpl,
+    LlamaDynamicNTKScalingRotaryEmbedding,
+    YarnRotaryEmbeddingImpl,
+)
+from ..rotary_embedding import (
+    FopeParameters,
+    Llama3Parameters,
+    LongRoPEScalingParameters,
+    RopeType,
+    RotaryEmbeddingBuilder,
+    RotaryEmbeddingImpl,
+    YarnParameters,
+)
 
 
 def _rotary_embedding_fwd(position_ids: torch.Tensor,
diff --git a/lmdeploy/pytorch/backends/graph_runner.py b/lmdeploy/pytorch/backends/graph_runner.py
index a88872f2bd..72f460ef5b 100644
--- a/lmdeploy/pytorch/backends/graph_runner.py
+++ b/lmdeploy/pytorch/backends/graph_runner.py
@@ -1,7 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import functools
 from dataclasses import dataclass
-from typing import List
 
 import torch
 
@@ -55,7 +54,7 @@ def get_logits(self, hidden_states: torch.Tensor):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         inputs_embeds: torch.Tensor = None,
         context: StepContext = None,
     ):
@@ -68,7 +67,7 @@ def prepare_inputs_for_generation(
 
     def update_model_metas(
         self,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         inputs_embeds: torch.Tensor = None,
         context: StepContext = None,
     ):
@@ -100,6 +99,6 @@ def get_meta(self):
     def update_inputs(self, inputs):
         return inputs
 
-    def get_capture_batch_sizes(self) -> List[int]:
+    def get_capture_batch_sizes(self) -> list[int]:
         """Capture batch sizes."""
         return _get_capture_batch_size_impl(self.cache_config.max_batches)
diff --git a/lmdeploy/pytorch/backends/linear.py b/lmdeploy/pytorch/backends/linear.py
index 740b4b7ecc..88d0b150f0 100644
--- a/lmdeploy/pytorch/backends/linear.py
+++ b/lmdeploy/pytorch/backends/linear.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from abc import ABC, abstractmethod
-from typing import List, Optional
 
 import torch
 import torch.distributed as dist
@@ -9,7 +8,7 @@
 class LinearImpl(ABC):
     """Linear implementation api."""
 
-    def update_weights(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = None):
+    def update_weights(self, weight: torch.Tensor, bias: torch.Tensor | None = None):
         """Update weights."""
         return weight, bias
 
@@ -17,11 +16,11 @@ def update_weights(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = No
     def forward(self,
                 x,
                 weight: torch.Tensor,
-                bias: Optional[torch.Tensor] = None,
+                bias: torch.Tensor | None = None,
                 all_reduce: bool = False,
                 group: dist.ProcessGroup = None,
                 rank: int = 0,
-                scatter_size: List[int] = None):
+                scatter_size: list[int] = None):
         """forward."""
         raise NotImplementedError
 
diff --git a/lmdeploy/pytorch/backends/moe.py b/lmdeploy/pytorch/backends/moe.py
index 5b33b97da7..10a3c5e702 100644
--- a/lmdeploy/pytorch/backends/moe.py
+++ b/lmdeploy/pytorch/backends/moe.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import functools
 from abc import ABC, abstractmethod
-from typing import Callable, List, Optional
+from collections.abc import Callable
 
 import torch
 import torch.distributed as dist
@@ -52,7 +52,7 @@ def forward(self,
                 down_weights: torch.Tensor,
                 gate_up_bias: torch.Tensor = None,
                 down_bias: torch.Tensor = None,
-                expert_list: List[int] = None,
+                expert_list: list[int] = None,
                 act_func: Callable = None):
         """forward."""
         raise NotImplementedError
@@ -97,7 +97,7 @@ def forward(self,
                 gate_up_scale: torch.Tensor,
                 down_weights: torch.Tensor,
                 down_scale: torch.Tensor,
-                expert_list: List[int] = None):
+                expert_list: list[int] = None):
         """forward."""
         raise NotImplementedError
 
@@ -120,7 +120,7 @@ class FusedMoEBlockedF8Impl(ABC):
     """Fused moe blocked f8 implementation."""
 
     def __init__(self):
-        self.scale_fmt: Optional[str] = None
+        self.scale_fmt: str | None = None
 
     def update_weights(self, gate_up_weights: torch.Tensor, down_weights: torch.Tensor, gate_up_scale: torch.Tensor,
                        down_scale: torch.Tensor):
@@ -131,7 +131,7 @@ def ep_expert_list(self, world_size: int, rank: int):
         """Experts list of current rank."""
         raise NotImplementedError('Not Implemented.')
 
-    def set_scale_fmt(self, scale_fmt: Optional[str]):
+    def set_scale_fmt(self, scale_fmt: str | None):
         """Set scale fmt."""
         self.scale_fmt = scale_fmt
 
@@ -147,7 +147,7 @@ def forward(self,
                 down_scale: torch.Tensor,
                 gate_up_bias: torch.Tensor = None,
                 down_bias: torch.Tensor = None,
-                expert_list: List[int] = None,
+                expert_list: list[int] = None,
                 act_func: Callable = None):
         """forward."""
         raise NotImplementedError
diff --git a/lmdeploy/pytorch/backends/moe_router.py b/lmdeploy/pytorch/backends/moe_router.py
index 87cb90a7cc..e523ee9a56 100644
--- a/lmdeploy/pytorch/backends/moe_router.py
+++ b/lmdeploy/pytorch/backends/moe_router.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from abc import ABC, abstractmethod
-from typing import Tuple
 
 import torch
 
@@ -9,7 +8,7 @@ class RouterNoauxTCImpl(ABC):
     """Noaux tc implementation api."""
 
     @abstractmethod
-    def forward(self, logits: torch.Tensor, bias: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    def forward(self, logits: torch.Tensor, bias: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
         """forward."""
         raise NotImplementedError
 
diff --git a/lmdeploy/pytorch/backends/qmodules.py b/lmdeploy/pytorch/backends/qmodules.py
index 7173fb5f34..4b98da7abb 100644
--- a/lmdeploy/pytorch/backends/qmodules.py
+++ b/lmdeploy/pytorch/backends/qmodules.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from abc import ABC, abstractmethod
-from typing import Optional
 
 import torch
 
@@ -37,7 +36,7 @@ def build(hidden_size: int, eps: float = 1e-6, quant_dtype: torch.dtype = torch.
 class LinearW8A8Impl(ABC):
     """Linear w8a8 implementation api."""
 
-    def update_weights(self, weight: torch.Tensor, scale: torch.Tensor, bias: Optional[torch.Tensor] = None):
+    def update_weights(self, weight: torch.Tensor, scale: torch.Tensor, bias: torch.Tensor | None = None):
         """Update weights."""
         return weight, scale, bias
 
@@ -46,9 +45,9 @@ def forward(self,
                 x,
                 weight: torch.Tensor,
                 scale: torch.Tensor,
-                bias: Optional[torch.Tensor] = None,
+                bias: torch.Tensor | None = None,
                 all_reduce: bool = False,
-                group: Optional[torch.distributed.ProcessGroup] = None):
+                group: torch.distributed.ProcessGroup | None = None):
         """forward."""
         raise NotImplementedError
 
diff --git a/lmdeploy/pytorch/backends/rotary_embedding.py b/lmdeploy/pytorch/backends/rotary_embedding.py
index 7495e39b75..16b9d7c799 100644
--- a/lmdeploy/pytorch/backends/rotary_embedding.py
+++ b/lmdeploy/pytorch/backends/rotary_embedding.py
@@ -2,7 +2,6 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from enum import Enum, auto
-from typing import List
 
 import torch
 
@@ -32,8 +31,8 @@ class YarnParameters:
 @dataclass
 class LongRoPEScalingParameters:
     """Long Ropescaling parameters."""
-    short_factor: List[int]
-    long_factor: List[int]
+    short_factor: list[int]
+    long_factor: list[int]
     original_max_position_embeddings: int
     long_mscale: float = None
     short_mscale: float = None
diff --git a/lmdeploy/pytorch/backends/token_dispatcher.py b/lmdeploy/pytorch/backends/token_dispatcher.py
index 9d831f97ba..34a4136f2c 100644
--- a/lmdeploy/pytorch/backends/token_dispatcher.py
+++ b/lmdeploy/pytorch/backends/token_dispatcher.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from abc import ABC, abstractmethod
-from typing import Tuple
 
 import torch
 
@@ -63,7 +62,7 @@ def indices_to_multihot(self, topk_ids, topk_weight, num_experts):
 
     @abstractmethod
     def dispatch(self, hidden_states: torch.Tensor, probs: torch.Tensor, topk_ids: torch.Tensor,
-                 local_expert_indices) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+                 local_expert_indices) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """dispatch."""
         raise NotImplementedError
 
diff --git a/lmdeploy/pytorch/check_env/base.py b/lmdeploy/pytorch/check_env/base.py
index f40dba96bf..497fd2db54 100644
--- a/lmdeploy/pytorch/check_env/base.py
+++ b/lmdeploy/pytorch/check_env/base.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from logging import Logger
-from typing import List
 
 from lmdeploy.utils import can_colorize, get_logger
 
@@ -23,7 +22,7 @@ def __init__(self, logger: Logger = None):
             logger = get_logger('lmdeploy')
         self.logger = logger
         self._is_passed = False
-        self._required_checker: List[BaseChecker] = list()
+        self._required_checker: list[BaseChecker] = list()
 
     def get_logger(self):
         """Get logger."""
diff --git a/lmdeploy/pytorch/config.py b/lmdeploy/pytorch/config.py
index d80045ab92..93f744fc4b 100644
--- a/lmdeploy/pytorch/config.py
+++ b/lmdeploy/pytorch/config.py
@@ -1,7 +1,8 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import enum
+from collections.abc import Callable
 from dataclasses import dataclass, field
-from typing import Any, Callable, Dict, List, Literal, Optional, Tuple
+from typing import Any, Literal
 
 import torch
 
@@ -91,7 +92,7 @@ class CacheConfig:
     quant_policy: Literal[0, 4, 8] = 0
     device_type: str = 'cuda'
     num_state_caches: int = None
-    states_shapes: List[Tuple] = field(default_factory=list)
+    states_shapes: list[tuple] = field(default_factory=list)
 
     # reserved blocks for dummy inputs, init to 0 for unit test.
     num_reserved_gpu_blocks: int = 0
@@ -254,7 +255,7 @@ def _override_hf_config(hf_config: Any, key: str, hf_overrides):
         _overide_hf_config_cfg(hf_config, key, hf_overrides)
 
 
-def override_hf_config(hf_config: Any, hf_overrides: Dict[str, Any]):
+def override_hf_config(hf_config: Any, hf_overrides: dict[str, Any]):
     """Override HF config."""
     for k, v in hf_overrides.items():
         _override_hf_config(hf_config, k, v)
@@ -302,7 +303,7 @@ class ModelConfig:
     num_attention_heads: int
     num_key_value_heads: int
     bos_token_id: int
-    eos_token_id: List[int]
+    eos_token_id: list[int]
     head_dim: int
     k_head_dim: int = None
     v_head_dim: int = None
@@ -312,12 +313,12 @@ class ModelConfig:
     hf_config: Any = None
     llm_config: Any = None
     cogvlm_style: bool = False
-    custom_module_map: Dict[str, setattr] = None
+    custom_module_map: dict[str, setattr] = None
 
     # flash mla
     use_flash_mla: bool = False
     use_mla_fp8_cache: bool = False
-    mla_index_topk: Optional[int] = None
+    mla_index_topk: int | None = None
 
     # dllm
     model_paradigm: str = 'ar'
@@ -326,10 +327,10 @@ class ModelConfig:
 
     # Added for deepseekv3.2 nsa index
     # caches would be added after kv cache
-    cache_shapes: List[Tuple[List[int], torch.dtype]] = field(default_factory=list)
+    cache_shapes: list[tuple[list[int], torch.dtype]] = field(default_factory=list)
     # added for qwen3_next
     # could used for any SSM model.
-    states_shapes: List[Tuple[Tuple[int], torch.dtype]] = field(default_factory=list)
+    states_shapes: list[tuple[tuple[int], torch.dtype]] = field(default_factory=list)
 
     # check env for model-device combination
     check_env_func: Callable = _default_check_env
@@ -352,7 +353,7 @@ def from_pretrained(
         trust_remote_code: bool = True,
         dtype: str = 'auto',
         dist_config: DistConfig = None,
-        hf_overrides: Dict[str, Any] = None,
+        hf_overrides: dict[str, Any] = None,
         is_draft_model: bool = False,
         spec_method: str = None,
         model_format: str = None,
@@ -366,7 +367,7 @@ def from_pretrained(
                 models defined on the Hub in their own modeling files.
             dtype (str): user specified data type for model weights and
                 activations. Refer to `PyTorchEngineConfig` for details
-            hf_overrides (Dict[str, Any]): overrides for the HF config.
+            hf_overrides (dict[str, Any]): overrides for the HF config.
         """
         from transformers import AutoConfig
 
@@ -488,7 +489,7 @@ class MiscConfig:
     custom_module_map: str = None
     empty_init: bool = False
     model_format: str = None
-    hf_overrides: Dict[str, Any] = None
+    hf_overrides: dict[str, Any] = None
     disable_vision_encoder: bool = False
     logprobs_mode: str = None
     dllm_config: DLLMConfig = None
@@ -571,10 +572,10 @@ class QuantizationConfig:
     scale_fmt: str = None
     bits: int = None
     group_size: int = None
-    weight_block_size: Tuple[int] = None
+    weight_block_size: tuple[int] = None
     activation_scheme: str = None
-    ignored_layers: List[str] = field(default_factory=list)
-    hf_quant_config: Dict[str, Any] = field(default_factory=dict)
+    ignored_layers: list[str] = field(default_factory=list)
+    hf_quant_config: dict[str, Any] = field(default_factory=dict)
 
     @classmethod
     def from_config(cls, hf_config: Any):
diff --git a/lmdeploy/pytorch/configurations/__init__.py b/lmdeploy/pytorch/configurations/__init__.py
index 697df755d6..703fe98b3d 100644
--- a/lmdeploy/pytorch/configurations/__init__.py
+++ b/lmdeploy/pytorch/configurations/__init__.py
@@ -9,7 +9,7 @@
 # load all submodule
 for loader, module_name, is_pkg in pkgutil.walk_packages(__path__):
     __all__.append(module_name)
-    _module = importlib.import_module('{}.{}'.format(__name__, module_name))
+    _module = importlib.import_module(f'{__name__}.{module_name}')
     globals()[module_name] = _module
 
 __all__ += ['AutoModelConfigBuilder']
diff --git a/lmdeploy/pytorch/devices/device_manager.py b/lmdeploy/pytorch/devices/device_manager.py
index 91fd50fbe2..2a0584c3da 100644
--- a/lmdeploy/pytorch/devices/device_manager.py
+++ b/lmdeploy/pytorch/devices/device_manager.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+from collections.abc import Callable
 from dataclasses import dataclass
-from typing import Callable
 
 from lmdeploy.pytorch.utils import CtxMgrBase, singleton
 
diff --git a/lmdeploy/pytorch/disagg/backend/base.py b/lmdeploy/pytorch/disagg/backend/base.py
index 8df1d118c0..c10e712f40 100644
--- a/lmdeploy/pytorch/disagg/backend/base.py
+++ b/lmdeploy/pytorch/disagg/backend/base.py
@@ -1,8 +1,11 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from abc import abstractmethod
 
-from lmdeploy.pytorch.disagg.conn.protocol import (DistServeInitRequest, DistServeKVTransferEndpointInfo,
-                                                   MigrationProtocol)
+from lmdeploy.pytorch.disagg.conn.protocol import (
+    DistServeInitRequest,
+    DistServeKVTransferEndpointInfo,
+    MigrationProtocol,
+)
 from lmdeploy.pytorch.disagg.messages import DistServeRegisterMRMessage, MigrationAssignment
 
 
diff --git a/lmdeploy/pytorch/disagg/backend/dlslime.py b/lmdeploy/pytorch/disagg/backend/dlslime.py
index c8a8454d35..34be90c687 100644
--- a/lmdeploy/pytorch/disagg/backend/dlslime.py
+++ b/lmdeploy/pytorch/disagg/backend/dlslime.py
@@ -2,7 +2,6 @@
 import asyncio
 import json
 import os
-from typing import Dict
 
 from dlslime import RDMAEndpoint, available_nic
 
@@ -10,8 +9,11 @@
 from lmdeploy.pytorch.disagg.backend.backend import MIGRATION_BACKENDS
 from lmdeploy.pytorch.disagg.backend.base import MigrationBackendImpl
 from lmdeploy.pytorch.disagg.config import DistServeEngineConfig, MigrationBackend
-from lmdeploy.pytorch.disagg.conn.protocol import (DistServeInitRequest, DistServeKVTransferEndpointInfo,
-                                                   MigrationProtocol)
+from lmdeploy.pytorch.disagg.conn.protocol import (
+    DistServeInitRequest,
+    DistServeKVTransferEndpointInfo,
+    MigrationProtocol,
+)
 from lmdeploy.pytorch.disagg.messages import DistServeRegisterMRMessage, MigrationAssignment
 
 logger = get_logger('lmdeploy')
@@ -25,7 +27,7 @@ def __init__(self, init_request: DistServeInitRequest):
         self.rank = init_request.rank
         self.local_engine_config: DistServeEngineConfig = (init_request.local_engine_config)
         self.remote_engine_config: DistServeEngineConfig = (init_request.remote_engine_config)
-        self.endpoint: Dict[MigrationProtocol, RDMAEndpoint] = {}
+        self.endpoint: dict[MigrationProtocol, RDMAEndpoint] = {}
         if init_request.protocol == MigrationProtocol.RDMA:
             nics = available_nic()
             device_name = nics[self.rank % len(nics)]
@@ -76,7 +78,7 @@ class DLSlimeBackend(MigrationBackendImpl):
     """DLSlime Transfer Engine."""
 
     def __init__(self):
-        self.links: Dict[str, DLSlimeMigrationManagement] = {}
+        self.links: dict[str, DLSlimeMigrationManagement] = {}
 
     def p2p_initialize(self, init_request: DistServeInitRequest):
         self.links[init_request.remote_engine_id] = DLSlimeMigrationManagement(init_request)
diff --git a/lmdeploy/pytorch/disagg/backend/mooncake.py b/lmdeploy/pytorch/disagg/backend/mooncake.py
index e4ba7fbd5f..e33056b717 100644
--- a/lmdeploy/pytorch/disagg/backend/mooncake.py
+++ b/lmdeploy/pytorch/disagg/backend/mooncake.py
@@ -4,13 +4,15 @@
 import os
 import socket
 import subprocess
-from typing import Dict
 
 from lmdeploy.pytorch.disagg.backend.backend import MIGRATION_BACKENDS
 from lmdeploy.pytorch.disagg.backend.base import MigrationBackendImpl
 from lmdeploy.pytorch.disagg.config import MigrationBackend, MooncakeEngineConfig
-from lmdeploy.pytorch.disagg.conn.protocol import (DistServeInitRequest, DistServeKVTransferEndpointInfo,
-                                                   MigrationProtocol)
+from lmdeploy.pytorch.disagg.conn.protocol import (
+    DistServeInitRequest,
+    DistServeKVTransferEndpointInfo,
+    MigrationProtocol,
+)
 from lmdeploy.pytorch.disagg.messages import DistServeRegisterMRMessage, MigrationAssignment
 from lmdeploy.utils import get_logger
 
@@ -88,8 +90,8 @@ def __init__(self, init_request: DistServeInitRequest):
         # Get all RDMA information once during initialization
         self.ibv_devices = get_rdma_nics()
 
-        self.local_kv_table: Dict[str, Dict] = {}
-        self.remote_kv_table: Dict[str, Dict] = {}
+        self.local_kv_table: dict[str, dict] = {}
+        self.remote_kv_table: dict[str, dict] = {}
         self.remote_url: str = ''  # Store remote URL for this connection
 
         # Initialize the p2p connection
@@ -142,7 +144,7 @@ def register_memory_region(self, register_mr_request: DistServeRegisterMRMessage
                     f'addr: {buffer_addr}, length: {buffer_length} for remote_engine_id {self.remote_engine_id}')
 
     @property
-    def endpoint_info(self) -> Dict:
+    def endpoint_info(self) -> dict:
         """Get endpoint information for this connection."""
 
         mr_info = {}
@@ -237,7 +239,7 @@ class MooncakeBackend(MigrationBackendImpl):
     """Mooncake backend that manages multiple migration connections."""
 
     def __init__(self):
-        self.links: Dict[int, MooncakeMigrationManagement] = {}
+        self.links: dict[int, MooncakeMigrationManagement] = {}
 
     def p2p_initialize(self, init_request: DistServeInitRequest):
         self.links[init_request.remote_engine_id] = MooncakeMigrationManagement(init_request)
diff --git a/lmdeploy/pytorch/disagg/config.py b/lmdeploy/pytorch/disagg/config.py
index f4dd002231..a02a831bcd 100644
--- a/lmdeploy/pytorch/disagg/config.py
+++ b/lmdeploy/pytorch/disagg/config.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import enum
-from typing import Optional
 
 from pydantic import BaseModel
 
@@ -98,7 +97,7 @@ class DistServeEngineConfig(BaseModel):
     tp_size: int
     ep_size: int
     dp_size: int
-    pp_size: Optional[int]
+    pp_size: int | None
 
     # Rank of DP
     dp_rank: int
diff --git a/lmdeploy/pytorch/disagg/conn/engine_conn.py b/lmdeploy/pytorch/disagg/conn/engine_conn.py
index 0312df05bd..191d5690a2 100644
--- a/lmdeploy/pytorch/disagg/conn/engine_conn.py
+++ b/lmdeploy/pytorch/disagg/conn/engine_conn.py
@@ -1,18 +1,24 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import asyncio
 import os
-from typing import TYPE_CHECKING, Dict, List
+from typing import TYPE_CHECKING
 from urllib.parse import urlparse
 
 import zmq
 import zmq.asyncio
 
 from lmdeploy.logger import get_logger
-from lmdeploy.pytorch.disagg.conn.protocol import (DistServeCacheFreeRequest, DistServeConnectionRequest,
-                                                   DistServeConnectionResponse, DistServeConnectionStatus,
-                                                   DistServeDropConnectionRequest, DistServeEngineEndpointInfo,
-                                                   DistServeInitRequest, DistServeInitResponse,
-                                                   DistServeKVTransferEndpointInfo)
+from lmdeploy.pytorch.disagg.conn.protocol import (
+    DistServeCacheFreeRequest,
+    DistServeConnectionRequest,
+    DistServeConnectionResponse,
+    DistServeConnectionStatus,
+    DistServeDropConnectionRequest,
+    DistServeEngineEndpointInfo,
+    DistServeInitRequest,
+    DistServeInitResponse,
+    DistServeKVTransferEndpointInfo,
+)
 from lmdeploy.pytorch.engine.executor.dist_utils import find_available_port
 
 if TYPE_CHECKING:
@@ -25,9 +31,9 @@ class EngineP2PConnection:
 
     def __init__(self, engine: 'Engine'):
         self.engine: Engine = engine
-        self.p2p_conn_ctx: Dict[str, zmq.asyncio.Context] = {}
-        self.p2p_sender: Dict[str, zmq.asyncio.Socket] = {}
-        self.p2p_receiver: Dict[str, zmq.asyncio.Socket] = {}
+        self.p2p_conn_ctx: dict[str, zmq.asyncio.Context] = {}
+        self.p2p_sender: dict[str, zmq.asyncio.Socket] = {}
+        self.p2p_receiver: dict[str, zmq.asyncio.Socket] = {}
 
         self.use_unique_kvtransfer_engine = os.environ.get('LMDEPLOY_USE_UNIQUE_KVTRANSFER_ENGINE', False)
 
@@ -44,7 +50,7 @@ def p2p_initialize(self, init_request: DistServeInitRequest):
         self.p2p_sender[init_request.remote_engine_id] = sender
         self.p2p_receiver[init_request.remote_engine_id] = receiver
 
-        kvtransfer_endpoint_info: List[DistServeKVTransferEndpointInfo] = self.engine.executor.p2p_initialize(
+        kvtransfer_endpoint_info: list[DistServeKVTransferEndpointInfo] = self.engine.executor.p2p_initialize(
             init_request)
 
         return DistServeInitResponse(engine_endpoint_info=DistServeEngineEndpointInfo(zmq_address=zmq_address),
diff --git a/lmdeploy/pytorch/disagg/conn/protocol.py b/lmdeploy/pytorch/disagg/conn/protocol.py
index aa47789497..2f6f054577 100644
--- a/lmdeploy/pytorch/disagg/conn/protocol.py
+++ b/lmdeploy/pytorch/disagg/conn/protocol.py
@@ -1,11 +1,14 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import enum
-from typing import List, Optional
 
 from pydantic import BaseModel
 
-from lmdeploy.pytorch.disagg.config import (DistServeEngineConfig, DistServeNVLinkConfig, DistServeRDMAConfig,
-                                            DistServeTCPConfig)
+from lmdeploy.pytorch.disagg.config import (
+    DistServeEngineConfig,
+    DistServeNVLinkConfig,
+    DistServeRDMAConfig,
+    DistServeTCPConfig,
+)
 
 
 class MigrationProtocol(enum.Enum):
@@ -39,11 +42,11 @@ class DistServeInitRequest(BaseModel):
 
     protocol: MigrationProtocol
 
-    rank: Optional[int] = None
+    rank: int | None = None
 
-    tcp_config: Optional[DistServeTCPConfig] = None
-    rdma_config: Optional[DistServeRDMAConfig] = None
-    nvlink_config: Optional[DistServeNVLinkConfig] = None
+    tcp_config: DistServeTCPConfig | None = None
+    rdma_config: DistServeRDMAConfig | None = None
+    nvlink_config: DistServeNVLinkConfig | None = None
 
 
 class DistServeEngineEndpointInfo(BaseModel):
@@ -63,14 +66,14 @@ class DistServeInitResponse(BaseModel):
     # To ensure generality (where endpoint_info can be initialization information
     # for different media such as RDMA, NVLink, etc.), we use a string (str) to
     # store this information.
-    kvtransfer_endpoint_info: List[DistServeKVTransferEndpointInfo]
+    kvtransfer_endpoint_info: list[DistServeKVTransferEndpointInfo]
 
 
 class DistServeConnectionRequest(BaseModel):
     protocol: MigrationProtocol
     remote_engine_id: str
     remote_engine_endpoint_info: DistServeEngineEndpointInfo
-    remote_kvtransfer_endpoint_info: List[DistServeKVTransferEndpointInfo]
+    remote_kvtransfer_endpoint_info: list[DistServeKVTransferEndpointInfo]
 
 
 class DistServeConnectionResponse(BaseModel):
@@ -83,7 +86,7 @@ class MigrationRequest(BaseModel):
     remote_engine_id: str
     remote_session_id: int
     remote_token_id: int
-    remote_block_ids: List[int]
+    remote_block_ids: list[int]
 
     is_dummy_prefill: bool = False
 
diff --git a/lmdeploy/pytorch/disagg/conn/proxy_conn.py b/lmdeploy/pytorch/disagg/conn/proxy_conn.py
index a07d281248..5ab9c2ff06 100644
--- a/lmdeploy/pytorch/disagg/conn/proxy_conn.py
+++ b/lmdeploy/pytorch/disagg/conn/proxy_conn.py
@@ -3,16 +3,20 @@
 import enum
 import os
 from collections import defaultdict
-from typing import Dict, Set, Tuple
 
 import aiohttp
 import requests
 
 from lmdeploy.logger import get_logger
 from lmdeploy.pytorch.disagg.config import DistServeEngineConfig, EngineRole
-from lmdeploy.pytorch.disagg.conn.protocol import (DistServeCacheFreeRequest, DistServeConnectionRequest,
-                                                   DistServeConnectionResponse, DistServeDropConnectionRequest,
-                                                   DistServeInitRequest, DistServeInitResponse)
+from lmdeploy.pytorch.disagg.conn.protocol import (
+    DistServeCacheFreeRequest,
+    DistServeConnectionRequest,
+    DistServeConnectionResponse,
+    DistServeDropConnectionRequest,
+    DistServeInitRequest,
+    DistServeInitResponse,
+)
 from lmdeploy.pytorch.disagg.messages import PDConnectionMessage
 
 logger = get_logger('lmdeploy')
@@ -65,19 +69,19 @@ class PDConnectionPool:
     def __init__(self):
         # all prefill and decode instances
         # TODO (JimyMa): Maybe encoding instances
-        self.prefill_endpoints: Set[str] = set()
-        self.decode_endpoints: Set[str] = set()
+        self.prefill_endpoints: set[str] = set()
+        self.decode_endpoints: set[str] = set()
 
         # Links of PD Connection.
-        self.pool: Dict[Tuple[str, str], PDConnectionState] = {}
+        self.pool: dict[tuple[str, str], PDConnectionState] = {}
 
         # put migrating session to `self.migration_session_shelf` for increasing fault tolerance
         # if a session is finished, then pop it from `self.migration_session_shelf`
         # if a decode instance is disconnected, then gc all blocks of these sessions in prefill instance.
-        self.migration_session_shelf: Dict[str, Set[int]] = defaultdict(set)
+        self.migration_session_shelf: dict[str, set[int]] = defaultdict(set)
 
         # conn_perform handler queue
-        self.waiting_conn: asyncio.Queue[Tuple[PDConnectionMessage, asyncio.Event]] = (asyncio.Queue())
+        self.waiting_conn: asyncio.Queue[tuple[PDConnectionMessage, asyncio.Event]] = (asyncio.Queue())
 
         # conn Registry Lock
         self.conn_lock = asyncio.Lock()
@@ -112,10 +116,10 @@ def dereg_instance(self, endpoint: str):
             # TODO(JimyMa): handle side-effect by kvcache migration
             self.decode_endpoints.remove(endpoint)
 
-    def shelf_prefill_session(self, conn_key: Tuple[str, str], session_id: int):
+    def shelf_prefill_session(self, conn_key: tuple[str, str], session_id: int):
         self.migration_session_shelf[conn_key].add(session_id)
 
-    def unshelf_prefill_session(self, conn_key: Tuple[str, str], session_id: int):
+    def unshelf_prefill_session(self, conn_key: tuple[str, str], session_id: int):
         self.migration_session_shelf[conn_key].remove(session_id)
 
     async def connect(self, conn_req: PDConnectionMessage):
@@ -264,11 +268,11 @@ def is_connected(self, p_url: str, d_url: str):
             return False
         return link.status == PDConnectionStatus.Connected
 
-    def drop(self, pd_key: Tuple[str, str]):
+    def drop(self, pd_key: tuple[str, str]):
         left = pd_key[0]
         right = pd_key[1]
 
-        def cache_free(server_endpoint, cache_free_request: DistServeCacheFreeRequest) -> Dict:
+        def cache_free(server_endpoint, cache_free_request: DistServeCacheFreeRequest) -> dict:
             try:
                 requests.post(get_server_api(server_endpoint, 'distserve/free_cache'),
                               json=cache_free_request.model_dump(mode='json'))
diff --git a/lmdeploy/pytorch/disagg/messages.py b/lmdeploy/pytorch/disagg/messages.py
index cc29c67b8e..e4c7c0cf36 100644
--- a/lmdeploy/pytorch/disagg/messages.py
+++ b/lmdeploy/pytorch/disagg/messages.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import List, Optional, Tuple
 
 from pydantic import BaseModel
 
@@ -11,7 +10,7 @@ class MigrationExecutionBatch(BaseModel):
     """Input of the Migration."""
 
     protocol: MigrationProtocol
-    requests: List[Tuple[str, List[Tuple[int, int]]]] = []
+    requests: list[tuple[str, list[tuple[int, int]]]] = []
 
 
 class AssignmentInstruct(BaseModel):
@@ -26,16 +25,16 @@ class MigrationAssignment(BaseModel):
     """Migration Assignment."""
     protocol: MigrationProtocol
     remote_engine_id: str
-    batch: List[AssignmentInstruct]
+    batch: list[AssignmentInstruct]
 
 
 class PDConnectionMessage(BaseModel):
     p_url: str
     d_url: str
     protocol: MigrationProtocol = MigrationProtocol.RDMA
-    tcp_config: Optional[DistServeTCPConfig] = None
-    rdma_config: Optional[DistServeRDMAConfig] = None
-    nvlink_config: Optional[DistServeNVLinkConfig] = None
+    tcp_config: DistServeTCPConfig | None = None
+    rdma_config: DistServeRDMAConfig | None = None
+    nvlink_config: DistServeNVLinkConfig | None = None
 
 
 class DistServeRegisterMRMessage(BaseModel):
diff --git a/lmdeploy/pytorch/distributed.py b/lmdeploy/pytorch/distributed.py
index ccdaa62060..7203c7883e 100644
--- a/lmdeploy/pytorch/distributed.py
+++ b/lmdeploy/pytorch/distributed.py
@@ -1,7 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from dataclasses import dataclass
 from datetime import timedelta
-from typing import List, Optional
 
 import torch
 from torch import distributed as dist
@@ -18,8 +17,8 @@ class DistGroup:
     rank: int = 0
     cpu_group: dist.ProcessGroup = None
     gpu_group: dist.ProcessGroup = None
-    cpu_groups: List[dist.ProcessGroup] = None
-    gpu_groups: List[dist.ProcessGroup] = None
+    cpu_groups: list[dist.ProcessGroup] = None
+    gpu_groups: list[dist.ProcessGroup] = None
     gpu_gather_group: dist.ProcessGroup = None
 
     def close(self):
@@ -197,7 +196,7 @@ class DistContext:
 
     cpu_group: dist.ProcessGroup = None
     ep_gpu_group: dist.ProcessGroup = None
-    ep_gpu_groups: List[dist.ProcessGroup] = None
+    ep_gpu_groups: list[dist.ProcessGroup] = None
     dist_config: DistConfig = None
 
     @classmethod
@@ -303,7 +302,7 @@ def get_world_rank():
     return world_size, rank
 
 
-def get_tp_world_rank(layer_type: Optional[str] = None):
+def get_tp_world_rank(layer_type: str | None = None):
     ctx = get_dist_manager().current_context()
     if layer_type is None:
         return ctx.dist_config.tp, ctx.tp_group.rank
@@ -416,8 +415,8 @@ def reduce_scatter(output, input_list, op=ReduceOp.SUM, group='tp', async_op=Fal
 
 
 def gather_by_tp_sizes(x: torch.Tensor,
-                       tp_sizes: List[int],
-                       group: Optional[dist.ProcessGroup] = None,
+                       tp_sizes: list[int],
+                       group: dist.ProcessGroup | None = None,
                        async_op: bool = False):
     """Gather input."""
     assert all(size >= 0 for size in tp_sizes), f'Invalid tp sizes: {tp_sizes}'
@@ -430,7 +429,7 @@ def gather_by_tp_sizes(x: torch.Tensor,
     return new_x
 
 
-def reduce_scatter_by_tp_sizes(out: torch.Tensor, rank: int, tp_sizes: List[int], group: dist.ProcessGroup):
+def reduce_scatter_by_tp_sizes(out: torch.Tensor, rank: int, tp_sizes: list[int], group: dist.ProcessGroup):
     """Reduce scatter."""
     attn_tp = get_dist_manager().current_config().attn_tp
     outs = list(out.split(tp_sizes, -2))
diff --git a/lmdeploy/pytorch/engine/base.py b/lmdeploy/pytorch/engine/base.py
index cfe34327ba..590a1ec82a 100644
--- a/lmdeploy/pytorch/engine/base.py
+++ b/lmdeploy/pytorch/engine/base.py
@@ -1,6 +1,9 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from lmdeploy.pytorch.disagg.conn.protocol import (DistServeConnectionRequest, DistServeDropConnectionRequest,
-                                                   DistServeInitRequest)
+from lmdeploy.pytorch.disagg.conn.protocol import (
+    DistServeConnectionRequest,
+    DistServeDropConnectionRequest,
+    DistServeInitRequest,
+)
 
 
 class EngineBase:
diff --git a/lmdeploy/pytorch/engine/cache_engine.py b/lmdeploy/pytorch/engine/cache_engine.py
index 302465192f..475343e7d8 100644
--- a/lmdeploy/pytorch/engine/cache_engine.py
+++ b/lmdeploy/pytorch/engine/cache_engine.py
@@ -2,8 +2,9 @@
 # modify from: https://github.com/vllm-project/vllm
 import json
 import math
+from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Dict, List, Literal, Optional, Sequence, Tuple
+from typing import Literal
 
 import torch
 
@@ -11,13 +12,17 @@
 from lmdeploy.pytorch.disagg.backend.backend import MIGRATION_BACKENDS
 from lmdeploy.pytorch.disagg.backend.base import MigrationBackendImpl
 from lmdeploy.pytorch.disagg.conn.protocol import DistServeInitRequest, DistServeKVTransferEndpointInfo
-from lmdeploy.pytorch.disagg.messages import (AssignmentInstruct, DistServeRegisterMRMessage, MigrationAssignment,
-                                              MigrationExecutionBatch)
+from lmdeploy.pytorch.disagg.messages import (
+    AssignmentInstruct,
+    DistServeRegisterMRMessage,
+    MigrationAssignment,
+    MigrationExecutionBatch,
+)
 from lmdeploy.utils import get_logger
 
 from ..config import CacheConfig, ModelConfig
 
-KVCache = Tuple[torch.Tensor, torch.Tensor]
+KVCache = tuple[torch.Tensor, torch.Tensor]
 
 logger = get_logger('lmdeploy')
 
@@ -30,7 +35,7 @@ def round_up(x: int, alignment: int) -> int:
 @dataclass
 class CacheDesc:
     """Cache description."""
-    shape: List[int]
+    shape: list[int]
     dtype: torch.dtype
     alignment: int = 256
 
@@ -98,7 +103,7 @@ def __init__(
         self.local_gpu_cache = self.allocate_gpu_cache()
         self.local_cpu_cache = self.allocate_cpu_cache()
 
-        self.migration_backend_impl: Optional[MigrationBackendImpl] = None
+        self.migration_backend_impl: MigrationBackendImpl | None = None
 
         # Initialize the stream for caching operations.
         self.cache_stream = cache_stream or torch.cuda.Stream()
@@ -238,7 +243,7 @@ def get_quant_cache_descs(cls, k_cache_desc: CacheDesc, v_cache_desc: CacheDesc,
         return [key_scale_zero_desc, val_scale_zero_desc]
 
     @classmethod
-    def get_custom_cache_descs(cls, model_config: ModelConfig, cache_config: CacheConfig) -> List[CacheDesc]:
+    def get_custom_cache_descs(cls, model_config: ModelConfig, cache_config: CacheConfig) -> list[CacheDesc]:
         """Get custom cache descs."""
         if len(model_config.cache_shapes) == 0:
             return []
@@ -310,7 +315,7 @@ def allocate_cpu_cache(self):
         return self.local_cpu_cache
 
     @staticmethod
-    def get_custom_cache_shape_impl(num_layers: int, num_blocks: int, block_size: int, shape: List[int]):
+    def get_custom_cache_shape_impl(num_layers: int, num_blocks: int, block_size: int, shape: list[int]):
         """Get single block shape."""
         return (num_layers, num_blocks, block_size, *shape)
 
@@ -335,13 +340,13 @@ def allocate_custom_cache(self, device: str):
         return custom_caches
 
     @torch.inference_mode()
-    def _swap(self, src: List[torch.Tensor], dst: List[torch.Tensor], src_to_dst: Dict[int, int]):
+    def _swap(self, src: list[torch.Tensor], dst: list[torch.Tensor], src_to_dst: dict[int, int]):
         """Move caches from src memory to dst memory.
 
         Args:
-            src (List[KVCache]): Source cache.
-            dst (List[KVCache]): Destination cache.
-            src_to_dst (Dict[int, int]): Map between src and dst.
+            src (list[torch.Tensor]): Source cache.
+            dst (list[torch.Tensor]): Destination cache.
+            src_to_dst (dict[int, int]): Map between src and dst.
         """
         BLOCKS_PER_COPY = 2
         num_copy = len(src_to_dst)
@@ -357,19 +362,19 @@ def _swap(self, src: List[torch.Tensor], dst: List[torch.Tensor], src_to_dst: Di
                     dcache.index_copy_(1, didx, sdata.to(dcache.device))
             self.events.record(stream=self.cache_stream)
 
-    def swap_in(self, src_to_dst: Dict[int, int]) -> None:
+    def swap_in(self, src_to_dst: dict[int, int]) -> None:
         """Move cache from Host to Device.
 
         Args:
-            src_to_dst (Dict[int, int]): Map between src and dst.
+            src_to_dst (dict[int, int]): Map between src and dst.
         """
         self._swap([self.full_cpu_cache], [self.full_gpu_cache], src_to_dst)
 
-    def swap_out(self, src_to_dst: Dict[int, int]) -> None:
+    def swap_out(self, src_to_dst: dict[int, int]) -> None:
         """Move cache from Device to Host.
 
         Args:
-            src_to_dst (Dict[int, int]): Map between src and dst.
+            src_to_dst (dict[int, int]): Map between src and dst.
         """
         self._swap([self.full_gpu_cache], [self.full_cpu_cache], src_to_dst)
 
@@ -417,7 +422,7 @@ def p2p_initialize(self, migration_init_request: DistServeInitRequest) -> DistSe
                                                        migration_init_request.remote_engine_id,
                                                        migration_init_request.protocol)))
 
-    def p2p_connect(self, remote_engine_id: str, migration_conn_request: List[DistServeKVTransferEndpointInfo]):
+    def p2p_connect(self, remote_engine_id: str, migration_conn_request: list[DistServeKVTransferEndpointInfo]):
         self.migration_backend_impl.p2p_connect(remote_engine_id, migration_conn_request[self.tp_rank])
 
     async def migrate(self, migration_execution_inputs: MigrationExecutionBatch):
@@ -434,7 +439,7 @@ def get_assignment_batch(mr_key, block_ids, assignment_len, layer_stride, remote
                 for block_id in block_ids
             ]
 
-        assignment_batch: List[Tuple[str, int, int, int]] = []  # mr_key, target, source, offset
+        assignment_batch: list[tuple[str, int, int, int]] = []  # mr_key, target, source, offset
         for migration_exe_req in migration_execution_inputs.requests:
             remote_engine_id = migration_exe_req[0]
             blocks_to_migration = migration_exe_req[1]
@@ -466,7 +471,7 @@ def __init__(self, cache_config: CacheConfig):
                                                                  device='cuda')
 
     @staticmethod
-    def allocate_caches(num_caches: int, state_shapes: List[Tuple[Tuple[int], torch.dtype]], device: torch.device):
+    def allocate_caches(num_caches: int, state_shapes: list[tuple[tuple[int], torch.dtype]], device: torch.device):
         """Allocate cache implement."""
 
         if len(state_shapes) == 0 or num_caches == 0:
@@ -492,11 +497,11 @@ def allocate_caches(num_caches: int, state_shapes: List[Tuple[Tuple[int], torch.
         return mem_pool, caches
 
     @staticmethod
-    def get_cache_state_size(state_shapes: List[Tuple[Tuple[int], torch.dtype]]) -> int:
+    def get_cache_state_size(state_shapes: list[tuple[tuple[int], torch.dtype]]) -> int:
         """Get the required cache size of the state cache.
 
         Args:
-            state_shapes (List[Tuple[Tuple[int], torch.dtype]]): The shapes and dtypes of the states.
+            state_shapes (list[tuple[tuple[int], torch.dtype]]): The shapes and dtypes of the states.
 
         Return:
             int: Required memory size in bytes.
diff --git a/lmdeploy/pytorch/engine/config_builder.py b/lmdeploy/pytorch/engine/config_builder.py
index b97ec46d32..7c7ab6c3d0 100644
--- a/lmdeploy/pytorch/engine/config_builder.py
+++ b/lmdeploy/pytorch/engine/config_builder.py
@@ -3,8 +3,14 @@
 import os
 
 from lmdeploy.messages import PytorchEngineConfig, SpeculativeConfig
-from lmdeploy.pytorch.config import (BackendConfig, CacheConfig, DistConfig, MiscConfig, SchedulerConfig,
-                                     SpecDecodeConfig)
+from lmdeploy.pytorch.config import (
+    BackendConfig,
+    CacheConfig,
+    DistConfig,
+    MiscConfig,
+    SchedulerConfig,
+    SpecDecodeConfig,
+)
 from lmdeploy.utils import get_logger, get_max_batch_size, get_model
 
 
diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py
index 71f061f5ef..b7d3bd7f84 100644
--- a/lmdeploy/pytorch/engine/engine.py
+++ b/lmdeploy/pytorch/engine/engine.py
@@ -3,7 +3,7 @@
 import gc
 import os
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Union
+from typing import Any
 
 import numpy as np
 import torch
@@ -11,8 +11,11 @@
 from lmdeploy.messages import PytorchEngineConfig, RequestMetrics, ResponseType, SpeculativeConfig
 from lmdeploy.pytorch.disagg.config import EngineRole
 from lmdeploy.pytorch.disagg.conn.engine_conn import EngineP2PConnection
-from lmdeploy.pytorch.disagg.conn.protocol import (DistServeConnectionRequest, DistServeDropConnectionRequest,
-                                                   DistServeInitRequest)
+from lmdeploy.pytorch.disagg.conn.protocol import (
+    DistServeConnectionRequest,
+    DistServeDropConnectionRequest,
+    DistServeInitRequest,
+)
 from lmdeploy.utils import get_logger, get_model
 
 from ..adapter.adapter import AdapterManager
@@ -28,7 +31,7 @@
 
 logger = get_logger('lmdeploy')
 
-SeqList = List[SchedulerSequence]
+SeqList = list[SchedulerSequence]
 
 
 @dataclass
@@ -37,7 +40,7 @@ class InferOutput:
 
     session_id: int
     resp: Response
-    token_ids: Union[np.ndarray, List[int]]
+    token_ids: np.ndarray | list[int]
     meta: Any = None
     finish: bool = False
     logits: torch.Tensor = None
@@ -45,7 +48,7 @@ class InferOutput:
 
     # send cache blocks back for migration in Disaggregated LLM Serving
     # when Prefill Engine is Done.
-    cache_block_ids: List[int] = None
+    cache_block_ids: list[int] = None
 
     # for logging
     req_metrics: RequestMetrics = None
@@ -229,7 +232,7 @@ def from_pretrained(cls,
             speculative_config=speculative_config,
         )
 
-    def _download_adapters(self, adapters: Dict[str, str], engine_config: PytorchEngineConfig):
+    def _download_adapters(self, adapters: dict[str, str], engine_config: PytorchEngineConfig):
         """Download adapters."""
         download_dir = engine_config.download_dir
         revision = engine_config.revision
@@ -274,7 +277,7 @@ def _get_max_session_len(self):
             session_len = min(max_tokens, session_len)
         return session_len
 
-    def _on_add_session(self, reqs: List[Request], **kwargs):
+    def _on_add_session(self, reqs: list[Request], **kwargs):
         """On add session callback."""
         for req in reqs:
             session_id = req.data['session_id']
@@ -286,7 +289,7 @@ def _on_add_session(self, reqs: List[Request], **kwargs):
             if resp:
                 self._response(req.resp, resp_type)
 
-    def _on_stop_session(self, reqs: List[Request], **kwargs):
+    def _on_stop_session(self, reqs: list[Request], **kwargs):
         """On stop session callback."""
         for req in reqs:
             session_id = req.data['session_id']
@@ -305,7 +308,7 @@ def _on_stop_session(self, reqs: List[Request], **kwargs):
             if resp:
                 self._response(req.resp, resp_type)
 
-    def _on_end_session(self, reqs: List[Request], **kwargs):
+    def _on_end_session(self, reqs: list[Request], **kwargs):
         """On end session callback."""
         for req in reqs:
             session_id = req.data['session_id']
@@ -321,7 +324,7 @@ def _on_end_session(self, reqs: List[Request], **kwargs):
             if resp:
                 self._response(req.resp, resp_type)
 
-    def _on_add_message(self, reqs: List[Request], **kwargs):
+    def _on_add_message(self, reqs: list[Request], **kwargs):
         """On add message callback."""
         valid_reqs = []
         for req in reqs:
@@ -359,7 +362,7 @@ def _on_add_message(self, reqs: List[Request], **kwargs):
         if len(valid_reqs) > 0:
             self._add_message(valid_reqs)
 
-    def _add_message(self, reqs: List[Request]):
+    def _add_message(self, reqs: list[Request]):
 
         def __update_max_new_tokens(msg):
             """Update max new tokens."""
@@ -440,7 +443,7 @@ def sleep(self, level: int = 1):
         """Sleep."""
         self.executor.sleep(level)
 
-    def wakeup(self, tags: Optional[List[str]] = None):
+    def wakeup(self, tags: list[str] | None = None):
         """Wakeup."""
         self.executor.wakeup(tags)
 
diff --git a/lmdeploy/pytorch/engine/engine_instance.py b/lmdeploy/pytorch/engine/engine_instance.py
index c85975425a..c2bbc03420 100644
--- a/lmdeploy/pytorch/engine/engine_instance.py
+++ b/lmdeploy/pytorch/engine/engine_instance.py
@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Any, Dict, List
+from typing import Any
 
 from lmdeploy.messages import EngineOutput, GenerationConfig
 from lmdeploy.utils import get_logger
@@ -11,7 +11,7 @@
 
 logger = get_logger('lmdeploy')
 
-InputMultiModalType = List[Dict[str, Any]]
+InputMultiModalType = list[dict[str, Any]]
 
 
 def _check_resp(resp: Response, state: ResponseType, warning_msg: str = None):
@@ -125,7 +125,7 @@ def _try_add_session(self, session_id: int):
 
     async def async_stream_infer(self,
                                  session_id: int,
-                                 input_ids: List[int],
+                                 input_ids: list[int],
                                  gen_config: GenerationConfig = None,
                                  multimodal: InputMultiModalType = None,
                                  adapter_name: str = None,
@@ -134,13 +134,13 @@ async def async_stream_infer(self,
 
         Args:
             session_id (int): The session id.
-            input_ids (List[int]): The input token ids.
+            input_ids (list[int]): The input token ids.
             gen_config (GenerationConfig): The sampling parameters.
             adapter_name (str): The lora adapter name.
 
         Yields:
             int: Error flags. 0 if success.
-            List[int]: The streaming output tokens.
+            list[int]: The streaming output tokens.
             int: The number of the output tokens.
         """
         if len(input_ids) > self.max_input_len:
@@ -210,7 +210,7 @@ async def async_stream_infer(self,
 
     async def async_infer(self,
                           session_id: int,
-                          input_ids: List[int] = None,
+                          input_ids: list[int] = None,
                           multimodal: InputMultiModalType = None,
                           gen_config: GenerationConfig = None,
                           **kwargs):
@@ -218,12 +218,12 @@ async def async_infer(self,
 
         Args:
             session_id (int): The session id.
-            input_ids (List[int]): The input token ids.
+            input_ids (list[int]): The input token ids.
             gen_config (GenerationConfig): The sampling parameters.
 
         Returns:
             int: Error flags. 0 if success.
-            List[int]: The streaming output tokens.
+            list[int]: The streaming output tokens.
             int: The number of the output tokens.
         """
         async for outputs in self.async_stream_infer(session_id,
@@ -239,7 +239,7 @@ async def async_infer(self,
 
     def stream_infer(self,
                      session_id: int,
-                     input_ids: List[int],
+                     input_ids: list[int],
                      multimodal: InputMultiModalType = None,
                      gen_config: GenerationConfig = None,
                      adapter_name: str = None,
@@ -248,13 +248,13 @@ def stream_infer(self,
 
         Args:
             session_id (int): The session id.
-            input_ids (List[int]): The input token ids.
+            input_ids (list[int]): The input token ids.
             gen_config (GenerationConfig): The sampling parameters.
             adapter_name (str): The lora adapter name.
 
         Yields:
             int: Error flags. 0 if success.
-            List[int]: The streaming output tokens.
+            list[int]: The streaming output tokens.
             int: The number of the output tokens.
         """
 
@@ -276,7 +276,7 @@ def __call_async():
 
     def infer(self,
               session_id: int,
-              input_ids: List[int] = None,
+              input_ids: list[int] = None,
               multimodal: InputMultiModalType = None,
               gen_config: GenerationConfig = None,
               **kwargs):
@@ -284,12 +284,12 @@ def infer(self,
 
         Args:
             session_id (int): The session id.
-            input_ids (List[int]): The input token ids.
+            input_ids (list[int]): The input token ids.
             gen_config (GenerationConfig): The sampling parameters.
 
         Returns:
             int: Error flags. 0 if success.
-            List[int]: The streaming output tokens.
+            list[int]: The streaming output tokens.
             int: The number of the output tokens.
         """
         return self.req_sender.run_until_complete(
diff --git a/lmdeploy/pytorch/engine/engine_loop.py b/lmdeploy/pytorch/engine/engine_loop.py
index d0b6a5e2d6..72956611a4 100644
--- a/lmdeploy/pytorch/engine/engine_loop.py
+++ b/lmdeploy/pytorch/engine/engine_loop.py
@@ -3,7 +3,7 @@
 import logging
 import time
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple
+from typing import TYPE_CHECKING, Any, Optional
 
 import numpy as np
 import torch
@@ -83,7 +83,7 @@ class EngineLoopConfig:
     This config is added for Dependency Injection
     """
     role: EngineRole
-    num_speculative_tokens: Optional[int] = None
+    num_speculative_tokens: int | None = None
     enable_metrics: bool = False
     enable_transfer_obj_ref: bool = False
 
@@ -123,7 +123,7 @@ def __init__(self,
         self.engine_conn = engine_conn
 
         # tasks and control events
-        self.tasks: Set[asyncio.Task] = set()
+        self.tasks: set[asyncio.Task] = set()
         self.stop_event = asyncio.Event()
         self.resp_queue = asyncio.Queue()
         self.forward_event = CounterEvent()
@@ -141,7 +141,7 @@ async def preprocess_loop(self):
             self.has_runable_event.set()
 
     @staticmethod
-    def _log_resps(outputs: List[InferOutput]):
+    def _log_resps(outputs: list[InferOutput]):
         """Log resps."""
         if logger.level <= logging.DEBUG:
             session_ids = [out.session_id for out in outputs]
@@ -166,7 +166,7 @@ def _send_resp(self, out: InferOutput):
                                 logprobs=logprobs))
 
     @staticmethod
-    def _update_logprobs(step_outputs: List[InferOutput]):
+    def _update_logprobs(step_outputs: list[InferOutput]):
         for out in step_outputs:
             cur_logprobs = out.logprobs
             if cur_logprobs is None:
@@ -183,7 +183,7 @@ def _update_logprobs(step_outputs: List[InferOutput]):
             logprobs = out.resp.data['logprobs']
             logprobs.append(cur_logprobs)
 
-    def _send_resps(self, step_outputs: List[InferOutput]):
+    def _send_resps(self, step_outputs: list[InferOutput]):
         """Send response callback."""
         self._log_resps(step_outputs)
         self._update_logprobs(step_outputs)
@@ -218,7 +218,7 @@ def _make_infer_outputs(
     ):
         """Make infer output."""
 
-        def __get_logit(msg, logits: torch.Tensor, seq_length: List[int], idx: int):
+        def __get_logit(msg, logits: torch.Tensor, seq_length: list[int], idx: int):
             logit = logits.split(seq_length)[idx]
             if len(msg.all_logits) > 0:
                 # for chunked long context
@@ -253,7 +253,7 @@ def __get_logit(msg, logits: torch.Tensor, seq_length: List[int], idx: int):
                                          delta=delta)
 
         # generate output
-        outputs: Dict[int, InferOutput] = dict()
+        outputs: dict[int, InferOutput] = dict()
         for idx, msg in enumerate(running):
             if not is_run[idx]:
                 continue
@@ -310,7 +310,7 @@ async def _main_loop_try_send_next_inputs(self):
     async def _main_loop_get_outputs(
         self,
         running: 'SeqList',
-        forward_inputs: Dict[str, Any],
+        forward_inputs: dict[str, Any],
     ):
         """Get outputs and prefetch."""
         model_inputs = forward_inputs['inputs']
@@ -363,7 +363,7 @@ async def __no_running_warning():
             has_runable_event.set()
 
     def update_running_migration(self, running: 'SeqList', next_token_ids: np.ndarray, stopped: torch.Tensor,
-                                 model_metas: List[Dict[str, Any]]):
+                                 model_metas: list[dict[str, Any]]):
         """Update scheduler."""
         if model_metas is None:
             model_metas = [None] * len(running)
@@ -386,7 +386,7 @@ async def _migration_loop_migrate(self, migration_ready: 'SeqList'):
             if msg.migration_request.is_dummy_prefill:
                 continue
 
-            migration_execution_requests: List[Tuple[int, List[Tuple[int, int]]]] = []
+            migration_execution_requests: list[tuple[int, list[tuple[int, int]]]] = []
             migration_request = msg.migration_request
             prefill_block_ids = migration_request.remote_block_ids
             decode_block_ids = list(self.scheduler.block_manager.get_block_table(msg=msg))
@@ -409,7 +409,7 @@ async def _migration_loop_migrate(self, migration_ready: 'SeqList'):
 
     async def _migration_loop_get_outputs(self, migration_ready: 'SeqList'):
         """Migration loop get outputs."""
-        outputs: Dict[int, InferOutput] = dict()
+        outputs: dict[int, InferOutput] = dict()
         for _, msg in enumerate(migration_ready):
             session_id = msg.session_id
             msg.resp.type = ResponseType.SUCCESS
diff --git a/lmdeploy/pytorch/engine/executor/__init__.py b/lmdeploy/pytorch/engine/executor/__init__.py
index 96c624efb1..a2f0e7e91d 100644
--- a/lmdeploy/pytorch/engine/executor/__init__.py
+++ b/lmdeploy/pytorch/engine/executor/__init__.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from logging import Logger
-from typing import Dict
 
 from lmdeploy.pytorch import envs
 from lmdeploy.pytorch.config import BackendConfig, CacheConfig, DistConfig, MiscConfig, ModelConfig, SpecDecodeConfig
@@ -59,7 +58,7 @@ def build_executor(
     backend_config: BackendConfig,
     dist_config: DistConfig,
     misc_config: MiscConfig,
-    adapters: Dict[str, str] = None,
+    adapters: dict[str, str] = None,
     device_type: str = 'cuda',
     distributed_executor_backend: str = None,
     dtype: str = 'auto',
diff --git a/lmdeploy/pytorch/engine/executor/base.py b/lmdeploy/pytorch/engine/executor/base.py
index b5d560e5a1..77873c2ac5 100644
--- a/lmdeploy/pytorch/engine/executor/base.py
+++ b/lmdeploy/pytorch/engine/executor/base.py
@@ -2,7 +2,7 @@
 # Inspired by vLLM: https://github.com/vllm-project/vllm
 import asyncio
 import contextlib
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 from lmdeploy.pytorch.config import BackendConfig, CacheConfig, DistConfig, MiscConfig, ModelConfig, SpecDecodeConfig
 from lmdeploy.pytorch.disagg.conn.protocol import DistServeInitRequest, DistServeKVTransferEndpointInfo
@@ -23,7 +23,7 @@ def __init__(self,
                  backend_config: BackendConfig,
                  dist_config: DistConfig,
                  misc_config: MiscConfig,
-                 adapters: Dict[str, str] = None,
+                 adapters: dict[str, str] = None,
                  specdecode_config: SpecDecodeConfig = None,
                  device_type: str = 'cuda'):
         """Initialize Executor."""
@@ -78,7 +78,7 @@ async def sleep(self, level: int = 1):
         """Sleep."""
         raise NotImplementedError('Not Implemented.')
 
-    def wakeup(self, tags: Optional[List[str]] = None):
+    def wakeup(self, tags: list[str] | None = None):
         """Wakeup."""
         raise NotImplementedError('Not Implemented.')
 
@@ -120,7 +120,7 @@ def p2p_initialize(self, remote_engine_config: DistServeInitRequest):
         """Init rdma link."""
         raise NotImplementedError('Not implemented')
 
-    def p2p_connect(self, conn_request: List[DistServeKVTransferEndpointInfo]):
+    def p2p_connect(self, conn_request: list[DistServeKVTransferEndpointInfo]):
         """rdma_connect."""
         raise NotImplementedError('Not Implemented')
 
diff --git a/lmdeploy/pytorch/engine/executor/base_worker.py b/lmdeploy/pytorch/engine/executor/base_worker.py
index 40ace7defc..d78ab9867f 100644
--- a/lmdeploy/pytorch/engine/executor/base_worker.py
+++ b/lmdeploy/pytorch/engine/executor/base_worker.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import asyncio
 import gc
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 from lmdeploy.pytorch.backends.selector import get_backend
 from lmdeploy.pytorch.config import BackendConfig, CacheConfig, DistConfig, MiscConfig, ModelConfig, SpecDecodeConfig
@@ -28,7 +28,7 @@ def __init__(
         model_config: ModelConfig,
         dist_config: DistConfig,
         misc_config: MiscConfig,
-        adapters: Dict[str, str] = None,
+        adapters: dict[str, str] = None,
         device_type: str = 'cuda',
         log_level: int = 30,
         specdecode_config: SpecDecodeConfig = None,
@@ -66,7 +66,7 @@ def init_process_group(self, rank: int, master_addr: str = None, master_port: st
         ccl_backend = get_backend(self.device_type).ccl_backend()
         self.dist_ctx = DistContext.build(self.rank, self.dist_config, ccl_backend)
 
-    def pack_output(self, output: Dict):
+    def pack_output(self, output: dict):
         """Pack output."""
         return output
 
@@ -123,7 +123,7 @@ async def sleep(self, level: int = 1):
         """Sleep."""
         await self.model_agent.sleep(level)
 
-    def wakeup(self, tags: Optional[List[str]] = None):
+    def wakeup(self, tags: list[str] | None = None):
         """Wakeup."""
         self.model_agent.wakeup(tags)
 
@@ -175,7 +175,7 @@ def release(self):
     def p2p_initialize(self, init_request: DistServeInitRequest):
         return self.model_agent.cache_engine.p2p_initialize(init_request)
 
-    def p2p_connect(self, remote_engine_id: str, conn_request: List[DistServeKVTransferEndpointInfo]):
+    def p2p_connect(self, remote_engine_id: str, conn_request: list[DistServeKVTransferEndpointInfo]):
         return self.model_agent.cache_engine.p2p_connect(remote_engine_id, conn_request)
 
     async def migrate(self, inputs: MigrationExecutionBatch):
diff --git a/lmdeploy/pytorch/engine/executor/mp_executor.py b/lmdeploy/pytorch/engine/executor/mp_executor.py
index e53d39ef30..9f457eec3c 100644
--- a/lmdeploy/pytorch/engine/executor/mp_executor.py
+++ b/lmdeploy/pytorch/engine/executor/mp_executor.py
@@ -8,7 +8,7 @@
 import struct
 from contextlib import asynccontextmanager, contextmanager
 from multiprocessing.context import SpawnContext
-from typing import Any, Dict, List, Tuple
+from typing import Any
 
 import torch
 import torch.distributed as dist
@@ -224,7 +224,7 @@ def __init__(self,
                  backend_config: BackendConfig,
                  dist_config: DistConfig,
                  misc_config: MiscConfig,
-                 adapters: Dict[str, str] = None,
+                 adapters: dict[str, str] = None,
                  specdecode_config: SpecDecodeConfig = None,
                  device_type: str = 'cuda'):
         """Initialize Executor."""
@@ -247,8 +247,8 @@ def __init__(self,
         self.comm_buf_name = self.comm_buf.name()
 
         logger.info('Creating processes.')
-        self.procs: List[ExecutorProc] = []
-        self.ret_bufs: List[SharedBuffer] = []
+        self.procs: list[ExecutorProc] = []
+        self.ret_bufs: list[SharedBuffer] = []
         for proc_id in range(self.world_size):
             proc = ExecutorProc(proc_id=proc_id, mp_ctx=mp_ctx)
 
@@ -285,8 +285,8 @@ def signal_handler(signum, frame):
 
     def collective_rpc(self,
                        method: str,
-                       args: Tuple[Any] = None,
-                       kwargs: Dict[str, Any] = None,
+                       args: tuple[Any] = None,
+                       kwargs: dict[str, Any] = None,
                        receiver_mask: int = 0xff,
                        return_mask: int = 0xff):
         """Collective rpc."""
@@ -314,8 +314,8 @@ def collective_rpc(self,
 
     async def collective_rpc_async(self,
                                    method: str,
-                                   args: Tuple[Any] = None,
-                                   kwargs: Dict[str, Any] = None,
+                                   args: tuple[Any] = None,
+                                   kwargs: dict[str, Any] = None,
                                    receiver_mask: int = 0xff,
                                    return_mask: int = 0xff):
         """Collective rpc."""
@@ -433,7 +433,7 @@ def __init__(
         dist_config: DistConfig,
         misc_config: MiscConfig,
         specdecode_config: SpecDecodeConfig = None,
-        adapters: Dict[str, str] = None,
+        adapters: dict[str, str] = None,
         device_type: str = 'cuda',
         log_level: int = 30,
     ):
@@ -496,7 +496,7 @@ def _main_loop(
         dist_config: DistConfig,
         misc_config: MiscConfig,
         specdecode_config: SpecDecodeConfig = None,
-        adapters: Dict[str, str] = None,
+        adapters: dict[str, str] = None,
         device_type: str = 'cuda',
         log_level: int = 30,
     ):
@@ -554,7 +554,7 @@ def handle_sigterm(signum, frame):
                 dist.destroy_process_group()
 
     @staticmethod
-    async def _task_wrapper(func, args: List, kwargs: Dict, need_return: bool, ret_buf: SharedBuffer):
+    async def _task_wrapper(func, args: list, kwargs: dict, need_return: bool, ret_buf: SharedBuffer):
         ret = await func(*args, **kwargs)
         if need_return:
             await ret_buf.send_async(ret)
diff --git a/lmdeploy/pytorch/engine/executor/ray_executor.py b/lmdeploy/pytorch/engine/executor/ray_executor.py
index 52eb463e15..96bd489fd1 100644
--- a/lmdeploy/pytorch/engine/executor/ray_executor.py
+++ b/lmdeploy/pytorch/engine/executor/ray_executor.py
@@ -3,7 +3,7 @@
 import contextlib
 import json
 import os
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any
 
 import ray
 import ray.exceptions
@@ -50,12 +50,12 @@ def get_ascend_device_rank_mapping(master_addr):
     rank_table_file = _envs.ascend_rank_table_file
     if not rank_table_file:
         raise ValueError('ASCEND_RANK_TABLE_FILE_PATH is not set')
-    with open(rank_table_file, 'r') as f:
+    with open(rank_table_file) as f:
         rank_table = json.load(f)
     try:
         assert master_addr == rank_table['server_list'][0]['server_id'], 'Master address does not match rank table'
-        rank_mapping: Dict[int, int] = {}
-        worker_ip_by_rank: Dict[int, str] = {}
+        rank_mapping: dict[int, int] = {}
+        worker_ip_by_rank: dict[int, str] = {}
         for server in rank_table['server_list']:
             node_ip = server['server_id']
             for idx, device in enumerate(server['device']):
@@ -82,7 +82,7 @@ def get_ascend_device_rank_mapping(master_addr):
     return rank_mapping, worker_ips, envs
 
 
-def _update_env_cuda_alloc_conf(env_vars: Dict):
+def _update_env_cuda_alloc_conf(env_vars: dict):
     """Update runtime env for CUDA alloc conf."""
     cuda_alloc_conf = os.getenv('PYTORCH_CUDA_ALLOC_CONF', None)
     if cuda_alloc_conf is None:
@@ -105,17 +105,17 @@ def _update_env_cuda_alloc_conf(env_vars: Dict):
     env_vars['PYTORCH_CUDA_ALLOC_CONF'] = cuda_alloc_conf
 
 
-def _update_runtime_envs(runtime_env: Dict):
+def _update_runtime_envs(runtime_env: dict):
     """Update runtime envs."""
     new_envs = _envs.get_all_envs()
-    env_vars: Dict = runtime_env.get('env_vars', {})
+    env_vars: dict = runtime_env.get('env_vars', {})
     env_vars.update(new_envs)
     _update_env_cuda_alloc_conf(env_vars)
     runtime_env['env_vars'] = env_vars
     return runtime_env
 
 
-def _update_runtime_env_nsys(runtime_env: Dict):
+def _update_runtime_env_nsys(runtime_env: dict):
     """Update runtime env for nsys."""
     nsight_env = {
         't': 'cuda,cudnn,cublas,nvtx',
@@ -163,7 +163,7 @@ def __init__(
         model_config: ModelConfig,
         dist_config: DistConfig,
         misc_config: MiscConfig,
-        adapters: Dict[str, str] = None,
+        adapters: dict[str, str] = None,
         device_type: str = 'cuda',
         dtype: str = 'auto',
         log_level: int = 30,
@@ -191,7 +191,7 @@ def set_device(self, local_rank):
         """Set worker local rank."""
         torch.cuda.set_device(local_rank)
 
-    def set_env(self, envs: Dict[str, str]):
+    def set_env(self, envs: dict[str, str]):
         for key, value in envs.items():
             os.environ[key] = value
 
@@ -211,7 +211,7 @@ def warmup_dist(self):
             tmp = torch.empty((1, ), device='cuda')
             all_reduce(tmp, group=group)
 
-    def pack_output(self, output: Dict):
+    def pack_output(self, output: dict):
         """Pack output."""
         return output.to_numpy()
 
@@ -239,7 +239,7 @@ def __init__(
         backend_config: BackendConfig,
         dist_config: DistConfig,
         misc_config: MiscConfig,
-        adapters: Dict[str, str] = None,
+        adapters: dict[str, str] = None,
         device_type: str = 'cuda',
         dtype: str = 'auto',
         specdecode_config: SpecDecodeConfig = None,
@@ -311,8 +311,8 @@ def __init__(
 
     def collective_rpc(self,
                        method: str,
-                       args: Tuple[Any] = None,
-                       kwargs: Dict[str, Any] = None,
+                       args: tuple[Any] = None,
+                       kwargs: dict[str, Any] = None,
                        timeout: float = None):
         """Collective rpc."""
         if args is None:
@@ -357,7 +357,7 @@ def sleep(self, level: int = 1):
         """Sleep."""
         self.collective_rpc('sleep', (level, ))
 
-    def wakeup(self, tags: Optional[List[str]] = None):
+    def wakeup(self, tags: list[str] | None = None):
         """Wakeup."""
         if tags is None or 'kv_cache' in tags:
             self.update_configs()
@@ -514,11 +514,11 @@ def remote_log(self, msg: str):
         handle = ray.get(handle_ref)
         ray.get(self.workers[0].remote_log_end.remote(handle))
 
-    def _sort_workers(self, driver_ip: str, workers: List[RayWorkerWrapper]):
+    def _sort_workers(self, driver_ip: str, workers: list[RayWorkerWrapper]):
         """Sort workers by ip."""
         worker_ips = ray.get([worker.get_node_ip.remote() for worker in workers])
 
-        ip_counts: Dict[str, int] = {}
+        ip_counts: dict[str, int] = {}
         for ip in worker_ips:
             ip_counts[ip] = ip_counts.get(ip, 0) + 1
 
@@ -544,7 +544,7 @@ def sort_by_driver_then_worker_ip(item):
         workers = [item[0] for item in sorted_worker_ip_map]
         return workers
 
-    def _sort_workers_by_ip(self, ips, workers: List[RayWorkerWrapper]):
+    def _sort_workers_by_ip(self, ips, workers: list[RayWorkerWrapper]):
         worker_ips = ray.get([worker.get_node_ip.remote() for worker in workers])
 
         if len(ips) != len(workers):
@@ -661,7 +661,7 @@ def _init_ascend_distributed_environment(self, driver_ip):
     def p2p_initialize(self, init_request: DistServeInitRequest):
         return self.collective_rpc('p2p_initialize', (init_request, ))
 
-    def p2p_connect(self, remote_engine_id: str, conn_request: List[DistServeKVTransferEndpointInfo]):
+    def p2p_connect(self, remote_engine_id: str, conn_request: list[DistServeKVTransferEndpointInfo]):
         """Rdma connect."""
         return self.collective_rpc('p2p_connect', (
             remote_engine_id,
diff --git a/lmdeploy/pytorch/engine/executor/uni_executor.py b/lmdeploy/pytorch/engine/executor/uni_executor.py
index 423ea144a1..34c7412ee6 100644
--- a/lmdeploy/pytorch/engine/executor/uni_executor.py
+++ b/lmdeploy/pytorch/engine/executor/uni_executor.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import asyncio
-from typing import Dict, List
 
 from lmdeploy.pytorch.config import BackendConfig, CacheConfig, DistConfig, MiscConfig, ModelConfig, SpecDecodeConfig
 from lmdeploy.pytorch.devices import DeviceContext
@@ -24,7 +23,7 @@ def __init__(
         cache_config: CacheConfig,
         backend_config: BackendConfig,
         misc_config: MiscConfig,
-        adapters: Dict[str, str] = None,
+        adapters: dict[str, str] = None,
         device_type: str = 'cuda',
         specdecode_config: SpecDecodeConfig = None,
     ):
@@ -122,7 +121,7 @@ def p2p_initialize(self, init_request: DistServeInitRequest):
         """
         return [self.model_agent.cache_engine.p2p_initialize(init_request)]
 
-    def p2p_connect(self, remote_engine_id: str, conn_request: List[DistServeKVTransferEndpointInfo]):
+    def p2p_connect(self, remote_engine_id: str, conn_request: list[DistServeKVTransferEndpointInfo]):
         """rdma_connect."""
         self.model_agent.cache_engine.p2p_connect(remote_engine_id, conn_request)
 
diff --git a/lmdeploy/pytorch/engine/guided_process.py b/lmdeploy/pytorch/engine/guided_process.py
index c1bf7c920f..506ebc74a9 100644
--- a/lmdeploy/pytorch/engine/guided_process.py
+++ b/lmdeploy/pytorch/engine/guided_process.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import json
 import logging
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any
 
 import torch
 import xgrammar as xgr
@@ -13,7 +13,7 @@
 class GuidedDecodingManager:
     processors = {}
 
-    def __init__(self, tokenizer: PreTrainedTokenizerBase, vocab_size: Optional[int]):
+    def __init__(self, tokenizer: PreTrainedTokenizerBase, vocab_size: int | None):
         if vocab_size is None:
             vocab_size = tokenizer.vocab_size
 
@@ -21,15 +21,15 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase, vocab_size: Optional[int]
         self.compiler = xgr.GrammarCompiler(tokenizer_info)
         self.vocab_size = vocab_size
 
-    def get_processors(self, session_ctx: List[Dict[str, Any]],
-                       response_formats: Tuple[Dict]) -> Dict[int, xgr.GrammarMatcher]:
+    def get_processors(self, session_ctx: list[dict[str, Any]],
+                       response_formats: tuple[dict]) -> dict[int, xgr.GrammarMatcher]:
         processors = {}
         for i, _format in enumerate(response_formats):
-            if isinstance(_format, Dict) and _format.get('type', 'text') != 'text':
+            if isinstance(_format, dict) and _format.get('type', 'text') != 'text':
                 schema_type = _format['type']
                 if schema_type == 'json_schema':
                     schema = _format['json_schema']
-                    if isinstance(schema, Dict):
+                    if isinstance(schema, dict):
                         for key in ['json_schema', 'schema']:
                             if key in schema:
                                 schema = json.dumps(schema[key], ensure_ascii=False)
diff --git a/lmdeploy/pytorch/engine/input_process.py b/lmdeploy/pytorch/engine/input_process.py
index 38e4c24e8b..7c54319802 100644
--- a/lmdeploy/pytorch/engine/input_process.py
+++ b/lmdeploy/pytorch/engine/input_process.py
@@ -1,21 +1,21 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 from lmdeploy.pytorch.multimodal.data_type import MultiModalInputs
 
-TypeModelMetas = Dict[str, Any]
+TypeModelMetas = dict[str, Any]
 
-InputMultiModalType = List[Dict[str, Any]]
+InputMultiModalType = list[dict[str, Any]]
 
 
 @dataclass
 class PreprocessInputResult:
     """Results of preprocess input."""
-    input_ids: List[int]
-    input_multimodals: Optional[MultiModalInputs] = None
-    model_metas: Optional[TypeModelMetas] = None
+    input_ids: list[int]
+    input_multimodals: MultiModalInputs | None = None
+    model_metas: TypeModelMetas | None = None
 
 
 class BaseModelInputProcessor(ABC):
@@ -23,7 +23,7 @@ class BaseModelInputProcessor(ABC):
 
     @abstractmethod
     def preprocess_input(self,
-                         input_ids: List[int],
+                         input_ids: list[int],
                          input_mms: InputMultiModalType = None,
                          **kwargs) -> PreprocessInputResult:
         """Preprocess input."""
@@ -34,7 +34,7 @@ class DefaultModelInputProcessor(BaseModelInputProcessor):
     """Default model input processor."""
 
     def preprocess_input(self,
-                         input_ids: List[int],
+                         input_ids: list[int],
                          input_mms: MultiModalInputs = None,
                          **kwargs) -> PreprocessInputResult:
         """Preprocess input."""
diff --git a/lmdeploy/pytorch/engine/inputs_maker.py b/lmdeploy/pytorch/engine/inputs_maker.py
index 506a372250..13cdc799f4 100644
--- a/lmdeploy/pytorch/engine/inputs_maker.py
+++ b/lmdeploy/pytorch/engine/inputs_maker.py
@@ -2,7 +2,7 @@
 import logging
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, Optional
 
 import numpy as np
 import torch
@@ -131,7 +131,7 @@ def next_chunk_size(self):
 
         start = seq.num_history_ids
         end = start + llm_chunk_size
-        out_multimodals: 'MultiModalInputs' = defaultdict(list)
+        out_multimodals: MultiModalInputs = defaultdict(list)
         for modal_type, mm in self.multimodal_iter():
             assert mm.start >= start, 'multimodal data should be sorted by start'
             if mm.start >= end:
@@ -158,7 +158,7 @@ def is_last_chunk(self):
 
     def clear(self):
         """Clear."""
-        self.seq: 'SchedulerSequence' = None
+        self.seq: SchedulerSequence = None
         self.multimodals: MultiModalInputs = defaultdict(list)
         self.next_step: int = 0
         self.max_prefill_num: int = self.max_prefill_token_num
@@ -219,8 +219,8 @@ def __init__(
 
         # running seqs
         # mark the seqs that have been sent to executor
-        self.running_seqs: List['SchedulerSequence'] = []
-        self.to_evict_seqs: List['SchedulerSequence'] = []
+        self.running_seqs: list[SchedulerSequence] = []
+        self.to_evict_seqs: list[SchedulerSequence] = []
 
         # long context chunker
         self.long_context_chunker = LongContextChunker(config.max_prefill_token_num)
@@ -453,8 +453,8 @@ def create_model_inputs_delta(self):
 
         valid_mask = np.array(valid_mask)
         indices_cpu = np.arange(0, batch_size)[valid_mask]
-        valid_seqs: List['SchedulerSequence'] = [self.running_seqs[i] for i in indices_cpu]
-        invalid_seqs: List['SchedulerSequence'] = [self.running_seqs[i] for i in range(batch_size) if not valid_mask[i]]
+        valid_seqs: list[SchedulerSequence] = [self.running_seqs[i] for i in indices_cpu]
+        invalid_seqs: list[SchedulerSequence] = [self.running_seqs[i] for i in range(batch_size) if not valid_mask[i]]
         if len(valid_seqs) == 0:
             return None, valid_seqs, invalid_seqs
 
@@ -498,8 +498,8 @@ def create_model_inputs_delta_valid_only(self):
 
         valid_mask = np.array(valid_mask, dtype=bool)
         indices_cpu = np.arange(0, batch_size)[valid_mask]
-        valid_seqs: List['SchedulerSequence'] = [self.running_seqs[i] for i in indices_cpu]
-        invalid_seqs: List['SchedulerSequence'] = [self.running_seqs[i] for i in range(batch_size) if not valid_mask[i]]
+        valid_seqs: list[SchedulerSequence] = [self.running_seqs[i] for i in indices_cpu]
+        invalid_seqs: list[SchedulerSequence] = [self.running_seqs[i] for i in range(batch_size) if not valid_mask[i]]
 
         num_decode_tokens = self.engine_strategy.get_num_decode_tokens()
         max_q_seqlen = num_decode_tokens
@@ -523,7 +523,7 @@ def create_model_inputs_delta_valid_only(self):
 
         return output, valid_seqs, invalid_seqs
 
-    def update_running_seqs(self, running: 'SeqList', inputs: Optional[ModelInputs]):
+    def update_running_seqs(self, running: 'SeqList', inputs: ModelInputs | None):
         """Update running seqs."""
         if self.config.role == EngineRole.Prefill:
             # p node will not update running seqs
diff --git a/lmdeploy/pytorch/engine/model_agent/__init__.py b/lmdeploy/pytorch/engine/model_agent/__init__.py
index 7cbf3fb33d..083e6a1fe4 100644
--- a/lmdeploy/pytorch/engine/model_agent/__init__.py
+++ b/lmdeploy/pytorch/engine/model_agent/__init__.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict
 
 from lmdeploy.pytorch.config import BackendConfig, CacheConfig, MiscConfig, ModelConfig, SpecDecodeConfig
 from lmdeploy.pytorch.devices import DeviceContext, get_device_manager
@@ -16,7 +15,7 @@ def build_model_agent(
     misc_config: MiscConfig,
     dist_ctx: DistContext = None,
     device_ctx: DeviceContext = None,
-    adapters: Dict[str, str] = None,
+    adapters: dict[str, str] = None,
     specdecode_config: SpecDecodeConfig = None,
 ):
     """Create model agent.
@@ -26,7 +25,7 @@ def build_model_agent(
         cache_config (CacheConfig): config of kv cache
         backend_config (BackendConfig): config of backend devices
         trust_remote_code (bool): To use the remote modeling code or not
-        adapters (Dict): lora adapters
+        adapters (dict): lora adapters
         tp (int): the number of devices to be used in tensor parallelism
         dtype (str): the data type of model weights and activations
         custom_module_map (str): customized nn module map
diff --git a/lmdeploy/pytorch/engine/model_agent/agent.py b/lmdeploy/pytorch/engine/model_agent/agent.py
index f9c2919962..eb31ff379c 100644
--- a/lmdeploy/pytorch/engine/model_agent/agent.py
+++ b/lmdeploy/pytorch/engine/model_agent/agent.py
@@ -5,7 +5,7 @@
 from dataclasses import dataclass, field, fields
 from multiprocessing.reduction import ForkingPickler
 from os import getenv
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 import numpy as np
 import pybase64
@@ -75,13 +75,13 @@ def to_tensor(self):
 class BatchedOutputs:
     next_token_ids: torch.Tensor
     stopped: torch.Tensor
-    stop_pos: Optional[torch.Tensor] = None
-    logits: Optional[torch.Tensor] = None
-    model_metas: List[Dict[str, Any]] = None
-    logprobs: Optional[BatchedLogProbs] = None
+    stop_pos: torch.Tensor | None = None
+    logits: torch.Tensor | None = None
+    model_metas: list[dict[str, Any]] = None
+    logprobs: BatchedLogProbs | None = None
     new_token_timestamp: int = 0
-    extra_outputs: Optional[ExtraOutputs] = None
-    all_routed_experts: Optional[torch.Tensor] = None
+    extra_outputs: ExtraOutputs | None = None
+    all_routed_experts: torch.Tensor | None = None
 
     def to_cpu(self):
         """To cpu."""
@@ -176,7 +176,7 @@ def model_forward(
                 context=context,
             )
             output = model(**input_dict)
-            if not isinstance(output, Dict):
+            if not isinstance(output, dict):
                 output = dict(hidden_states=output)
             # InternVL-3.5-Flash will change the seqlen, model_metas during forward
             if getattr(context, 'is_model_meta_updated', False):
@@ -220,7 +220,7 @@ async def async_wait(self, timeout: float = 0.001):
         return self.all_vals
 
 
-SwapMap = Dict[int, int]
+SwapMap = dict[int, int]
 
 
 @dataclass
@@ -330,7 +330,7 @@ def __init__(
         misc_config: MiscConfig,
         dist_ctx: DistContext,
         device_ctx: DeviceContext,
-        adapters: Dict[str, str] = None,
+        adapters: dict[str, str] = None,
         specdecode_config: SpecDecodeConfig = None,
     ):
 
@@ -413,7 +413,7 @@ def __init__(
         self.step_inputs = StepInputs()
 
         # long context
-        self._prev_chunk_output: Dict = None
+        self._prev_chunk_output: dict = None
 
     @contextmanager
     def all_context(self):
@@ -742,8 +742,8 @@ async def _async_step(
         self,
         inputs: ModelInputs,
         delta: ModelInputsDelta = None,
-        swap_in_map: Dict = None,
-        swap_out_map: Dict = None,
+        swap_in_map: dict = None,
+        swap_out_map: dict = None,
         sampling_inputs: SamplingInputs = None,
         stopping_criteria: StoppingCriteria = None,
         return_logits: bool = False,
@@ -1130,7 +1130,7 @@ async def async_forward(self, inputs: ModelInputs):
         """Model forward.
 
         Args:
-            inputs (Dict): The input data comes from _make_inputs.
+            inputs (dict): The input data comes from _make_inputs.
             swap_in_map (SwapMap): Cache maps to swap in.
             swap_out_map (SwapMap): Cache maps to swap out.
         """
@@ -1173,7 +1173,7 @@ def _construct(item):
             model = self.patched_model.get_model()
             weights = ForkingPickler.loads(pybase64.b64decode(serialized_data))
             if request.load_format == 'flattened_bucket':
-                metadata: List[FlattenedTensorMetadata] = weights['metadata']
+                metadata: list[FlattenedTensorMetadata] = weights['metadata']
                 if metadata:
                     flattened_tensor: torch.Tensor = _construct(weights['flattened_tensor'])
                     bucket = FlattenedTensorBucket(flattened_tensor=flattened_tensor, metadata=metadata)
@@ -1210,7 +1210,7 @@ async def sleep(self, level: int = 1):
         self.state.to_sleep.clear()
 
     @torch.inference_mode()
-    def wakeup(self, tags: Optional[List[str]] = None):
+    def wakeup(self, tags: list[str] | None = None):
         """Wakeup."""
         if tags is None:
             tags = ['weights', 'kv_cache']
diff --git a/lmdeploy/pytorch/engine/mp_engine/base.py b/lmdeploy/pytorch/engine/mp_engine/base.py
index 660c65cf92..a5c16dd967 100644
--- a/lmdeploy/pytorch/engine/mp_engine/base.py
+++ b/lmdeploy/pytorch/engine/mp_engine/base.py
@@ -2,11 +2,14 @@
 import asyncio
 from collections import defaultdict
 from dataclasses import dataclass, field
-from typing import Any, List, Optional
+from typing import Any
 
 from lmdeploy.messages import ResponseType
-from lmdeploy.pytorch.disagg.conn.protocol import (DistServeConnectionRequest, DistServeDropConnectionRequest,
-                                                   DistServeInitRequest)
+from lmdeploy.pytorch.disagg.conn.protocol import (
+    DistServeConnectionRequest,
+    DistServeDropConnectionRequest,
+    DistServeInitRequest,
+)
 from lmdeploy.utils import get_logger
 
 from ..base import EngineBase, EngineInstanceBase
@@ -54,7 +57,7 @@ def sleep(self, level: int):
         """sleep."""
         return self._collective_rpc('sleep', level)
 
-    def wakeup(self, tags: Optional[List[str]] = None):
+    def wakeup(self, tags: list[str] | None = None):
         """Wakeup."""
         return self._collective_rpc('wakeup', tags)
 
diff --git a/lmdeploy/pytorch/engine/mp_engine/base_worker.py b/lmdeploy/pytorch/engine/mp_engine/base_worker.py
index 58bffa825b..0e0fa0fa82 100644
--- a/lmdeploy/pytorch/engine/mp_engine/base_worker.py
+++ b/lmdeploy/pytorch/engine/mp_engine/base_worker.py
@@ -1,11 +1,14 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import asyncio
 from contextlib import asynccontextmanager
-from typing import TYPE_CHECKING, Any, List, Optional
+from typing import TYPE_CHECKING, Any
 
 from lmdeploy.messages import EngineOutput
-from lmdeploy.pytorch.disagg.conn.protocol import (DistServeConnectionRequest, DistServeDropConnectionRequest,
-                                                   DistServeInitRequest)
+from lmdeploy.pytorch.disagg.conn.protocol import (
+    DistServeConnectionRequest,
+    DistServeDropConnectionRequest,
+    DistServeInitRequest,
+)
 from lmdeploy.utils import get_logger
 
 logger = get_logger('lmdeploy')
@@ -101,7 +104,7 @@ def sleep(self, level: int = 1):
         """sleep."""
         return self.engine.sleep(level)
 
-    def wakeup(self, tags: Optional[List[str]] = None):
+    def wakeup(self, tags: list[str] | None = None):
         """Wakeup."""
         return self.engine.wakeup(tags)
 
diff --git a/lmdeploy/pytorch/engine/mp_engine/ray_engine.py b/lmdeploy/pytorch/engine/mp_engine/ray_engine.py
index 8d8d19008d..3c14d3fd0c 100644
--- a/lmdeploy/pytorch/engine/mp_engine/ray_engine.py
+++ b/lmdeploy/pytorch/engine/mp_engine/ray_engine.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import asyncio
-from typing import Dict
 
 import ray
 from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
@@ -84,10 +83,10 @@ async def get_stream_task_result(self, stream_id: int):
         return result, stopped
 
 
-def _update_runtime_envs(runtime_env: Dict):
+def _update_runtime_envs(runtime_env: dict):
     """Update runtime envs."""
     new_envs = _envs.get_all_envs()
-    env_vars: Dict = runtime_env.get('env_vars', {})
+    env_vars: dict = runtime_env.get('env_vars', {})
     env_vars.update(new_envs)
     runtime_env['env_vars'] = env_vars
     return runtime_env
diff --git a/lmdeploy/pytorch/engine/mp_engine/zmq_rpc.py b/lmdeploy/pytorch/engine/mp_engine/zmq_rpc.py
index c958235e84..5d43f5ccf5 100644
--- a/lmdeploy/pytorch/engine/mp_engine/zmq_rpc.py
+++ b/lmdeploy/pytorch/engine/mp_engine/zmq_rpc.py
@@ -2,7 +2,7 @@
 import asyncio
 import inspect
 import pickle
-from typing import Callable, Dict
+from collections.abc import Callable
 from uuid import uuid4
 
 import zmq
@@ -39,7 +39,7 @@ def __init__(self):
         self.context = zmq.Context()
         self.socket = self.context.socket(zmq.ROUTER)
         self.port = self.socket.bind_to_random_port(address)
-        self.methods: Dict[str, Callable] = {}
+        self.methods: dict[str, Callable] = {}
         self.running = False
 
         # streaming
@@ -74,7 +74,7 @@ def send_multipart(self, client_id: bytes, data: bytes):
         except zmq.ZMQError as e:
             logger.error(f'Failed to send message to client[{client_id}]: {e}')
 
-    def call_method_default(self, client_id, method: Callable, request: Dict):
+    def call_method_default(self, client_id, method: Callable, request: dict):
         request_id = request.get('request_id')
         args = request.get('args', [])
         kwargs = request.get('kwargs', {})
@@ -85,7 +85,7 @@ def call_method_default(self, client_id, method: Callable, request: Dict):
             response = dict(success=False, request_id=request_id, error=str(e))
         self.send_multipart(client_id, response)
 
-    async def _method_async_task(self, client_id, request_id, method: Callable, args: tuple, kwargs: Dict):
+    async def _method_async_task(self, client_id, request_id, method: Callable, args: tuple, kwargs: dict):
         """Call method in a task."""
         try:
             result = await method(*args, **kwargs)
@@ -95,7 +95,7 @@ async def _method_async_task(self, client_id, request_id, method: Callable, args
         self.send_multipart(client_id, response)
 
     async def _method_async_streaming_task(self, stream_id: int, request_id: int, client_id: int, method: Callable,
-                                           args: tuple, kwargs: Dict):
+                                           args: tuple, kwargs: dict):
         """Call method in a task for streaming."""
 
         def __send_resp():
@@ -141,7 +141,7 @@ async def get_stream_output(self, stream_id: int):
             raise stream_out['error']
         return result, stopped
 
-    async def call_method_async(self, client_id, method: Callable, request: Dict):
+    async def call_method_async(self, client_id, method: Callable, request: dict):
         """Call method async."""
         request_id = request.get('request_id')
         method_name = request.get('method')
@@ -237,7 +237,7 @@ def __init__(self, port: int = 5555):
         self._listen_task = None
         self.running = False
 
-    def _set_reply_default(self, request_id: int, reply: Dict):
+    def _set_reply_default(self, request_id: int, reply: dict):
         """Default reply handler for sync socket."""
         logger.debug(f'recv reply request_id: {request_id}')
         future: asyncio.Future = self.pending.pop(request_id)
@@ -249,7 +249,7 @@ def _set_reply_default(self, request_id: int, reply: Dict):
         except Exception as e:
             logger.debug(f'Set future failed with exception: {e}')
 
-    def _set_reply(self, reply: Dict):
+    def _set_reply(self, reply: dict):
         request_id = reply['request_id']
         self._set_reply_default(request_id, reply)
 
diff --git a/lmdeploy/pytorch/engine/request.py b/lmdeploy/pytorch/engine/request.py
index 1a8dd2dd9c..86e37fa28c 100644
--- a/lmdeploy/pytorch/engine/request.py
+++ b/lmdeploy/pytorch/engine/request.py
@@ -2,8 +2,9 @@
 import asyncio
 import enum
 import logging
+from collections.abc import Awaitable, Callable, Coroutine
 from dataclasses import dataclass, field
-from typing import Any, Awaitable, Callable, Coroutine, Dict, List
+from typing import Any
 
 from lmdeploy.messages import RequestMetrics, ResponseType
 from lmdeploy.utils import get_logger
@@ -45,7 +46,7 @@ class Request:
     resp: Response = None
 
 
-ReqList = List[Request]
+ReqList = list[Request]
 
 
 def _run_until_complete(future: Awaitable):
@@ -69,7 +70,7 @@ class RequestSender:
     """
     sender_id: int
     manager: 'RequestManager'
-    resp_dict: Dict[int, List[Response]] = field(default_factory=dict)
+    resp_dict: dict[int, list[Response]] = field(default_factory=dict)
 
     @classmethod
     def new(cls, sender_id: int, manager: 'RequestManager'):
@@ -99,7 +100,7 @@ def _req_put(self, reqs: Any):
         """Async rq_que put."""
         self.req_que.put_nowait(reqs)
 
-    def _gather_request(self, req_types: List[RequestType], data: List[Any]):
+    def _gather_request(self, req_types: list[RequestType], data: list[Any]):
         """Gather requests."""
         if self.manager._loop_task is None:
             self.manager.create_loop_task()
@@ -119,7 +120,7 @@ def _gather_request(self, req_types: List[RequestType], data: List[Any]):
             reqs.append(req)
         return resps, reqs
 
-    def batched_send_async(self, req_types: List[RequestType], data: List[Any]):
+    def batched_send_async(self, req_types: list[RequestType], data: list[Any]):
         """Batched send request asynchronize."""
         resps, reqs = self._gather_request(req_types, data)
         self._req_put(reqs)
@@ -166,9 +167,9 @@ class RequestManager:
     """Request manager."""
 
     def __init__(self):
-        self.senders: Dict[int, RequestSender] = dict()
-        self.callbacks: Dict[RequestType, Callable] = dict()
-        self.request_priority: List[RequestType] = [
+        self.senders: dict[int, RequestSender] = dict()
+        self.callbacks: dict[RequestType, Callable] = dict()
+        self.request_priority: list[RequestType] = [
             RequestType.STOP_ENGINE, RequestType.ADD_SESSION, RequestType.STOP_SESSION, RequestType.END_SESSION,
             RequestType.ADD_MESSAGE
         ]
@@ -293,7 +294,7 @@ def has_requests(self):
             return False
         return not self.requests.empty()
 
-    async def get_all_requests(self) -> Dict[RequestType, List[Request]]:
+    async def get_all_requests(self) -> dict[RequestType, list[Request]]:
         """Get all requests in current queue."""
         num_reqs = self.requests.qsize()
         reqs: ReqList = []
@@ -315,7 +316,7 @@ def __proc_reqs(elem):
             __proc_reqs(elem)
 
         # gather requests
-        reqs_by_type: Dict[RequestType, List[Request]] = dict((t, []) for t in RequestType)
+        reqs_by_type: dict[RequestType, list[Request]] = dict((t, []) for t in RequestType)
         for req in reqs:
             reqs_by_type[req.type].append(req)
         return reqs_by_type
@@ -324,7 +325,7 @@ def bind_func(self, req_type: RequestType, callback: Callable):
         """Bind handler for given request type."""
         self.callbacks[req_type] = callback
 
-    def set_request_priority(self, priority: List[RequestType]):
+    def set_request_priority(self, priority: list[RequestType]):
         """Set the priority of request type."""
         self.request_priority = priority
 
diff --git a/lmdeploy/pytorch/envs.py b/lmdeploy/pytorch/envs.py
index 0e44e19ac1..e38892f845 100644
--- a/lmdeploy/pytorch/envs.py
+++ b/lmdeploy/pytorch/envs.py
@@ -1,15 +1,14 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import contextlib
 import os
-from typing import Union
 
 
 def env_to_bool(
     env_var: str,
     default: bool = False,
     *,
-    true_values: Union[set, list] = {'true', '1', 'yes', 'on'},
-    false_values: Union[set, list] = {'false', '0', 'no', 'off'},
+    true_values: set | list = {'true', '1', 'yes', 'on'},
+    false_values: set | list = {'false', '0', 'no', 'off'},
 ):
     """Env to bool."""
     value = os.getenv(env_var)
@@ -80,7 +79,7 @@ def set_envs():
 
     def _patched_get_env(
         env_var: str,
-        default: Union[str, None] = None,
+        default: str | None = None,
     ):
         """Patched get_env."""
         if env_var in os.environ:
diff --git a/lmdeploy/pytorch/kernels/__init__.py b/lmdeploy/pytorch/kernels/__init__.py
index 28897648ed..ae4a278777 100644
--- a/lmdeploy/pytorch/kernels/__init__.py
+++ b/lmdeploy/pytorch/kernels/__init__.py
@@ -1,7 +1,11 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from .w8a8_triton_kernels import (matmul_kernel_dynamic_quant, per_channel_quant, per_token_quant_int8,
-                                  rms_norm_dynamic_quant)
+from .w8a8_triton_kernels import (
+                                  matmul_kernel_dynamic_quant,
+                                  per_channel_quant,
+                                  per_token_quant_int8,
+                                  rms_norm_dynamic_quant,
+)
 
 __all__ = [
     'matmul_kernel_dynamic_quant',
diff --git a/lmdeploy/pytorch/kernels/cuda/apply_rotary_pos_emb.py b/lmdeploy/pytorch/kernels/cuda/apply_rotary_pos_emb.py
index ee0765257f..6ed063ef60 100644
--- a/lmdeploy/pytorch/kernels/cuda/apply_rotary_pos_emb.py
+++ b/lmdeploy/pytorch/kernels/cuda/apply_rotary_pos_emb.py
@@ -129,7 +129,7 @@ def apply_rotary_pos_emb(q: Tensor,
         k_embed (Tensor): output k, can be same as k
 
     Returns:
-        Tuple[Tensor, Tensor]: Embedded query and key.
+        tuple[Tensor, Tensor]: Embedded query and key.
     """
     if cos.device != q.device:
         cos = cos.to(device=q.device)
diff --git a/lmdeploy/pytorch/kernels/cuda/blocked_fp8_fused_moe.py b/lmdeploy/pytorch/kernels/cuda/blocked_fp8_fused_moe.py
index ad5804a4ac..b504234ba5 100644
--- a/lmdeploy/pytorch/kernels/cuda/blocked_fp8_fused_moe.py
+++ b/lmdeploy/pytorch/kernels/cuda/blocked_fp8_fused_moe.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 # modify from: https://github.com/vllm-project/vllm
-from typing import Callable
+from collections.abc import Callable
 
 import torch
 import triton
diff --git a/lmdeploy/pytorch/kernels/cuda/blocked_gemm_fp8.py b/lmdeploy/pytorch/kernels/cuda/blocked_gemm_fp8.py
index 1c826f097e..e6717b5ef4 100644
--- a/lmdeploy/pytorch/kernels/cuda/blocked_gemm_fp8.py
+++ b/lmdeploy/pytorch/kernels/cuda/blocked_gemm_fp8.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Optional
 
 import torch
 import triton
@@ -102,7 +101,7 @@ def _quant_fp8_kernel(
         s_ptr += m_id_stride * stride_sm
 
 
-def _quant_fp8_launcher(A: Tensor, group_size: int, out: Tensor, scales: Tensor, scale_fmt: Optional[str] = None):
+def _quant_fp8_launcher(A: Tensor, group_size: int, out: Tensor, scales: Tensor, scale_fmt: str | None = None):
     """Quant online."""
     assert scale_fmt in (None, 'ue8m0')
     round_scale = 1 if scale_fmt == 'ue8m0' else 0
@@ -160,7 +159,7 @@ def quant_fp8(A: Tensor,
               group_size: int,
               dtype: torch.dtype = torch.float8_e4m3fn,
               trans_scale: bool = False,
-              scale_fmt: Optional[str] = None):
+              scale_fmt: str | None = None):
     """Quant fp8."""
     assert A.dim() == 2
     M, K = A.shape
@@ -177,7 +176,7 @@ def quant_fp8(A: Tensor,
 def quant_fp8_tma(A: Tensor,
                   group_size: int,
                   dtype: torch.dtype = torch.float8_e4m3fn,
-                  scale_fmt: Optional[str] = None):
+                  scale_fmt: str | None = None):
     """Quant fp8 tma."""
     from lmdeploy.pytorch.third_party.deep_gemm import ceil_div, get_m_alignment_for_contiguous_layout
     assert A.dim() == 2
diff --git a/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py b/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py
index d03d3ddaf7..2f56a23f45 100644
--- a/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py
+++ b/lmdeploy/pytorch/kernels/cuda/fill_kv_cache.py
@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Literal, Optional
+from typing import Literal
 
 import torch
 import triton
@@ -399,9 +399,9 @@ def _fill_kv_cache_quant_kernel(
 
 
 def fill_kv_cache(k_states: Tensor,
-                  v_states: Optional[Tensor],
+                  v_states: Tensor | None,
                   k_caches: Tensor,
-                  v_caches: Optional[Tensor],
+                  v_caches: Tensor | None,
                   q_start_loc: Tensor,
                   q_seq_length: Tensor,
                   kv_seq_length: Tensor,
@@ -690,32 +690,32 @@ def _fill_kv_cache_blocked_fp8_kernel(
 
 
 def fill_kv_cache_blocked_fp8(k_states: Tensor,
-                              v_states: Optional[Tensor],
+                              v_states: Tensor | None,
                               k_caches: Tensor,
-                              v_caches: Optional[Tensor],
+                              v_caches: Tensor | None,
                               ks_caches: Tensor,
-                              vs_caches: Optional[Tensor],
+                              vs_caches: Tensor | None,
                               cu_seqlen_q: Tensor,
                               kv_seqlens: Tensor,
                               max_q_seqlen: int,
                               block_offsets: Tensor,
                               group_size: int = 128,
                               kv_layout: str = 'bshd',
-                              scale_fmt: Optional[str] = None):
+                              scale_fmt: str | None = None):
     """Fill key/value state to cache for paged attention with fp8 quantization.
 
     Args:
         k_states (Tensor): Key states of shape
             (seq_length, num_heads, head_dim).
-        v_states (Optional[Tensor]): Value states of shape
+        v_states (Tensor | None): Value states of shape
             (seq_length, num_heads, head_dim_v). If None, no value states
             are processed.
         k_caches (Tensor): 4D k cache, shape depends on ``kv_layout``.
-        v_caches (Optional[Tensor]): 4D v cache, shape depends on
+        v_caches (Tensor | None): 4D v cache, shape depends on
             ``kv_layout``. If None, no value caches are processed.
         ks_caches (Tensor): 4D k scale cache, shape depends on
             ``kv_layout``.
-        vs_caches (Optional[Tensor]): 4D v scale cache, shape depends on
+        vs_caches (Tensor | None): 4D v scale cache, shape depends on
             ``kv_layout``. If None, no value scale caches are processed.
         cu_seqlen_q (Tensor): Cumulative sequence lengths of queries,
             shape (batch_size + 1, ).
diff --git a/lmdeploy/pytorch/kernels/cuda/flashattention.py b/lmdeploy/pytorch/kernels/cuda/flashattention.py
index a01d151f9c..765c80aa1b 100644
--- a/lmdeploy/pytorch/kernels/cuda/flashattention.py
+++ b/lmdeploy/pytorch/kernels/cuda/flashattention.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import math
-from typing import Sequence
+from collections.abc import Sequence
 
 import torch
 import triton
@@ -89,7 +89,7 @@ def _prefill_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, q1, k1_ptrs, loop_start
             qk = tl.where(
                 qk_mask,
                 qk,
-                float(-1e30),
+                (-1e30),
             )
             m_i_new = tl.maximum(m_i, tl.max(qk, 1))
             qk -= m_i_new[:, None]
@@ -101,7 +101,7 @@ def _prefill_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, q1, k1_ptrs, loop_start
             qk = tl.where(
                 qk_mask,
                 qk,
-                float(-1e30),
+                (-1e30),
             )
             m_i_new = tl.maximum(m_i, tl.max(qk, 1))
             qk -= m_i_new[:, None]
diff --git a/lmdeploy/pytorch/kernels/cuda/fused_moe.py b/lmdeploy/pytorch/kernels/cuda/fused_moe.py
index fc19fc6a97..d99c398071 100644
--- a/lmdeploy/pytorch/kernels/cuda/fused_moe.py
+++ b/lmdeploy/pytorch/kernels/cuda/fused_moe.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 # modify from: https://github.com/vllm-project/vllm
-from typing import Callable
+from collections.abc import Callable
 
 import torch
 import triton
@@ -353,7 +353,7 @@ def get_start_end(exp_cum: torch.Tensor, exp_topk: torch.Tensor, topk: int):
     exp_start = start_end[0, :]
     exp_end = start_end[1, :]
 
-    out = exp_cum.new_empty((num_tokens * topk))
+    out = exp_cum.new_empty(num_tokens * topk)
 
     num_warps = 1
 
diff --git a/lmdeploy/pytorch/kernels/cuda/fused_moe_ep.py b/lmdeploy/pytorch/kernels/cuda/fused_moe_ep.py
index 6c0c792906..37cb8dcc13 100644
--- a/lmdeploy/pytorch/kernels/cuda/fused_moe_ep.py
+++ b/lmdeploy/pytorch/kernels/cuda/fused_moe_ep.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 # modify from dlblas: https://github.com/DeepLink-org/DLBlas
-from typing import List, Optional
 
 import torch
 import triton
@@ -221,7 +220,7 @@ def fused_moe_v3(
     topk_weights,
     w13_weight: torch.Tensor,
     w2_weight: torch.Tensor,
-    num_recv_tokens_per_expert: Optional[List[int]],
+    num_recv_tokens_per_expert: list[int] | None,
 ):
     if num_recv_tokens_per_expert is None:
         return hidden_states
diff --git a/lmdeploy/pytorch/kernels/cuda/gated_delta_rule.py b/lmdeploy/pytorch/kernels/cuda/gated_delta_rule.py
index 9dc75bd61b..f3f41877b1 100644
--- a/lmdeploy/pytorch/kernels/cuda/gated_delta_rule.py
+++ b/lmdeploy/pytorch/kernels/cuda/gated_delta_rule.py
@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Sequence
+from collections.abc import Sequence
 
 import tilelang
 import tilelang.language as T
diff --git a/lmdeploy/pytorch/kernels/cuda/pagedattention.py b/lmdeploy/pytorch/kernels/cuda/pagedattention.py
index aac072ab86..96a38425af 100644
--- a/lmdeploy/pytorch/kernels/cuda/pagedattention.py
+++ b/lmdeploy/pytorch/kernels/cuda/pagedattention.py
@@ -1,7 +1,8 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 # modify from: https://github.com/ModelTC/lightllm
 import math
-from typing import Literal, Sequence
+from collections.abc import Sequence
+from typing import Literal
 
 import torch
 import triton
diff --git a/lmdeploy/pytorch/kernels/cuda/rms_norm.py b/lmdeploy/pytorch/kernels/cuda/rms_norm.py
index 81b05138df..5238bab261 100644
--- a/lmdeploy/pytorch/kernels/cuda/rms_norm.py
+++ b/lmdeploy/pytorch/kernels/cuda/rms_norm.py
@@ -191,8 +191,8 @@ def test_rms_norm(bsz, ctx_len, feat_len, dtype):
 
         torch_cost = (t1 - t0) / N_REPEATS * 1000
         triton_cost = (t2 - t1) / N_REPEATS * 1000
-        print('input {} weight {} dtype {}\n  torch {:.3f} triton {:.3f} (ms)\n'.format(
-            input.shape, weight.shape, dtype, torch_cost, triton_cost))
+        print(f'input {input.shape} weight {weight.shape} dtype {dtype}\n' \
+                f'  torch {torch_cost:.3f} triton {triton_cost:.3f} (ms)\n')
 
     test_rms_norm(1, 8128, 5120, torch.float16)
     test_rms_norm(1, 8128, 5120, torch.float32)
diff --git a/lmdeploy/pytorch/kernels/dispatcher.py b/lmdeploy/pytorch/kernels/dispatcher.py
index fcf85c913f..77b785c66b 100644
--- a/lmdeploy/pytorch/kernels/dispatcher.py
+++ b/lmdeploy/pytorch/kernels/dispatcher.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import importlib
 import inspect
-from typing import Callable
+from collections.abc import Callable
 
 from lmdeploy.utils import get_logger
 
diff --git a/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py b/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py
index 4e5fa4a3e8..79ec75dde5 100644
--- a/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py
+++ b/lmdeploy/pytorch/kernels/dlinfer/apply_rotary_pos_emb.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Optional, Tuple
 
 import dlinfer.ops as ext_ops
 from torch import Tensor
@@ -10,9 +9,9 @@ def apply_rotary_pos_emb(
     key_states: Tensor,
     cos: Tensor,
     sin: Tensor,
-    q_embed: Optional[Tensor],
-    k_embed: Optional[Tensor],
-) -> Tuple[Tensor, Tensor]:
+    q_embed: Tensor | None,
+    k_embed: Tensor | None,
+) -> tuple[Tensor, Tensor]:
     query_states_embed, key_states_embed = \
         ext_ops.apply_rotary_pos_emb(query_states,
                                      key_states,
diff --git a/lmdeploy/pytorch/kernels/dlinfer/awq_kernels.py b/lmdeploy/pytorch/kernels/dlinfer/awq_kernels.py
index 473e0404c9..c027b64a1e 100644
--- a/lmdeploy/pytorch/kernels/dlinfer/awq_kernels.py
+++ b/lmdeploy/pytorch/kernels/dlinfer/awq_kernels.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Optional
 
 import dlinfer.ops as ext_ops
 from torch import Tensor
@@ -9,7 +8,7 @@ def awq_linear(x: Tensor,
                qweight: Tensor,
                scales: Tensor,
                qzeros: Tensor,
-               bias: Optional[Tensor] = None,
+               bias: Tensor | None = None,
                all_reduce: bool = False,
                group_size: int = 0):
     return ext_ops.weight_quant_matmul(x.squeeze(0),
diff --git a/lmdeploy/pytorch/kernels/dlinfer/fill_kv_cache.py b/lmdeploy/pytorch/kernels/dlinfer/fill_kv_cache.py
index ec1cdeac94..9602886e9b 100644
--- a/lmdeploy/pytorch/kernels/dlinfer/fill_kv_cache.py
+++ b/lmdeploy/pytorch/kernels/dlinfer/fill_kv_cache.py
@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Optional, Sequence
+from collections.abc import Sequence
 
 import dlinfer.ops as ext_ops
 from torch import Tensor
@@ -11,8 +11,8 @@ def fill_kv_cache(
     key_caches: Tensor,
     value_caches: Tensor,
     kv_start_indices: Tensor,
-    k_scales_zeros: Sequence[Optional[Tensor]],
-    v_scales_zeros: Sequence[Optional[Tensor]],
+    k_scales_zeros: Sequence[Tensor | None],
+    v_scales_zeros: Sequence[Tensor | None],
     quant_bits: int = 0,
 ):
     """Fill key/value state to cache for paged attention."""
diff --git a/lmdeploy/pytorch/kernels/dlinfer/linear.py b/lmdeploy/pytorch/kernels/dlinfer/linear.py
index 686c4a8d39..962b221c78 100644
--- a/lmdeploy/pytorch/kernels/dlinfer/linear.py
+++ b/lmdeploy/pytorch/kernels/dlinfer/linear.py
@@ -1,9 +1,8 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Optional
 
 import dlinfer.ops as ext_ops
 from torch import Tensor
 
 
-def linear(x: Tensor, weight: Tensor, bias: Optional[Tensor] = None, all_reduce: bool = False, group: str = ''):
+def linear(x: Tensor, weight: Tensor, bias: Tensor | None = None, all_reduce: bool = False, group: str = ''):
     return ext_ops.linear(x, weight, bias=bias, all_reduce=all_reduce, group=group)
diff --git a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py
index cc1a324bf4..bb8e2705eb 100644
--- a/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py
+++ b/lmdeploy/pytorch/kernels/dlinfer/moe_gating_topk_softmax.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Tuple
 
 import dlinfer.ops as ext_ops
 from dlinfer.utils.type_annotation import MoeMetadata as DlinferMoeMetadata
@@ -7,6 +6,6 @@
 
 
 def moe_gating_topk_softmax(router_logits: Tensor, topk: int,
-                            moe_metadata: DlinferMoeMetadata) -> Tuple[Tensor, Tensor]:
+                            moe_metadata: DlinferMoeMetadata) -> tuple[Tensor, Tensor]:
     routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(router_logits, topk, moe_metadata)
     return routing_weights, selected_experts
diff --git a/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py b/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py
index 8996508aff..6916d8a082 100644
--- a/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py
+++ b/lmdeploy/pytorch/kernels/dlinfer/pagedattention.py
@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Optional, Sequence
+from collections.abc import Sequence
 
 import dlinfer.ops as ext_ops
 from torch import Tensor
@@ -23,12 +23,12 @@ def prefill_attention(
     num_q_heads: int,
     num_kv_heads: int,
     head_size_v: int,
-    attn_mask: Sequence[Optional[Tensor]],
-    softmax_scale: Optional[float],
-    is_unpaged_prefill: Optional[bool],
-    kv_scales: Optional[Tensor],
-    kv_zeros: Optional[Tensor],
-    quant_bits: Optional[int],
+    attn_mask: Sequence[Tensor | None],
+    softmax_scale: float | None,
+    is_unpaged_prefill: bool | None,
+    kv_scales: Tensor | None,
+    kv_zeros: Tensor | None,
+    quant_bits: int | None,
 ) -> Tensor:
     if is_unpaged_prefill:
         return ext_ops.prefill_attention(
@@ -86,10 +86,10 @@ def paged_token_attention(
     num_q_heads,
     num_kv_heads,
     head_size_v,
-    softmax_scale: Optional[float],
-    kv_scales: Optional[Tensor],
-    kv_zeros: Optional[Tensor],
-    quant_bits: Optional[int],
+    softmax_scale: float | None,
+    kv_scales: Tensor | None,
+    kv_zeros: Tensor | None,
+    quant_bits: int | None,
 ):
     return ext_ops.paged_decode_attention(
         q,
@@ -129,12 +129,12 @@ def paged_attention_fwd(
     num_heads: int,
     num_kv_heads: int,
     v_head_size: int,
-    attn_mask: Sequence[Optional[Tensor]] = (),
-    softmax_scale: Optional[float] = None,
-    is_unpaged_prefill: Optional[bool] = None,
-    kv_scales: Optional[Tensor] = None,
-    kv_zeros: Optional[Tensor] = None,
-    quant_bits: Optional[int] = 0,
+    attn_mask: Sequence[Tensor | None] = (),
+    softmax_scale: float | None = None,
+    is_unpaged_prefill: bool | None = None,
+    kv_scales: Tensor | None = None,
+    kv_zeros: Tensor | None = None,
+    quant_bits: int | None = 0,
 ):
     if not is_decoding:
         return prefill_attention(
diff --git a/lmdeploy/pytorch/messages.py b/lmdeploy/pytorch/messages.py
index 588f458bb6..a9661f960c 100644
--- a/lmdeploy/pytorch/messages.py
+++ b/lmdeploy/pytorch/messages.py
@@ -2,7 +2,7 @@
 import enum
 from collections import defaultdict
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Dict, List
+from typing import TYPE_CHECKING, Any
 
 import numpy as np
 import torch
@@ -24,8 +24,8 @@
 logger = get_logger('lmdeploy')
 
 # vlm input type from pipeline
-InputEmbeddingType = List[np.ndarray]
-InputEmbeddingRangeType = List[List[int]]
+InputEmbeddingType = list[np.ndarray]
+InputEmbeddingRangeType = list[list[int]]
 
 
 @dataclass
@@ -52,12 +52,12 @@ class SamplingParam:
     repetition_penalty: float = 1.0
     ignore_eos: bool = False
     random_seed: int = None
-    stop_words: List[int] = field(default_factory=list)
-    bad_words: List[int] = field(default_factory=list)
+    stop_words: list[int] = field(default_factory=list)
+    bad_words: list[int] = field(default_factory=list)
     max_new_tokens: int = 512
     min_new_tokens: int = 0
     response_format: None | str = None
-    logits_processors: None | List[LogitsProcessor] = None
+    logits_processors: None | list[LogitsProcessor] = None
     out_logits: bool = False
     out_last_hidden_states: bool = False
     num_logprobs: int = -1
@@ -173,7 +173,7 @@ class MessageStatus(enum.Enum):
     MIGRATION_DONE = enum.auto()
 
 
-SeqMap = Dict[int, 'SchedulerSequence']
+SeqMap = dict[int, 'SchedulerSequence']
 
 
 @dataclass
@@ -189,7 +189,7 @@ class SequenceManager:
 
     def __init__(self, seq_meta: SequenceMeta) -> None:
         self._seq_map: SeqMap = dict()
-        self._status_seq_map: Dict[MessageStatus, SeqMap] = defaultdict(dict)
+        self._status_seq_map: dict[MessageStatus, SeqMap] = defaultdict(dict)
 
         self.seq_meta = seq_meta
         self._seq_count = 0
@@ -267,7 +267,7 @@ def add_sequence(self,
                      sampling_param: SamplingParam = None,
                      adapter_name: str = None,
                      multimodals: MultiModalInputs = None,
-                     input_embeddings: List[InputEmbeddings] = None,
+                     input_embeddings: list[InputEmbeddings] = None,
                      migration_request: None | MigrationRequest = None,
                      resp_cache: bool = False,
                      preserve_cache: bool = False) -> 'SchedulerSequence':
@@ -325,12 +325,12 @@ def _round_up(x, n):
 class HistoryEmbeddings:
     """History embeddings."""
 
-    def __init__(self, embeddings: List[InputEmbeddings] = None):
-        self._embeddings: List[InputEmbeddings] = []
+    def __init__(self, embeddings: list[InputEmbeddings] = None):
+        self._embeddings: list[InputEmbeddings] = []
         if embeddings is not None:
             self._embeddings.extend(embeddings)
 
-    def append(self, embeddings: List[InputEmbeddings]):
+    def append(self, embeddings: list[InputEmbeddings]):
         self._embeddings.extend(embeddings)
 
     def clone(self):
@@ -607,7 +607,7 @@ class SchedulerSequence:
     output_start_pos: int = 0
     meta: Any = None
     num_ignored_history: int = 0
-    model_meta: Dict[str, Any] = None
+    model_meta: dict[str, Any] = None
 
     # For Disaggregation
     migration_request: None | MigrationRequest = None
@@ -615,7 +615,7 @@ class SchedulerSequence:
     preserve_cache: bool = False
 
     # For logging
-    engine_events: List[EngineEvent] = field(default_factory=list)
+    engine_events: list[EngineEvent] = field(default_factory=list)
 
     # for router replay
     all_routed_experts: HistoryRouterExperts = field(default_factory=HistoryRouterExperts)
@@ -662,7 +662,7 @@ def token_ids(self) -> np.ndarray:
         return self.history_cache[start:end]
 
     @property
-    def input_embeddings(self) -> List[InputEmbeddings]:
+    def input_embeddings(self) -> list[InputEmbeddings]:
         """Get current embeddings."""
         start = self.history_image_num
         end = start + self._num_images
@@ -786,7 +786,7 @@ def record_event(
     ) -> None:
         self.engine_events.append(EngineEvent.new_event(event_type, timestamp))
 
-    def _update_embeddings(self, embeddings: List[InputEmbeddings]):
+    def _update_embeddings(self, embeddings: list[InputEmbeddings]):
         """Update input embeddings."""
         self._num_history_images += self._num_images
         if embeddings is None:
@@ -806,8 +806,8 @@ def _update_multimodals(self, multimodals: MultiModalInputs):
     def update_token_ids(self,
                          token_ids: Tensor,
                          multimodals: MultiModalInputs = None,
-                         embeddings: List[InputEmbeddings] = None,
-                         model_meta: Dict[str, Any] = None,
+                         embeddings: list[InputEmbeddings] = None,
+                         model_meta: dict[str, Any] = None,
                          mode: UpdateTokenMode = UpdateTokenMode.INPUTS,
                          **kwargs):
         """Update token ids, old token ids will be added to history."""
diff --git a/lmdeploy/pytorch/model_inputs.py b/lmdeploy/pytorch/model_inputs.py
index 538b9c6f3a..2ddd0605c2 100644
--- a/lmdeploy/pytorch/model_inputs.py
+++ b/lmdeploy/pytorch/model_inputs.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from dataclasses import dataclass, field, fields
-from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional
+from typing import TYPE_CHECKING, Any, Literal
 
 import numpy as np
 import torch
@@ -20,11 +20,11 @@
 
 @dataclass
 class DPMeta:
-    tp_sizes: List[int] = None
-    moe_tp_sizes: List[int] = None
+    tp_sizes: list[int] = None
+    moe_tp_sizes: list[int] = None
 
     @staticmethod
-    def _gather_tp_sizes(tp: int, seqlen: int, num_tokens: List[int], dist_ctx: dist.DistContext, layer_type: str):
+    def _gather_tp_sizes(tp: int, seqlen: int, num_tokens: list[int], dist_ctx: dist.DistContext, layer_type: str):
         """Gather tp size."""
         attn_tp = dist_ctx.dist_config.attn_tp
         if tp > 1 and tp != attn_tp:
@@ -38,7 +38,7 @@ def _gather_tp_sizes(tp: int, seqlen: int, num_tokens: List[int], dist_ctx: dist
         return tp_sizes
 
     @classmethod
-    def build(cls, seqlen: int, num_tokens: List[int]):
+    def build(cls, seqlen: int, num_tokens: list[int]):
         """Get dp meta."""
         dist_ctx = dist.get_dist_manager().current_context()
         dist_config = dist_ctx.dist_config
@@ -63,10 +63,10 @@ def sync_tp_size(self, tp_size: int):
 class VisionModelInputs:
     """Vision model inputs."""
     history_lengths: torch.LongTensor = None
-    input_embeddings: List[List[torch.Tensor]] = None
-    input_embedding_ranges: List[torch.LongTensor] = None
+    input_embeddings: list[list[torch.Tensor]] = None
+    input_embedding_ranges: list[torch.LongTensor] = None
     input_embedding_indexing: torch.BoolTensor = None
-    input_multimodals: List[MultiModalTensor] = None
+    input_multimodals: list[MultiModalTensor] = None
 
     def to_device(self, device: str, non_blocking: bool = False):
         """To device."""
@@ -125,7 +125,7 @@ def get_inputs(self, history_lengths: torch.Tensor, seq_lengths: torch.Tensor):
 class ModelInputsDelta:
     """Delta of ModelInputs."""
     # valid indices
-    indices: Optional[torch.Tensor]
+    indices: torch.Tensor | None
     # new block offsets
     block_offsets: torch.Tensor
     # cpu copy of indices
@@ -135,7 +135,7 @@ class ModelInputsDelta:
     sum_kv_seqlen: int
     is_decoding: bool = True
     # sliding window
-    num_ignored_history: Optional[torch.Tensor] = None
+    num_ignored_history: torch.Tensor | None = None
 
     @property
     def seq_length(self):
@@ -184,7 +184,7 @@ class ModelInputs:
     sum_kv_seqlen: int
     local_adapter_ids: torch.Tensor = None
     vision_inputs: VisionModelInputs = None
-    model_metas: List[Dict[str, Any]] = None
+    model_metas: list[dict[str, Any]] = None
     dp_meta: 'DPMeta' = None
     enable_microbatch: bool = False
     is_dummy: bool = False
@@ -222,7 +222,7 @@ def to_device(self, device: str, non_blocking: bool = False):
 
         return ModelInputs(**out_dict)
 
-    def build_dp_meta(self, num_tokens: List[int]):
+    def build_dp_meta(self, num_tokens: list[int]):
         """Build dp meta."""
         self.dp_meta = DPMeta.build(self.input_ids.numel(), num_tokens)
 
@@ -248,28 +248,28 @@ class StepContext:
     q_seqlens: torch.LongTensor
     kv_seqlens: torch.IntTensor
     q_start_loc: torch.LongTensor
-    kv_caches: List
+    kv_caches: list
     is_decoding: bool
     sum_kv_seqlen: int
     max_kv_seqlen: int = None
     local_adapter_ids: torch.LongTensor = None
     input_embeddings: torch.Tensor = None
     input_embedding_indexing: torch.Tensor = None
-    input_multimodals: List[MultiModalTensor] = None
+    input_multimodals: list[MultiModalTensor] = None
     vision_inputs: VisionModelInputs = None
     attn_metadata: Any = None
     kv_quant_policy: Literal[0, 4, 8] = 0
-    model_metas: List[Dict[str, Any]] = None
+    model_metas: list[dict[str, Any]] = None
     dp_meta: DPMeta = None
     enable_microbatch: bool = False
     # for draft model
     target_hidden_states: torch.Tensor = None
 
     # states for ssm
-    state_caches: List = None
+    state_caches: list = None
     state_offsets: torch.LongTensor = None
 
-    _outputs: Dict = field(default_factory=dict)
+    _outputs: dict = field(default_factory=dict)
 
     @classmethod
     def new(
@@ -277,8 +277,8 @@ def new(
         inputs: ModelInputs,
         model_config: ModelConfig,
         cache_config: CacheConfig,
-        kv_caches: List = None,
-        state_caches: List = None,
+        kv_caches: list = None,
+        state_caches: list = None,
         kv_quant_policy: Literal[0, 4, 8] = 0,
     ):
         """Build step context.
@@ -408,8 +408,8 @@ def build_context(
         inputs: ModelInputs,
         model_config: ModelConfig,
         cache_config: CacheConfig,
-        kv_caches: List = None,
-        state_caches: List = None,
+        kv_caches: list = None,
+        state_caches: list = None,
         kv_quant_policy: Literal[0, 4, 8] = 0,
     ):
         """Build context."""
diff --git a/lmdeploy/pytorch/models/baichuan.py b/lmdeploy/pytorch/models/baichuan.py
index 76aa659385..5ef0aedf59 100644
--- a/lmdeploy/pytorch/models/baichuan.py
+++ b/lmdeploy/pytorch/models/baichuan.py
@@ -1,14 +1,20 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
 
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding
-from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj,
-                                        build_rowwise_linear)
+from lmdeploy.pytorch.nn.linear import (
+    build_down_linear,
+    build_gateup_linear,
+    build_o_proj,
+    build_qkv_proj,
+    build_rowwise_linear,
+)
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .utils.cudagraph import CudaGraphMixin
@@ -67,8 +73,8 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -174,9 +180,9 @@ def __init__(self, config: Any, layer_idx: int, dtype: torch.dtype = None, devic
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
         """forward."""
@@ -243,10 +249,10 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """forward."""
 
@@ -316,7 +322,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -341,8 +347,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -368,7 +374,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/chatglm2.py b/lmdeploy/pytorch/models/chatglm2.py
index 56e3169bb7..680a7001e6 100644
--- a/lmdeploy/pytorch/models/chatglm2.py
+++ b/lmdeploy/pytorch/models/chatglm2.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -10,10 +11,23 @@
 from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
 from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
-from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding,
-                                 build_rotary_params)
-from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_down_linear, build_gateup_linear, build_o_proj,
-                                        build_qkv_proj, build_rowwise_linear)
+from lmdeploy.pytorch.nn import (
+    ApplyRotaryEmb,
+    Attention,
+    RMSNorm,
+    RopeType,
+    SiluAndMul,
+    build_rotary_embedding,
+    build_rotary_params,
+)
+from lmdeploy.pytorch.nn.linear import (
+    build_colwise_linear,
+    build_down_linear,
+    build_gateup_linear,
+    build_o_proj,
+    build_qkv_proj,
+    build_rowwise_linear,
+)
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .utils.cudagraph import CudaGraphMixin
@@ -87,8 +101,8 @@ def _fill_rope(states: torch.Tensor, rope: torch.Tensor):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -211,9 +225,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
 
@@ -264,8 +278,8 @@ def _get_layer(self, layer_number: int):
     def forward(
         self,
         hidden_states: torch.LongTensor,
-        rotary_pos_emb: List[torch.Tensor],
-        past_key_values: Optional[List[torch.FloatTensor]],
+        rotary_pos_emb: list[torch.Tensor],
+        past_key_values: list[torch.FloatTensor] | None,
         attn_metadata: Any,
     ):
         """forward."""
@@ -573,12 +587,12 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
         images: torch.Tensor = None,
         image_mask: torch.Tensor = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """forward."""
 
@@ -633,7 +647,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         images: torch.Tensor = None,
         image_mask: torch.Tensor = None,
@@ -662,8 +676,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -714,8 +728,8 @@ def _get_model_metas(self, context: StepContext):
         return [dict(num_img_tokens=0) if meta is None else meta for meta in model_metas]
 
     def update_model_metas(self,
-                           past_key_values: List[List[torch.Tensor]],
-                           inputs_embeds: Optional[torch.Tensor] = None,
+                           past_key_values: list[list[torch.Tensor]],
+                           inputs_embeds: torch.Tensor | None = None,
                            context: StepContext = None):
         """Update model meta."""
         model_metas = self._get_model_metas(context)
@@ -790,7 +804,7 @@ def update_model_metas(self,
 
         return new_model_metas
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
 
@@ -850,8 +864,8 @@ def __init__(self, config: PretrainedConfig, dtype) -> None:
             self.vision_token_num = self.num_patches // 4
 
     def preprocess_input(self,
-                         input_ids: List[int],
-                         input_multimodals: List[Dict[str, Any]] = None,
+                         input_ids: list[int],
+                         input_multimodals: list[dict[str, Any]] = None,
                          **kwargs) -> PreprocessInputResult:
         """Prepare multimodal input."""
         if input_multimodals is None or len(input_multimodals) == 0:
diff --git a/lmdeploy/pytorch/models/cogvlm.py b/lmdeploy/pytorch/models/cogvlm.py
index ad8adc9739..f697d791f9 100644
--- a/lmdeploy/pytorch/models/cogvlm.py
+++ b/lmdeploy/pytorch/models/cogvlm.py
@@ -1,7 +1,8 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
 from argparse import Namespace
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 import torch.nn.functional as F
@@ -14,8 +15,12 @@
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
 from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding
-from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_merged_colwise_linear, build_qkv_proj,
-                                        build_rowwise_linear)
+from lmdeploy.pytorch.nn.linear import (
+    build_colwise_linear,
+    build_merged_colwise_linear,
+    build_qkv_proj,
+    build_rowwise_linear,
+)
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .utils.cudagraph import CudaGraphMixin
@@ -92,8 +97,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
         lang_ids: torch.LongTensor = None,
         vision_ids: torch.LongTensor = None,
@@ -262,9 +267,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
         lang_ids: torch.LongTensor = None,
         vision_ids: torch.LongTensor = None,
@@ -574,11 +579,11 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
         images: torch.Tensor = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
         lang_ids: torch.LongTensor = None,
         vision_ids: torch.LongTensor = None,
     ):
@@ -661,7 +666,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         images: torch.Tensor = None,
         inputs_embeds: torch.Tensor = None,
@@ -692,8 +697,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -746,7 +751,7 @@ def prepare_inputs_for_generation(
             vision_ids=vis_ids,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
@@ -798,8 +803,8 @@ def _get_model_metas(self, context: StepContext):
         return [dict(num_img_tokens=0) if meta is None else meta for meta in model_metas]
 
     def update_model_metas(self,
-                           past_key_values: List[List[torch.Tensor]],
-                           inputs_embeds: Optional[torch.Tensor] = None,
+                           past_key_values: list[list[torch.Tensor]],
+                           inputs_embeds: torch.Tensor | None = None,
                            context: StepContext = None):
         """Update model meta."""
         model_metas = self._get_model_metas(context)
@@ -887,7 +892,7 @@ def __init__(self, config: PretrainedConfig, dtype) -> None:
             # cogvlm2
             self.vision_token_num = 2 + (image_size // patch_size // 2)**2
 
-    def preprocess_input(self, input_ids: List[int], input_multimodals=None, **kwargs) -> PreprocessInputResult:
+    def preprocess_input(self, input_ids: list[int], input_multimodals=None, **kwargs) -> PreprocessInputResult:
         """Prepare multimodal input."""
         if input_multimodals is None or len(input_multimodals) == 0:
             return input_ids, input_multimodals
diff --git a/lmdeploy/pytorch/models/deepseek.py b/lmdeploy/pytorch/models/deepseek.py
index 44239b9c98..78b69acadc 100644
--- a/lmdeploy/pytorch/models/deepseek.py
+++ b/lmdeploy/pytorch/models/deepseek.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -62,8 +63,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -260,9 +261,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
 
@@ -317,10 +318,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """Rewrite of LlamaModel.forward."""
 
@@ -393,7 +394,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -418,8 +419,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -445,8 +446,8 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter],
-                             expert_params_mapping: List):
+    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter],
+                             expert_params_mapping: list):
         """Load weight experts."""
         for (param_name, weight_name, expert_id, shard_id) in expert_params_mapping:
             if weight_name not in name:
@@ -459,7 +460,7 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di
             param = params_dict[name]
             load_weight(param, loaded_weight)
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/deepseek_mtp.py b/lmdeploy/pytorch/models/deepseek_mtp.py
index 1c315f4778..32190d8788 100644
--- a/lmdeploy/pytorch/models/deepseek_mtp.py
+++ b/lmdeploy/pytorch/models/deepseek_mtp.py
@@ -1,16 +1,29 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
 from transformers.configuration_utils import PretrainedConfig
 
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding,
-                                 build_rotary_params)
-from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_down_linear, build_gateup_linear, build_o_proj,
-                                        build_rowwise_linear)
+from lmdeploy.pytorch.nn import (
+    ApplyRotaryEmb,
+    Attention,
+    RMSNorm,
+    RopeType,
+    SiluAndMul,
+    build_rotary_embedding,
+    build_rotary_params,
+)
+from lmdeploy.pytorch.nn.linear import (
+    build_colwise_linear,
+    build_down_linear,
+    build_gateup_linear,
+    build_o_proj,
+    build_rowwise_linear,
+)
 from lmdeploy.pytorch.nn.moe import build_fused_moe
 from lmdeploy.pytorch.nn.rotary_embedding import get_rope_parameters, get_rope_theta
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
@@ -161,8 +174,8 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -413,8 +426,8 @@ def forward(
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
         previous_hidden_states: torch.Tensor,
-        past_key_value: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_value: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         attn_metadata: Any = None,
         spec_step_index: int = 0,
     ) -> torch.Tensor:
@@ -477,8 +490,8 @@ def forward(
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
         previous_hidden_states: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         attn_metadata: Any = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
@@ -541,9 +554,9 @@ def forward(
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
         target_hidden_states: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.Tensor | None = None,
         spec_step_idx: int = 0,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids,
@@ -582,8 +595,8 @@ def fill_buffers_cudagraph(self, graph_meta: CudaGraphMeta, input_ids: torch.Ten
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -600,8 +613,8 @@ def prepare_inputs_for_generation(
             target_hidden_states=target_hidden_states,
         )
 
-    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter],
-                             expert_params_mapping: List):
+    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter],
+                             expert_params_mapping: list):
         """Load weight experts."""
         for (param_name, weight_name, expert_id, shard_id) in expert_params_mapping:
             if weight_name not in name:
@@ -614,8 +627,8 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di
             param = params_dict[name]
             load_weight(param, loaded_weight)
 
-    def _load_weight_attention(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter],
-                               update_pe_mapping: List):
+    def _load_weight_attention(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter],
+                               update_pe_mapping: list):
         """Load weight attention."""
         device = next(iter(params_dict.values())).device
 
@@ -707,7 +720,7 @@ def __load_kcvc_blocked_fp8(name: str, loaded_weight: torch.Tensor):
                 param = params_dict[name]
                 load_weight(param, loaded_weight)
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
 
         def __skip_nextn(name, nextn_keys):
diff --git a/lmdeploy/pytorch/models/deepseek_v2.py b/lmdeploy/pytorch/models/deepseek_v2.py
index bab833f2f7..087d744bf9 100644
--- a/lmdeploy/pytorch/models/deepseek_v2.py
+++ b/lmdeploy/pytorch/models/deepseek_v2.py
@@ -1,10 +1,11 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
 import math
+from collections.abc import Iterable
 from copy import deepcopy
 from enum import Enum, auto
 from os import getenv
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from typing import Any
 
 import torch
 import torch.nn.functional as F
@@ -13,11 +14,24 @@
 import lmdeploy.pytorch.distributed as dist
 from lmdeploy.pytorch.distributed import get_dist_manager, get_ep_world_rank, get_tp_world_rank
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager, get_step_ctx_manager
-from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, ParallelEmbedding, RMSNorm, RopeType, SiluAndMul,
-                                 build_rotary_embedding, build_rotary_params)
+from lmdeploy.pytorch.nn import (
+    ApplyRotaryEmb,
+    Attention,
+    ParallelEmbedding,
+    RMSNorm,
+    RopeType,
+    SiluAndMul,
+    build_rotary_embedding,
+    build_rotary_params,
+)
 from lmdeploy.pytorch.nn.eplb import EPLBDispatchInfo, EPLBManager
-from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_down_linear, build_gateup_linear, build_o_proj,
-                                        build_rowwise_linear)
+from lmdeploy.pytorch.nn.linear import (
+    build_colwise_linear,
+    build_down_linear,
+    build_gateup_linear,
+    build_o_proj,
+    build_rowwise_linear,
+)
 from lmdeploy.pytorch.nn.moe import MoeType, SoftmaxTopK, build_fused_moe
 from lmdeploy.pytorch.nn.rotary_embedding import get_rope_parameters, get_rope_theta
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
@@ -511,8 +525,8 @@ def _qkv_proj(self, hidden_states: torch.Tensor, num_heads: int):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -836,11 +850,11 @@ def __init__(self, config: Any, layer_idx: int, dtype: torch.dtype = None, devic
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
-    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor]:
 
         if residual is None:
             residual = hidden_states
@@ -866,9 +880,9 @@ def forward(
     def forward_yield(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
         tag: Any = None,
     ):
@@ -987,10 +1001,10 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """forward."""
         if inputs_embeds is None:
@@ -1018,10 +1032,10 @@ def forward(
     def forward_microbatch(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """forward_microbatch."""
         assert self.config.moe_layer_freq == 1
@@ -1071,9 +1085,9 @@ def forward_microbatch(
 
     def forward_yieldlayers(self,
                             hidden_states: torch.Tensor,
-                            rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-                            past_key_values: Optional[List[torch.FloatTensor]] = None,
-                            residual: Optional[torch.Tensor] = None,
+                            rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+                            past_key_values: list[torch.FloatTensor] | None = None,
+                            residual: torch.Tensor | None = None,
                             attn_metadata: Any = None,
                             start_idx: int = -1,
                             end_idx: int = -1,
@@ -1120,7 +1134,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -1153,8 +1167,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -1170,8 +1184,8 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter],
-                             expert_params_mapping: List):
+    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter],
+                             expert_params_mapping: list):
         """Load weight experts."""
         for (param_name, weight_name, expert_id, shard_id) in expert_params_mapping:
             if weight_name not in name:
@@ -1184,8 +1198,8 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di
             param = params_dict[name]
             load_weight(param, loaded_weight)
 
-    def _load_weight_attention(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter],
-                               update_pe_mapping: List):
+    def _load_weight_attention(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter],
+                               update_pe_mapping: list):
         """Load weight attention."""
         device = next(iter(params_dict.values())).device
 
@@ -1277,7 +1291,7 @@ def __load_kcvc_blocked_fp8(name: str, loaded_weight: torch.Tensor):
                 param = params_dict[name]
                 load_weight(param, loaded_weight)
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
 
         def __skip_nextn(name, nextn_keys):
diff --git a/lmdeploy/pytorch/models/deepseek_v32.py b/lmdeploy/pytorch/models/deepseek_v32.py
index 19ee10d420..6954e47fca 100644
--- a/lmdeploy/pytorch/models/deepseek_v32.py
+++ b/lmdeploy/pytorch/models/deepseek_v32.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Any, Sequence, Tuple
+from collections.abc import Sequence
+from typing import Any
 
 import torch
 import torch.nn.functional as F
@@ -7,15 +8,29 @@
 
 from lmdeploy.pytorch.distributed import get_dist_manager, get_ep_world_rank
 from lmdeploy.pytorch.model_inputs import StepContextManager
-from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, RMSNorm, RopeType, build_rotary_embedding,
-                                 build_rotary_params)
+from lmdeploy.pytorch.nn import (
+    ApplyRotaryEmb,
+    Attention,
+    RMSNorm,
+    RopeType,
+    build_rotary_embedding,
+    build_rotary_params,
+)
 from lmdeploy.pytorch.nn.eplb import EPLBManager
 from lmdeploy.pytorch.nn.linear import build_colwise_linear, build_o_proj, build_rowwise_linear
 from lmdeploy.pytorch.nn.nsa import IndexerTopKFP8
 from lmdeploy.pytorch.nn.rotary_embedding import get_rope_parameters, get_rope_theta
 
-from .deepseek_v2 import (DeepseekV2Attention, DeepseekV2BMM, DeepseekV2DecoderLayer, DeepseekV2ForCausalLM,
-                          DeepseekV2MLP, DeepseekV2Model, DeepseekV2MoE, yarn_get_mscale)
+from .deepseek_v2 import (
+    DeepseekV2Attention,
+    DeepseekV2BMM,
+    DeepseekV2DecoderLayer,
+    DeepseekV2ForCausalLM,
+    DeepseekV2MLP,
+    DeepseekV2Model,
+    DeepseekV2MoE,
+    yarn_get_mscale,
+)
 
 
 def rotate_activation(x: torch.Tensor) -> torch.Tensor:
@@ -88,7 +103,7 @@ def forward(self,
                 x: torch.Tensor,
                 qr: torch.Tensor,
                 freqs_cis: torch.Tensor,
-                index_cache: Tuple[torch.Tensor, torch.Tensor],
+                index_cache: tuple[torch.Tensor, torch.Tensor],
                 attn_metadata: Any = None):
         q = self.wq_b(qr)
         q = q.unflatten(-1, (-1, self.head_dim))
@@ -270,7 +285,7 @@ def _qkv_proj(self, hidden_states: torch.Tensor, num_heads: int):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
         past_key_value: Sequence[torch.Tensor] = None,
         attn_metadata: Any = None,
     ):
diff --git a/lmdeploy/pytorch/models/deepseek_vl2.py b/lmdeploy/pytorch/models/deepseek_vl2.py
index 290b9a4fc0..69cb35fd7a 100644
--- a/lmdeploy/pytorch/models/deepseek_vl2.py
+++ b/lmdeploy/pytorch/models/deepseek_vl2.py
@@ -1,7 +1,8 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/main/deepseek_vl2/models/modeling_deepseek_vl_v2.py
 
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 import torch.nn as nn
@@ -172,9 +173,9 @@ def _init_vision_module(
 
     def prepare_inputs_embeds(self,
                               input_ids: torch.LongTensor,
-                              images: Optional[torch.FloatTensor] = None,
-                              images_seq_mask: Optional[torch.LongTensor] = None,
-                              images_spatial_crop: Optional[torch.LongTensor] = None,
+                              images: torch.FloatTensor | None = None,
+                              images_seq_mask: torch.LongTensor | None = None,
+                              images_spatial_crop: torch.LongTensor | None = None,
                               **ignore_kwargs):
         """
 
@@ -306,7 +307,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         pixel_values: torch.Tensor = None,
         image_mask: torch.Tensor = None,
@@ -340,7 +341,7 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         inputs_embeds: torch.Tensor = None,
         context: StepContext = None,
     ):
@@ -382,7 +383,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
 
         lang_prefix = 'language.'
@@ -423,8 +424,8 @@ def __init__(self, config: PretrainedConfig, dtype) -> None:
         self.patch_size = vision_config.patch_size
 
     def preprocess_input(self,
-                         input_ids: List[int],
-                         input_multimodals: List[Dict[str, Any]] = None,
+                         input_ids: list[int],
+                         input_multimodals: list[dict[str, Any]] = None,
                          **kwargs) -> PreprocessInputResult:
         """Prepare multimodal input."""
         if input_multimodals is None or len(input_multimodals) == 0:
diff --git a/lmdeploy/pytorch/models/gemma.py b/lmdeploy/pytorch/models/gemma.py
index 4010dab882..ebf1804dee 100644
--- a/lmdeploy/pytorch/models/gemma.py
+++ b/lmdeploy/pytorch/models/gemma.py
@@ -1,7 +1,8 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
 import math
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 import torch.nn.functional as F
@@ -9,10 +10,22 @@
 from transformers.configuration_utils import PretrainedConfig
 
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
-from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, GeluAndMul, RMSNorm, RopeType, build_rotary_embedding,
-                                 build_rotary_embedding_from_config)
-from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj,
-                                        build_rowwise_linear)
+from lmdeploy.pytorch.nn import (
+    ApplyRotaryEmb,
+    Attention,
+    GeluAndMul,
+    RMSNorm,
+    RopeType,
+    build_rotary_embedding,
+    build_rotary_embedding_from_config,
+)
+from lmdeploy.pytorch.nn.linear import (
+    build_down_linear,
+    build_gateup_linear,
+    build_o_proj,
+    build_qkv_proj,
+    build_rowwise_linear,
+)
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .utils.cudagraph import CudaGraphMixin
@@ -96,9 +109,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        rotary_pos_emb_local: Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        rotary_pos_emb_local: tuple[torch.FloatTensor, torch.FloatTensor] | None = None,
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
         global_attn_masks: torch.Tensor = None,
         local_attn_masks: torch.Tensor = None,
@@ -290,10 +303,10 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        rotary_pos_emb_local: Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] = None,
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        rotary_pos_emb_local: tuple[torch.FloatTensor, torch.FloatTensor] | None = None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
         global_attn_masks: torch.Tensor = None,
         local_attn_masks: torch.Tensor = None,
@@ -340,7 +353,7 @@ def __init__(self,
                  embedding_dim: int,
                  padding_idx: int,
                  dtype=torch.dtype,
-                 embed_scale: Optional[float] = 1.0):
+                 embed_scale: float | None = 1.0):
         super().__init__(num_embeddings, embedding_dim, padding_idx, dtype=dtype)
         self.embed_scale = embed_scale
 
@@ -428,10 +441,10 @@ def build_rope_emb(self, config: PretrainedConfig):
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
         global_attn_masks: torch.Tensor = None,
         local_attn_masks: torch.Tensor = None,
     ):
@@ -517,7 +530,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         global_attn_masks: torch.Tensor = None,
@@ -551,8 +564,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -582,7 +595,7 @@ def update_weights(self):
         """Update weights."""
         self.lm_head.weight = self.model.embed_tokens.weight
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/gemma3_vl.py b/lmdeploy/pytorch/models/gemma3_vl.py
index 8f4ea8e972..065628cdb6 100644
--- a/lmdeploy/pytorch/models/gemma3_vl.py
+++ b/lmdeploy/pytorch/models/gemma3_vl.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Dict, Iterable, List, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -92,8 +93,8 @@ def __init__(self, config: PretrainedConfig, dtype) -> None:
         self.vision_token_num = self.num_patches // 4
 
     def preprocess_input(self,
-                         input_ids: List[int],
-                         input_multimodals: List[Dict[str, Any]] = None,
+                         input_ids: list[int],
+                         input_multimodals: list[dict[str, Any]] = None,
                          **kwargs) -> PreprocessInputResult:
         """Prepare multimodal input."""
         if input_multimodals is None or len(input_multimodals) == 0:
@@ -163,7 +164,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         pixel_values: torch.FloatTensor = None,
         image_mask: torch.Tensor = None,
@@ -302,7 +303,7 @@ def prepare_inputs_for_generation(
     def tie_weights(self):
         return self.language_model.tie_weights()
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
diff --git a/lmdeploy/pytorch/models/glm4.py b/lmdeploy/pytorch/models/glm4.py
index fe1193e76b..91644822c8 100644
--- a/lmdeploy/pytorch/models/glm4.py
+++ b/lmdeploy/pytorch/models/glm4.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -7,8 +8,13 @@
 
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config
-from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj,
-                                        build_rowwise_linear)
+from lmdeploy.pytorch.nn.linear import (
+    build_down_linear,
+    build_gateup_linear,
+    build_o_proj,
+    build_qkv_proj,
+    build_rowwise_linear,
+)
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .utils.cudagraph import CudaGraphMixin
@@ -71,8 +77,8 @@ def _fill_rope(states: torch.Tensor, rope: torch.Tensor):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -199,9 +205,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
         if residual is None:
@@ -261,10 +267,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """Rewrite of LlamaModel.forward."""
 
@@ -321,7 +327,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -351,8 +357,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -378,7 +384,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/glm4_1v.py b/lmdeploy/pytorch/models/glm4_1v.py
index 9b89164bef..6a7ea8795d 100644
--- a/lmdeploy/pytorch/models/glm4_1v.py
+++ b/lmdeploy/pytorch/models/glm4_1v.py
@@ -2,7 +2,8 @@
 # adapted from:
 # https://github.com/huggingface/transformers/blob/main/src/transformers/models/glm4v/modeling_glm4v.py
 
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Callable, Iterable
+from typing import Any
 
 import torch
 import torch.nn.functional as F
@@ -21,7 +22,7 @@
 from .utils.model import DeployModelMixin, vlm_model
 
 
-def _apply_mrope_selection(hidden_states: torch.Tensor, mrope_position_ids: torch.Tensor, mrope_section: List[int],
+def _apply_mrope_selection(hidden_states: torch.Tensor, mrope_position_ids: torch.Tensor, mrope_section: list[int],
                            position_ids: torch.Tensor, rotary_emb_func: Callable):
     _mrope_position_ids = torch.zeros(3, position_ids.shape[-1], dtype=position_ids.dtype, device=position_ids.device)
     _mrope_position_ids[:, :mrope_position_ids.shape[-1]] = mrope_position_ids
@@ -71,10 +72,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
         mrope_position_ids: torch.LongTensor = None,
     ):
         """Rewrite of LlamaModel.forward."""
@@ -361,7 +362,7 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
                                          is_tp=True)
 
     def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor,
-                rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor]) -> torch.Tensor:
+                rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor]) -> torch.Tensor:
         seq_length = hidden_states.shape[0]
         # qkv proj
         qkv_states = self.qkv(hidden_states)
@@ -400,7 +401,7 @@ def forward(self,
                 hidden_states,
                 cu_seqlens,
                 rotary_pos_emb,
-                residual: Optional[torch.Tensor] = None) -> torch.Tensor:
+                residual: torch.Tensor | None = None) -> torch.Tensor:
         if residual is None:
             residual = hidden_states
             hidden_states = self.norm1(hidden_states)
@@ -478,7 +479,7 @@ def rot_pos_emb(self, grid_thw):
         return rotary_pos_emb, pos_ids
 
     def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor,
-                grid_thw: torch.Tensor, image_type_ids: List[torch.Tensor]) -> torch.Tensor:
+                grid_thw: torch.Tensor, image_type_ids: list[torch.Tensor]) -> torch.Tensor:
         """forward."""
         hidden_states = self.patch_embed(hidden_states)
         hidden_states = self.post_conv_layernorm(hidden_states)
@@ -551,14 +552,14 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         mrope_position_ids: torch.Tensor = None,
         pixel_values: torch.Tensor = None,
         vis_cu_seqlens: torch.Tensor = None,
         vis_pos_emb: torch.Tensor = None,
-        image_type_ids: List[torch.Tensor] = None,
+        image_type_ids: list[torch.Tensor] = None,
         grid_thw: torch.Tensor = None,
         image_mask: torch.Tensor = None,
         **kwargs,
@@ -602,8 +603,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -672,7 +673,7 @@ def rename_weight(cls, name: str) -> str:
             return name[len('model.'):]
         return name
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
@@ -826,8 +827,8 @@ def _update_model_meta_prefilling(self, context: StepContext):
         return new_model_metas
 
     def update_model_metas(self,
-                           past_key_values: List[List[torch.Tensor]],
-                           inputs_embeds: Optional[torch.Tensor] = None,
+                           past_key_values: list[list[torch.Tensor]],
+                           inputs_embeds: torch.Tensor | None = None,
                            context: StepContext = None):
         """Update model meta."""
         if context.is_decoding:
@@ -847,8 +848,8 @@ def __init__(self, config: PretrainedConfig) -> None:
         self.config = config
 
     def preprocess_input(self,
-                         input_ids: List[int],
-                         input_multimodals: List[Dict[str, Any]] = None,
+                         input_ids: list[int],
+                         input_multimodals: list[dict[str, Any]] = None,
                          **kwargs) -> PreprocessInputResult:
         """Prepare multimodal input."""
         if input_multimodals is None or len(input_multimodals) == 0:
diff --git a/lmdeploy/pytorch/models/glm4_moe.py b/lmdeploy/pytorch/models/glm4_moe.py
index 1c6b92ad05..671a4e6e9a 100644
--- a/lmdeploy/pytorch/models/glm4_moe.py
+++ b/lmdeploy/pytorch/models/glm4_moe.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -78,8 +79,8 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -298,9 +299,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
         if residual is None:
@@ -358,10 +359,10 @@ def _build_rotary_embedding(self, config: PretrainedConfig):
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """Rewrite of LlamaModel.forward."""
 
@@ -438,7 +439,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -463,8 +464,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -490,8 +491,8 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter],
-                             expert_params_mapping: List):
+    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter],
+                             expert_params_mapping: list):
         """Load weight experts."""
         # load fused weights
         if any([k in name for k in ['fused_w1w3', 'fused_w2']]):
@@ -508,7 +509,7 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di
             param = params_dict[name]
             load_weight(param, loaded_weight)
 
-    def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter]):
+    def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter]):
         """Load weight of fused expert weights."""
         num_experts = self.config.num_experts
         fused_gateup_name = 'fused_w1w3'
@@ -533,7 +534,7 @@ def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, par
                 w2 = loaded_weight.narrow(dim=0, start=chunk_size * expert_id, length=chunk_size)
                 load_weight(param, w2, expert_id=expert_id, shard_id='down')
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/glm4moe_mtp.py b/lmdeploy/pytorch/models/glm4moe_mtp.py
index 743920e11e..4e5ec8c818 100644
--- a/lmdeploy/pytorch/models/glm4moe_mtp.py
+++ b/lmdeploy/pytorch/models/glm4moe_mtp.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Iterable
+from collections.abc import Iterable
 
 import torch
 from torch import nn
diff --git a/lmdeploy/pytorch/models/gpt_oss.py b/lmdeploy/pytorch/models/gpt_oss.py
index 0caf956675..51f83af244 100644
--- a/lmdeploy/pytorch/models/gpt_oss.py
+++ b/lmdeploy/pytorch/models/gpt_oss.py
@@ -1,7 +1,8 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
 import functools
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Callable, Iterable
+from typing import Any
 
 import torch
 import torch.nn.functional as F
@@ -104,8 +105,8 @@ def weight_loader_sinks(cls, param: nn.Parameter, loaded_weight: torch.Tensor):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of forward."""
@@ -160,7 +161,7 @@ def _impl(self, gateup: torch.Tensor) -> torch.Tensor:
         return (up + 1) * glu
 
     @staticmethod
-    @functools.lru_cache(maxsize=None)
+    @functools.cache
     def build(limit: float, alpha: float):
         return GateupAct(limit, alpha)
 
@@ -306,9 +307,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
         all_routed_experts: torch.Tensor = None,
     ):
@@ -363,10 +364,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
         all_routed_experts: torch.Tensor = None,
     ):
         """Rewrite of forward."""
@@ -437,7 +438,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -472,8 +473,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -499,7 +500,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def _load_weight_experts_gate_up(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str,
+    def _load_weight_experts_gate_up(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str,
                                                                                                      nn.Parameter]):
         """Load weight of experts gate up."""
         num_experts = self.config.num_local_experts
@@ -517,7 +518,7 @@ def _load_weight_experts_gate_up(self, name: str, loaded_weight: torch.Tensor, p
             load_weight(param, w1, expert_id=expert_id, shard_id='gate')
             load_weight(param, w3, expert_id=expert_id, shard_id='up')
 
-    def _load_weight_experts_down(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter]):
+    def _load_weight_experts_down(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter]):
         """Load weight of experts down."""
         num_experts = self.config.num_local_experts
 
@@ -532,7 +533,7 @@ def _load_weight_experts_down(self, name: str, loaded_weight: torch.Tensor, para
             w2 = loaded_weight[expert_id]
             load_weight(param, w2, expert_id=expert_id, shard_id='down')
 
-    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter]):
+    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter]):
         """Load weight of fused expert weights."""
         if 'gate_up' in name:
             self._load_weight_experts_gate_up(name, loaded_weight, params_dict)
@@ -540,7 +541,7 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di
         elif 'down' in name:
             self._load_weight_experts_down(name, loaded_weight, params_dict)
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/internlm.py b/lmdeploy/pytorch/models/internlm.py
index dcd0e40a76..ad53d6970b 100644
--- a/lmdeploy/pytorch/models/internlm.py
+++ b/lmdeploy/pytorch/models/internlm.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -8,8 +9,13 @@
 
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding
-from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj,
-                                        build_rowwise_linear)
+from lmdeploy.pytorch.nn.linear import (
+    build_down_linear,
+    build_gateup_linear,
+    build_o_proj,
+    build_qkv_proj,
+    build_rowwise_linear,
+)
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .utils.cudagraph import CudaGraphMixin
@@ -60,8 +66,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -171,9 +177,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
 
@@ -248,10 +254,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """Rewrite of LlamaModel.forward."""
 
@@ -324,7 +330,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -349,8 +355,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -376,7 +382,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/internlm2.py b/lmdeploy/pytorch/models/internlm2.py
index 661ce2ddbc..acf54e40ef 100644
--- a/lmdeploy/pytorch/models/internlm2.py
+++ b/lmdeploy/pytorch/models/internlm2.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -61,8 +62,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of InternLM2Attention.forward."""
@@ -172,9 +173,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
 
@@ -230,10 +231,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """Rewrite of forward."""
 
@@ -298,7 +299,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -319,8 +320,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -346,7 +347,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_lora_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], adapter_id: int):
+    def load_lora_weights(self, weights: Iterable[tuple[str, torch.Tensor]], adapter_id: int):
         """Load lora weights."""
 
         from lmdeploy.pytorch.adapter.adapter import load_lora_weights
@@ -370,7 +371,7 @@ def _rearange_wqkv(weights):
         weights_iter = _rearange_wqkv(weights)
         load_lora_weights(self, weights_iter, adapter_id)
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/internlm2_reward.py b/lmdeploy/pytorch/models/internlm2_reward.py
index 886c884d6a..a393b57135 100644
--- a/lmdeploy/pytorch/models/internlm2_reward.py
+++ b/lmdeploy/pytorch/models/internlm2_reward.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -41,7 +42,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -66,8 +67,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -89,7 +90,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_lora_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], adapter_id: int):
+    def load_lora_weights(self, weights: Iterable[tuple[str, torch.Tensor]], adapter_id: int):
         """Load lora weights."""
 
         from lmdeploy.pytorch.adapter.adapter import load_lora_weights
@@ -113,7 +114,7 @@ def _rearange_wqkv(weights):
         weights_iter = _rearange_wqkv(weights)
         load_lora_weights(self, weights_iter, adapter_id)
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/internlm2_ve.py b/lmdeploy/pytorch/models/internlm2_ve.py
index 3e5ace9016..1d084eb4ab 100644
--- a/lmdeploy/pytorch/models/internlm2_ve.py
+++ b/lmdeploy/pytorch/models/internlm2_ve.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -54,12 +55,12 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
-        vision_embedding_indexing: Optional[torch.Tensor] = None,
-        text_embedding_indexing: Optional[torch.Tensor] = None,
+        vision_embedding_indexing: torch.Tensor | None = None,
+        text_embedding_indexing: torch.Tensor | None = None,
     ):
 
         if residual is None:
@@ -141,12 +142,12 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        vision_embedding_indexing: Optional[torch.Tensor] = None,
-        text_embedding_indexing: Optional[torch.Tensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        vision_embedding_indexing: torch.Tensor | None = None,
+        text_embedding_indexing: torch.Tensor | None = None,
     ):
         """Rewrite of forward."""
 
@@ -216,11 +217,11 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
-        vision_embedding_indexing: Optional[torch.Tensor] = None,
-        text_embedding_indexing: Optional[torch.Tensor] = None,
+        vision_embedding_indexing: torch.Tensor | None = None,
+        text_embedding_indexing: torch.Tensor | None = None,
         **kwargs,
     ):
         """Model forward, return logits."""
@@ -259,8 +260,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -286,7 +287,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/internlm3.py b/lmdeploy/pytorch/models/internlm3.py
index 3005232c06..0f1af8f769 100644
--- a/lmdeploy/pytorch/models/internlm3.py
+++ b/lmdeploy/pytorch/models/internlm3.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -62,8 +63,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of InternLM3Attention.forward."""
@@ -174,9 +175,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
 
@@ -233,10 +234,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """Rewrite of InternLM3Model.forward."""
 
@@ -305,7 +306,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -326,8 +327,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -353,7 +354,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/interns1_pro.py b/lmdeploy/pytorch/models/interns1_pro.py
index 51ed9deaf6..a986a49887 100644
--- a/lmdeploy/pytorch/models/interns1_pro.py
+++ b/lmdeploy/pytorch/models/interns1_pro.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -80,7 +81,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         pixel_values: torch.Tensor = None,
@@ -150,8 +151,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -239,8 +240,8 @@ def rename_weight(cls, name: str) -> str:
             return name[len('model.'):]
         return name
 
-    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter],
-                             expert_params_mapping: List):
+    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter],
+                             expert_params_mapping: list):
         """Load weight experts."""
 
         for (param_name, weight_name, expert_id, shard_id) in expert_params_mapping:
@@ -255,8 +256,8 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di
             load_weight(param, loaded_weight)
 
     # modify from vllm qwen3vlmoe fused expert loading
-    def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter],
-                                   fused_expert_params_mapping: List):
+    def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter],
+                                   fused_expert_params_mapping: list):
         """Load weight of fused expert weights."""
         num_experts = self.config.text_config.num_experts
 
@@ -279,7 +280,7 @@ def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, par
                 for expert_id in range(num_experts):
                     load_weight(param, w2[expert_id], expert_id=expert_id, shard_id='down')
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
@@ -366,8 +367,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype) -> None:
         self.dtype = dtype
 
     def preprocess_input(self,
-                         input_ids: List[int],
-                         input_multimodals: List[Dict[str, Any]] = None,
+                         input_ids: list[int],
+                         input_multimodals: list[dict[str, Any]] = None,
                          **kwargs) -> PreprocessInputResult:
         """Prepare multimodal input."""
         if input_multimodals is None or len(input_multimodals) == 0:
diff --git a/lmdeploy/pytorch/models/interns1_pro_ts.py b/lmdeploy/pytorch/models/interns1_pro_ts.py
index 48ba00fcef..1bc4ecd910 100644
--- a/lmdeploy/pytorch/models/interns1_pro_ts.py
+++ b/lmdeploy/pytorch/models/interns1_pro_ts.py
@@ -1,7 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
 import math
-from typing import Optional, Tuple, Union
 
 import torch
 from torch import nn
@@ -224,7 +223,7 @@ def forward_encoder(self, x):
         # conv1
         # treat each channel as an independent sample and feed it into conv1
         x = x.reshape(num_patch * C, 1, patch_len)
-        x = nn.functional.relu((self.conv(x)))  # [B*C, D1, L]
+        x = nn.functional.relu(self.conv(x))  # [B*C, D1, L]
         x = x.permute(2, 0, 1)  # [L, B*C, D1]
 
         x = self.pos_encoder(x)  # [L, B*C, D1]
@@ -272,11 +271,11 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
 
     def forward(
         self,
-        time_series_signals: Optional[torch.FloatTensor] = None,
-        ts_lens: Optional[torch.Tensor] = None,
-        sr: Optional[torch.Tensor] = None,
-        time_series_embeds: Optional[torch.FloatTensor] = None,
-    ) -> Union[Tuple]:
+        time_series_signals: torch.FloatTensor | None = None,
+        ts_lens: torch.Tensor | None = None,
+        sr: torch.Tensor | None = None,
+        time_series_embeds: torch.FloatTensor | None = None,
+    ) -> tuple:
         if time_series_signals is None and time_series_embeds is None:
             raise ValueError('You have to specify time_series_signals or time_series_embeds')
 
diff --git a/lmdeploy/pytorch/models/internvl.py b/lmdeploy/pytorch/models/internvl.py
index 5b6c261dd2..5f4103fe92 100644
--- a/lmdeploy/pytorch/models/internvl.py
+++ b/lmdeploy/pytorch/models/internvl.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 import torch.nn.functional as F
@@ -277,7 +278,7 @@ def post_rms_norm(self, q: torch.Tensor, k: torch.Tensor, variance: torch.Tensor
         eps = self.config.layer_norm_eps
         return post_rms_norm(q, k, self.q_norm.weight, self.k_norm.weight, variance, eps, self.embed_dim, dtype)
 
-    def qkv_norm(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    def qkv_norm(self, q: torch.Tensor, k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
         import lmdeploy.pytorch.distributed as dist
         q_shape = q.shape
         k_shape = k.shape
@@ -432,7 +433,7 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
 
     def forward(
         self,
-        pixel_values: Optional[torch.FloatTensor] = None,
+        pixel_values: torch.FloatTensor | None = None,
     ):
         """forward."""
         assert pixel_values.dim() == 4
@@ -713,7 +714,7 @@ def extract_and_compress(self, pixel_values: torch.Tensor, input_ids: torch.Tens
 
         return vit_embeds, new_lang_embeds, new_input_ids, new_image_mask, new_seq_lengths
 
-    def update_forward_inputs(self, input_ids: torch.Tensor, new_seqlens: List[int],
+    def update_forward_inputs(self, input_ids: torch.Tensor, new_seqlens: list[int],
                               context: StepContext) -> StepContext:
         """Update the forward inputs, position_ids and attention metadata."""
         from lmdeploy.pytorch.model_inputs import ModelInputs
@@ -758,7 +759,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         pixel_values: torch.Tensor = None,
         image_mask: torch.Tensor = None,
@@ -808,7 +809,7 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         inputs_embeds: torch.Tensor = None,
         context: StepContext = None,
     ):
@@ -921,7 +922,7 @@ def prepare_inputs_for_generation(
                         image_token_id=image_token_id,
                         context=context)
 
-    def load_lora_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], adapter_id: int):
+    def load_lora_weights(self, weights: Iterable[tuple[str, torch.Tensor]], adapter_id: int):
         """Load lora weights."""
 
         if hasattr(self.language_model, 'load_lora_weights'):
@@ -931,7 +932,7 @@ def load_lora_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], adapter
 
             return load_lora_weights(weights, adapter_id)
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
 
         lang_prefix = 'language_model.'
@@ -976,8 +977,8 @@ def __init__(self, config: PretrainedConfig, dtype) -> None:
         self.vision_token_num = self.num_patches // 4
 
     def preprocess_input(self,
-                         input_ids: List[int],
-                         input_multimodals: List[Dict[str, Any]] = None,
+                         input_ids: list[int],
+                         input_multimodals: list[dict[str, Any]] = None,
                          **kwargs) -> PreprocessInputResult:
         """Prepare multimodal input."""
         if input_multimodals is None or len(input_multimodals) == 0:
diff --git a/lmdeploy/pytorch/models/internvl3_hf.py b/lmdeploy/pytorch/models/internvl3_hf.py
index 7cd4cd940c..9a13d3c227 100644
--- a/lmdeploy/pytorch/models/internvl3_hf.py
+++ b/lmdeploy/pytorch/models/internvl3_hf.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 import torch.nn.functional as F
@@ -218,7 +219,7 @@ def post_rms_norm(self, q: torch.Tensor, k: torch.Tensor, variance: torch.Tensor
         eps = self.config.layer_norm_eps
         return post_rms_norm(q, k, self.q_norm.weight, self.k_norm.weight, variance, eps, self.embed_dim, dtype)
 
-    def qkv_norm(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    def qkv_norm(self, q: torch.Tensor, k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
         import lmdeploy.pytorch.distributed as dist
         q_shape = q.shape
         k_shape = k.shape
@@ -387,7 +388,7 @@ def get_input_embeddings(self):
 
     def forward(
         self,
-        pixel_values: Optional[torch.FloatTensor] = None,
+        pixel_values: torch.FloatTensor | None = None,
     ):
         """forward."""
         assert pixel_values.dim() == 4
@@ -493,7 +494,7 @@ def get_input_embeddings(self):
     def get_image_features(
         self,
         pixel_values: torch.FloatTensor,
-        vision_feature_layer: Union[int, List[int]],
+        vision_feature_layer: int | list[int],
         vision_feature_select_strategy: str,
         **kwargs,
     ):
@@ -503,7 +504,7 @@ def get_image_features(
         Args:
             pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
                The tensors corresponding to the input images.
-            vision_feature_layer (`int` or `List[int]`):
+            vision_feature_layer (`int` or `list[int]`):
                 Layer index or list of layer indices to extract features from.
         Returns:
             vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
@@ -574,7 +575,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         pixel_values: torch.Tensor = None,
         image_mask: torch.Tensor = None,
@@ -610,7 +611,7 @@ def forward(
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         inputs_embeds: torch.Tensor = None,
         context: StepContext = None,
     ):
@@ -653,7 +654,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_lora_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], adapter_id: int):
+    def load_lora_weights(self, weights: Iterable[tuple[str, torch.Tensor]], adapter_id: int):
         """Load lora weights."""
 
         if hasattr(self.model.language_model, 'load_lora_weights'):
@@ -674,7 +675,7 @@ def rename_weight(cls, name: str) -> str:
             return name[len('model.'):]
         return name
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
 
         lang_prefix = 'language_model.'
@@ -720,8 +721,8 @@ def __init__(self, config: PretrainedConfig, dtype) -> None:
         self.dtype = dtype
 
     def preprocess_input(self,
-                         input_ids: List[int],
-                         input_multimodals: List[Dict[str, Any]] = None,
+                         input_ids: list[int],
+                         input_multimodals: list[dict[str, Any]] = None,
                          **kwargs) -> PreprocessInputResult:
         """Prepare multimodal input."""
         if input_multimodals is None or len(input_multimodals) == 0:
diff --git a/lmdeploy/pytorch/models/internvl_patch.py b/lmdeploy/pytorch/models/internvl_patch.py
index 5f25c0dd85..1a53bc68ce 100644
--- a/lmdeploy/pytorch/models/internvl_patch.py
+++ b/lmdeploy/pytorch/models/internvl_patch.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Optional
 
 import torch
 import torch.nn.functional as F
@@ -65,7 +64,7 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
 
     def forward(
         self,
-        pixel_values: Optional[torch.FloatTensor] = None,
+        pixel_values: torch.FloatTensor | None = None,
     ):
         if len(pixel_values.shape) != 4:
             raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
diff --git a/lmdeploy/pytorch/models/llama.py b/lmdeploy/pytorch/models/llama.py
index 6ab07b9c11..c7fd6354ac 100644
--- a/lmdeploy/pytorch/models/llama.py
+++ b/lmdeploy/pytorch/models/llama.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -8,8 +9,13 @@
 
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config
-from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj,
-                                        build_rowwise_linear)
+from lmdeploy.pytorch.nn.linear import (
+    build_down_linear,
+    build_gateup_linear,
+    build_o_proj,
+    build_qkv_proj,
+    build_rowwise_linear,
+)
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .utils.cudagraph import CudaGraphMixin
@@ -63,8 +69,8 @@ def __init__(self, config: LlamaConfig, dtype: torch.dtype = None, device: torch
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -176,9 +182,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
 
@@ -223,7 +229,7 @@ def __init__(self, config: LlamaConfig, dtype: torch.dtype = None, device: torch
             LlamaDecoderLayer(config, layer_idx, dtype=dtype, device=device)
             for layer_idx in range(config.num_hidden_layers)
         ])
-        self.aux_hidden_state_layers: Tuple[int] = getattr(config, 'aux_hidden_state_layers', tuple())
+        self.aux_hidden_state_layers: tuple[int] = getattr(config, 'aux_hidden_state_layers', tuple())
         # build norm
         self.norm = RMSNorm(config.hidden_size, config.rms_norm_eps, dtype=dtype, device=device)
 
@@ -233,10 +239,10 @@ def __init__(self, config: LlamaConfig, dtype: torch.dtype = None, device: torch
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """Rewrite of LlamaModel.forward."""
 
@@ -317,7 +323,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -346,7 +352,7 @@ def get_input_embeddings(self):
         """Get input embeddings."""
         return self.model.get_input_embeddings()
 
-    def get_outputs_cudagraph(self, output_buffers: Dict[str, torch.Tensor], input_ids: torch.Tensor, **kwargs):
+    def get_outputs_cudagraph(self, output_buffers: dict[str, torch.Tensor], input_ids: torch.Tensor, **kwargs):
         """Get outputs from buffers."""
         num_tokens = input_ids.size(-1)
         outputs = dict()
@@ -357,8 +363,8 @@ def get_outputs_cudagraph(self, output_buffers: Dict[str, torch.Tensor], input_i
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -384,7 +390,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/llama4.py b/lmdeploy/pytorch/models/llama4.py
index 4b3c2196bc..5ffc42c8e9 100644
--- a/lmdeploy/pytorch/models/llama4.py
+++ b/lmdeploy/pytorch/models/llama4.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -10,8 +11,12 @@
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
 from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config
-from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_merged_colwise_linear, build_qkv_proj,
-                                        build_rowwise_linear)
+from lmdeploy.pytorch.nn.linear import (
+    build_colwise_linear,
+    build_merged_colwise_linear,
+    build_qkv_proj,
+    build_rowwise_linear,
+)
 from lmdeploy.pytorch.nn.moe import build_fused_moe
 from lmdeploy.pytorch.nn.rotary_embedding import get_rope_theta
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
@@ -77,8 +82,8 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """forward."""
@@ -271,9 +276,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
         """forward."""
@@ -331,7 +336,7 @@ def forward(
         self,
         inputs_embeds: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         **kwargs,
     ):
@@ -382,7 +387,7 @@ def forward(
         self,
         inputs_embeds: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         **kwargs,
     ):
@@ -482,7 +487,7 @@ def vision_apply_rotary_emb(
     query: torch.Tensor,
     key: torch.Tensor,
     freqs_ci: torch.Tensor,
-) -> Tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:
     query_ = torch.view_as_complex(query.float().reshape(*query.shape[:-1], -1, 2))
     key_ = torch.view_as_complex(key.float().reshape(*key.shape[:-1], -1, 2))
     freqs_ci = reshape_for_broadcast(freqs_ci=freqs_ci, query=query_)  # freqs_ci[:,:,None,:]
@@ -849,7 +854,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         pixel_values: torch.FloatTensor = None,
         image_mask: torch.Tensor = None,
@@ -882,8 +887,8 @@ def get_logits(self, hidden_states: torch.Tensor):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -918,7 +923,7 @@ def prepare_inputs_for_generation(
             image_mask=image_mask,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
 
         def _load_experts_bf16(name, loaded_weight):
@@ -1016,8 +1021,8 @@ def __init__(self, config: Llama4Config, dtype) -> None:
         self.vision_config = config.vision_config
 
     def preprocess_input(self,
-                         input_ids: List[int],
-                         input_multimodals: List[Dict[str, Any]] = None,
+                         input_ids: list[int],
+                         input_multimodals: list[dict[str, Any]] = None,
                          **kwargs) -> PreprocessInputResult:
         """Prepare multimodal input."""
 
diff --git a/lmdeploy/pytorch/models/llama_eagle.py b/lmdeploy/pytorch/models/llama_eagle.py
index d581e1ff3d..02ddd91d10 100644
--- a/lmdeploy/pytorch/models/llama_eagle.py
+++ b/lmdeploy/pytorch/models/llama_eagle.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -62,11 +63,11 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        previous_hidden_states: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        previous_hidden_states: torch.FloatTensor | None = None,
     ):
         """Rewrite of LlamaModel.forward."""
         # token embedding
@@ -126,7 +127,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         target_hidden_states: torch.Tensor = None,
@@ -145,8 +146,8 @@ def forward(
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -205,7 +206,7 @@ def get_input_embeddings(self):
         """Get input embeddings."""
         return self.model.get_input_embeddings()
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
diff --git a/lmdeploy/pytorch/models/llama_eagle3.py b/lmdeploy/pytorch/models/llama_eagle3.py
index b37a2337bf..eb683c52ac 100644
--- a/lmdeploy/pytorch/models/llama_eagle3.py
+++ b/lmdeploy/pytorch/models/llama_eagle3.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -51,8 +52,8 @@ def forward(
         self,
         embeds: torch.Tensor,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
         attn_metadata: Any = None,
     ):
 
@@ -109,11 +110,11 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        previous_hidden_states: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        previous_hidden_states: torch.FloatTensor | None = None,
     ):
         """Rewrite of LlamaModel.forward."""
         # token embedding
@@ -189,7 +190,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         target_hidden_states: torch.Tensor = None,
@@ -208,8 +209,8 @@ def forward(
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -262,7 +263,7 @@ def fill_buffers_cudagraph(self, graph_meta: CudaGraphMeta, **kwargs):
 
         return new_inputs
 
-    def get_outputs_cudagraph(self, output_buffers: Dict[str, torch.Tensor], input_ids: torch.Tensor, **kwargs):
+    def get_outputs_cudagraph(self, output_buffers: dict[str, torch.Tensor], input_ids: torch.Tensor, **kwargs):
         """Get outputs from buffers."""
         num_tokens = input_ids.size(-1)
         outputs = dict()
@@ -274,7 +275,7 @@ def get_input_embeddings(self):
         """Get input embeddings."""
         return self.model.get_input_embeddings()
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
diff --git a/lmdeploy/pytorch/models/llava.py b/lmdeploy/pytorch/models/llava.py
index e87242df4c..cb14bc443d 100644
--- a/lmdeploy/pytorch/models/llava.py
+++ b/lmdeploy/pytorch/models/llava.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 import torch.nn.functional as F
@@ -174,8 +175,8 @@ def __init__(self, config, dtype: torch.dtype = None, device: torch.device = Non
     def forward(
         self,
         hidden_states,
-        attention_mask: Optional[torch.Tensor] = None,
-        causal_attention_mask: Optional[torch.Tensor] = None,
+        attention_mask: torch.Tensor | None = None,
+        causal_attention_mask: torch.Tensor | None = None,
     ):
         """forward."""
         # qkv proj
@@ -287,8 +288,8 @@ def __init__(self, config, dtype: torch.dtype = None, device: torch.device = Non
     def forward(
         self,
         inputs_embeds,
-        attention_mask: Optional[torch.Tensor] = None,
-        causal_attention_mask: Optional[torch.Tensor] = None,
+        attention_mask: torch.Tensor | None = None,
+        causal_attention_mask: torch.Tensor | None = None,
         vision_feature_layer: int = -1,
     ):
         """forward."""
@@ -414,7 +415,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         pixel_values: torch.Tensor = None,
         image_mask: torch.Tensor = None,
@@ -449,7 +450,7 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         inputs_embeds: torch.Tensor = None,
         context: StepContext = None,
     ):
@@ -492,7 +493,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
 
         stacked_params_mapping = [
@@ -539,8 +540,8 @@ def __init__(self, config: PretrainedConfig, dtype) -> None:
         self.dtype = dtype
 
     def preprocess_input(self,
-                         input_ids: List[int],
-                         input_multimodals: List[Dict[str, Any]] = None,
+                         input_ids: list[int],
+                         input_multimodals: list[dict[str, Any]] = None,
                          **kwargs) -> PreprocessInputResult:
         """Prepare multimodal input."""
         if input_multimodals is None or len(input_multimodals) == 0:
@@ -721,7 +722,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         pixel_values: torch.Tensor = None,
         image_sizes: torch.Tensor = None,
@@ -762,7 +763,7 @@ def get_input_processor(self) -> BaseModelInputProcessor:
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         inputs_embeds: torch.Tensor = None,
         context: StepContext = None,
     ):
@@ -817,8 +818,8 @@ def __init__(self, config: PretrainedConfig, dtype) -> None:
         self.dtype = dtype
 
     def preprocess_input(self,
-                         input_ids: List[int],
-                         input_multimodals: List[Dict[str, Any]] = None,
+                         input_ids: list[int],
+                         input_multimodals: list[dict[str, Any]] = None,
                          **kwargs) -> PreprocessInputResult:
         """Prepare multimodal input."""
         if input_multimodals is None or len(input_multimodals) == 0:
diff --git a/lmdeploy/pytorch/models/minicpm3.py b/lmdeploy/pytorch/models/minicpm3.py
index 6513a3ff46..9dec51f076 100644
--- a/lmdeploy/pytorch/models/minicpm3.py
+++ b/lmdeploy/pytorch/models/minicpm3.py
@@ -1,7 +1,8 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
 import math
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -11,8 +12,12 @@
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
 from lmdeploy.pytorch.nn import Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding
 from lmdeploy.pytorch.nn.linear import build_colwise_linear, build_merged_colwise_linear, build_rowwise_linear
-from lmdeploy.pytorch.nn.rotary_embedding import (ApplyRotaryEmb, LongRoPEScalingParameters, get_rope_parameters,
-                                                  get_rope_theta)
+from lmdeploy.pytorch.nn.rotary_embedding import (
+    ApplyRotaryEmb,
+    LongRoPEScalingParameters,
+    get_rope_parameters,
+    get_rope_theta,
+)
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .utils.cudagraph import CudaGraphMixin
@@ -107,8 +112,8 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -243,8 +248,8 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
         attn_metadata: Any = None,
     ):
 
@@ -328,10 +333,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """Rewrite of LlamaModel.forward."""
 
@@ -396,7 +401,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -424,8 +429,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -451,7 +456,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/minicpmv26.py b/lmdeploy/pytorch/models/minicpmv26.py
index fa049bbb98..f72549f7dc 100644
--- a/lmdeploy/pytorch/models/minicpmv26.py
+++ b/lmdeploy/pytorch/models/minicpmv26.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -60,8 +61,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -171,9 +172,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
 
@@ -228,10 +229,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """Rewrite of LlamaModel.forward."""
 
@@ -299,7 +300,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -330,8 +331,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -357,7 +358,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/mistral.py b/lmdeploy/pytorch/models/mistral.py
index 4c8304400f..e3baee0417 100644
--- a/lmdeploy/pytorch/models/mistral.py
+++ b/lmdeploy/pytorch/models/mistral.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -8,8 +9,13 @@
 
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config
-from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj,
-                                        build_rowwise_linear)
+from lmdeploy.pytorch.nn.linear import (
+    build_down_linear,
+    build_gateup_linear,
+    build_o_proj,
+    build_qkv_proj,
+    build_rowwise_linear,
+)
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .utils.cudagraph import CudaGraphMixin
@@ -64,8 +70,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -175,9 +181,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
 
@@ -232,10 +238,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """Rewrite of LlamaModel.forward."""
 
@@ -308,7 +314,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -333,8 +339,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -360,7 +366,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/mixtral.py b/lmdeploy/pytorch/models/mixtral.py
index 434a31f234..f770877f91 100644
--- a/lmdeploy/pytorch/models/mixtral.py
+++ b/lmdeploy/pytorch/models/mixtral.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -62,8 +63,8 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -178,9 +179,9 @@ def __init__(self, config: Any, layer_idx: int, dtype: torch.dtype = None, devic
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
 
@@ -231,10 +232,10 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """forward."""
         if inputs_embeds is None:
@@ -288,7 +289,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -312,8 +313,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -329,7 +330,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/patch.py b/lmdeploy/pytorch/models/patch.py
index 0842904129..92d29e1d13 100644
--- a/lmdeploy/pytorch/models/patch.py
+++ b/lmdeploy/pytorch/models/patch.py
@@ -6,7 +6,7 @@
 import os.path as osp
 import re
 import sys
-from typing import Any, Dict
+from typing import Any
 
 import torch
 from transformers.configuration_utils import PretrainedConfig
@@ -21,7 +21,7 @@
 logger = get_logger('lmdeploy')
 
 
-def _get_rewrite_qualname(origin_qualname: str, module_map: Dict[str, str]) -> str:
+def _get_rewrite_qualname(origin_qualname: str, module_map: dict[str, str]) -> str:
     """Get rewrite module from origin module name.
 
     Args:
@@ -58,7 +58,7 @@ def _class_from_qualname(qualname: str) -> Any:
     return cls_type
 
 
-def _find_rewrite_module_qualname(model, module_map: Dict[str, str]):
+def _find_rewrite_module_qualname(model, module_map: dict[str, str]):
     """Find rewrite module."""
     module_name = inspect.getmodule(model).__name__
     class_name = model.__class__.__name__
@@ -93,7 +93,7 @@ def _find_submodulename():
     return rewrite_qualname
 
 
-def get_rewrite_cls(model: torch.nn.Module, module_map: Dict[str, str] = None):
+def get_rewrite_cls(model: torch.nn.Module, module_map: dict[str, str] = None):
     """Get rewrite cls."""
     if module_map is None:
         module_map = _get_module_map()
@@ -133,13 +133,13 @@ def update_custom_module_map(module_map_path: str):
     if hasattr(custom_mod, 'MODULE_MAP'):
         has_map = True
         mod_map = custom_mod.MODULE_MAP
-        assert isinstance(mod_map, Dict)
+        assert isinstance(mod_map, dict)
         new_mod_map.update(mod_map)
 
     if hasattr(custom_mod, 'CUSTOM_MODULE_MAP'):
         has_map = True
         mod_map = custom_mod.CUSTOM_MODULE_MAP
-        assert isinstance(mod_map, Dict)
+        assert isinstance(mod_map, dict)
         new_mod_map.update(mod_map)
 
     if not has_map:
@@ -216,7 +216,7 @@ def build_patched_model(config: ModelConfig, device: torch.device = None, build_
 
 @torch.inference_mode()
 def add_adapters(model: torch.nn.Module,
-                 adapters: Dict[str, str],
+                 adapters: dict[str, str],
                  dtype: torch.dtype = torch.float16,
                  device: torch.device = None):
     """Add adapters."""
diff --git a/lmdeploy/pytorch/models/phi3.py b/lmdeploy/pytorch/models/phi3.py
index c49b01737d..2f9a34a866 100644
--- a/lmdeploy/pytorch/models/phi3.py
+++ b/lmdeploy/pytorch/models/phi3.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -64,8 +65,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -175,9 +176,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
 
@@ -232,10 +233,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """Rewrite of LlamaModel.forward."""
 
@@ -299,7 +300,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -320,8 +321,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -347,7 +348,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
 
diff --git a/lmdeploy/pytorch/models/phi3_moe.py b/lmdeploy/pytorch/models/phi3_moe.py
index 2f60b16788..942e312757 100644
--- a/lmdeploy/pytorch/models/phi3_moe.py
+++ b/lmdeploy/pytorch/models/phi3_moe.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -9,8 +10,12 @@
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, LayerNorm, RopeType
 from lmdeploy.pytorch.nn.linear import build_qkv_proj, build_rowwise_linear
 from lmdeploy.pytorch.nn.moe import build_fused_moe
-from lmdeploy.pytorch.nn.rotary_embedding import (LongRoPEScalingParameters, build_rotary_embedding,
-                                                  get_rope_parameters, get_rope_theta)
+from lmdeploy.pytorch.nn.rotary_embedding import (
+    LongRoPEScalingParameters,
+    build_rotary_embedding,
+    get_rope_parameters,
+    get_rope_theta,
+)
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .utils.cudagraph import CudaGraphMixin
@@ -112,8 +117,8 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -223,9 +228,9 @@ def __init__(self, config: Any, layer_idx: int, dtype: torch.dtype = None, devic
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
         if residual is None:
@@ -304,10 +309,10 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """forward."""
         if inputs_embeds is None:
@@ -360,7 +365,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -384,8 +389,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -401,7 +406,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/phi3_v.py b/lmdeploy/pytorch/models/phi3_v.py
index c6804d5586..0e83a8edf0 100644
--- a/lmdeploy/pytorch/models/phi3_v.py
+++ b/lmdeploy/pytorch/models/phi3_v.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -236,13 +237,13 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        image_sizes: Optional[torch.LongTensor] = None,
+        pixel_values: torch.FloatTensor | None = None,
+        image_sizes: torch.LongTensor | None = None,
         image_mask: torch.Tensor = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """Rewrite of LlamaModel.forward."""
 
@@ -284,7 +285,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         pixel_values: torch.Tensor = None,
         image_sizes: torch.Tensor = None,
@@ -307,7 +308,7 @@ def forward(
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         inputs_embeds: torch.Tensor = None,
         context: StepContext = None,
     ):
@@ -333,7 +334,7 @@ def prepare_inputs_for_generation(
 
         return output
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         import itertools
 
@@ -362,8 +363,8 @@ def __init__(self, config: PretrainedConfig, dtype) -> None:
         self.dtype = dtype
 
     def preprocess_input(self,
-                         input_ids: List[int],
-                         input_multimodals: List[Dict[str, Any]] = None,
+                         input_ids: list[int],
+                         input_multimodals: list[dict[str, Any]] = None,
                          **kwargs) -> PreprocessInputResult:
         """Prepare multimodal input."""
         if input_multimodals is None or len(input_multimodals) == 0:
diff --git a/lmdeploy/pytorch/models/q_modules.py b/lmdeploy/pytorch/models/q_modules.py
index 36f9506327..c3ad77db34 100644
--- a/lmdeploy/pytorch/models/q_modules.py
+++ b/lmdeploy/pytorch/models/q_modules.py
@@ -5,8 +5,12 @@
 import torch
 import torch.nn as nn
 
-from ..kernels.w8a8_triton_kernels import (matmul_kernel_dynamic_quant, per_channel_quant, per_token_quant_int8,
-                                           rms_norm_dynamic_quant)
+from ..kernels.w8a8_triton_kernels import (
+    matmul_kernel_dynamic_quant,
+    per_channel_quant,
+    per_token_quant_int8,
+    rms_norm_dynamic_quant,
+)
 
 
 @dataclass
diff --git a/lmdeploy/pytorch/models/qwen.py b/lmdeploy/pytorch/models/qwen.py
index 6f7020abf4..650222c4b3 100644
--- a/lmdeploy/pytorch/models/qwen.py
+++ b/lmdeploy/pytorch/models/qwen.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -8,8 +9,13 @@
 
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, RopeType, SiluAndMul, build_rotary_embedding
-from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj,
-                                        build_rowwise_linear)
+from lmdeploy.pytorch.nn.linear import (
+    build_down_linear,
+    build_gateup_linear,
+    build_o_proj,
+    build_qkv_proj,
+    build_rowwise_linear,
+)
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .utils.cudagraph import CudaGraphMixin
@@ -65,8 +71,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -183,9 +189,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
 
@@ -245,10 +251,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """forward."""
 
@@ -317,7 +323,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -342,8 +348,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -369,7 +375,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/qwen2.py b/lmdeploy/pytorch/models/qwen2.py
index ddd5b4dec9..963c5d33b4 100644
--- a/lmdeploy/pytorch/models/qwen2.py
+++ b/lmdeploy/pytorch/models/qwen2.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -61,8 +62,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -172,9 +173,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
 
@@ -231,10 +232,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """Rewrite of LlamaModel.forward."""
 
@@ -303,7 +304,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -324,8 +325,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -351,7 +352,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/qwen2_5_vl.py b/lmdeploy/pytorch/models/qwen2_5_vl.py
index 9c19a7de21..cb547a5793 100644
--- a/lmdeploy/pytorch/models/qwen2_5_vl.py
+++ b/lmdeploy/pytorch/models/qwen2_5_vl.py
@@ -2,7 +2,8 @@
 # adapted from:
 # https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py
 
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 import torch.nn.functional as F
@@ -118,7 +119,7 @@ def __init__(self,
         )
 
     def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor,
-                rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor]) -> torch.Tensor:
+                rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor]) -> torch.Tensor:
         seq_length = hidden_states.shape[0]
         # qkv proj
         qkv_states = self.qkv(hidden_states)
@@ -198,7 +199,7 @@ def __init__(self,
     def forward(self,
                 hidden_states: torch.Tensor,
                 cu_seqlens: torch.Tensor,
-                rotary_pos_emb: Optional[torch.Tensor] = None) -> torch.Tensor:
+                rotary_pos_emb: torch.Tensor | None = None) -> torch.Tensor:
         hidden_states = hidden_states + self.attn(
             self.norm1(hidden_states),
             cu_seqlens=cu_seqlens,
@@ -341,7 +342,7 @@ def forward(self,
                 cu_seqlens: torch.Tensor,
                 rotary_pos_emb: torch.Tensor,
                 window_index: torch.Tensor = None,
-                cu_window_seqlens: List = None) -> torch.Tensor:
+                cu_window_seqlens: list = None) -> torch.Tensor:
         """forward."""
         hidden_states = self.patch_embed(hidden_states)
 
@@ -417,7 +418,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         mrope_position_ids: torch.Tensor = None,
@@ -425,7 +426,7 @@ def forward(
         vis_cu_seqlens: torch.Tensor = None,
         vis_pos_emb: torch.Tensor = None,
         window_index: torch.Tensor = None,
-        cu_window_seqlens: List = None,
+        cu_window_seqlens: list = None,
         image_mask: torch.Tensor = None,
         **kwargs,
     ):
@@ -458,8 +459,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -526,7 +527,7 @@ def prepare_inputs_for_generation(
             image_mask=image_mask,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
@@ -673,8 +674,8 @@ def _update_model_meta_prefilling(self, context: StepContext):
         return new_model_metas
 
     def update_model_metas(self,
-                           past_key_values: List[List[torch.Tensor]],
-                           inputs_embeds: Optional[torch.Tensor] = None,
+                           past_key_values: list[list[torch.Tensor]],
+                           inputs_embeds: torch.Tensor | None = None,
                            context: StepContext = None):
         """Update model meta."""
         if context.is_decoding:
@@ -687,7 +688,7 @@ def get_input_processor(self) -> BaseModelInputProcessor:
         return self.input_processor
 
 
-InputMultiModalType = List[Dict[str, Any]]
+InputMultiModalType = list[dict[str, Any]]
 
 
 class Qwen2_5_VLInputProcessor(BaseModelInputProcessor):
@@ -697,8 +698,8 @@ def __init__(self, config: PretrainedConfig) -> None:
         self.config = config
 
     def preprocess_input(self,
-                         input_ids: List[int],
-                         input_multimodals: List[Dict[str, Any]] = None,
+                         input_ids: list[int],
+                         input_multimodals: list[dict[str, Any]] = None,
                          **kwargs) -> PreprocessInputResult:
         """Prepare multimodal input."""
         if input_multimodals is None or len(input_multimodals) == 0:
diff --git a/lmdeploy/pytorch/models/qwen2_moe.py b/lmdeploy/pytorch/models/qwen2_moe.py
index c86608fc12..aeb7b9efea 100644
--- a/lmdeploy/pytorch/models/qwen2_moe.py
+++ b/lmdeploy/pytorch/models/qwen2_moe.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 import torch.nn.functional as F
@@ -66,8 +67,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -274,9 +275,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
 
@@ -333,10 +334,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """Rewrite of LlamaModel.forward."""
 
@@ -405,7 +406,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -426,8 +427,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -453,8 +454,8 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter],
-                             expert_params_mapping: List):
+    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter],
+                             expert_params_mapping: list):
         """Load weight experts."""
         for (param_name, weight_name, expert_id, shard_id) in expert_params_mapping:
             if weight_name not in name:
@@ -467,7 +468,7 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di
             param = params_dict[name]
             load_weight(param, loaded_weight)
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/qwen2_reward.py b/lmdeploy/pytorch/models/qwen2_reward.py
index f65fa8fe7a..aaefcc36ed 100644
--- a/lmdeploy/pytorch/models/qwen2_reward.py
+++ b/lmdeploy/pytorch/models/qwen2_reward.py
@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -54,7 +55,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -84,8 +85,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -103,7 +104,7 @@ def prepare_inputs_for_generation(
             # inputs_embeds=inputs_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/qwen2_vl.py b/lmdeploy/pytorch/models/qwen2_vl.py
index 605f8ded76..1d3bfd0bc1 100644
--- a/lmdeploy/pytorch/models/qwen2_vl.py
+++ b/lmdeploy/pytorch/models/qwen2_vl.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Callable, Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -9,17 +10,28 @@
 from lmdeploy.pytorch.engine.input_process import BaseModelInputProcessor, PreprocessInputResult
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
 from lmdeploy.pytorch.multimodal.data_type import MultiModalTensor
-from lmdeploy.pytorch.nn import (ApplyRotaryEmb, Attention, FlashAttention, LayerNorm, RMSNorm, SiluAndMul,
-                                 build_rotary_embedding_from_config)
-from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_merged_colwise_linear, build_qkv_proj,
-                                        build_rowwise_linear)
+from lmdeploy.pytorch.nn import (
+    ApplyRotaryEmb,
+    Attention,
+    FlashAttention,
+    LayerNorm,
+    RMSNorm,
+    SiluAndMul,
+    build_rotary_embedding_from_config,
+)
+from lmdeploy.pytorch.nn.linear import (
+    build_colwise_linear,
+    build_merged_colwise_linear,
+    build_qkv_proj,
+    build_rowwise_linear,
+)
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .utils.cudagraph import CudaGraphMeta, CudaGraphMixin
 from .utils.model import DeployModelMixinV1, build_embedding, vlm_model
 
 
-def _apply_mrope_selection(hidden_states: torch.Tensor, mrope_position_ids: torch.Tensor, mrope_section: List[int],
+def _apply_mrope_selection(hidden_states: torch.Tensor, mrope_position_ids: torch.Tensor, mrope_section: list[int],
                            position_ids: torch.Tensor, rotary_emb_func: Callable):
     _mrope_position_ids = torch.zeros(3, position_ids.shape[-1], dtype=position_ids.dtype, device=position_ids.device)
     _mrope_position_ids[:, :mrope_position_ids.shape[-1]] = mrope_position_ids
@@ -87,8 +99,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -198,9 +210,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
 
@@ -258,10 +270,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
         mrope_position_ids: torch.LongTensor = None,
     ):
         """Rewrite of LlamaModel.forward."""
@@ -393,7 +405,7 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
                                          is_tp=True)
 
     def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor,
-                rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor]) -> torch.Tensor:
+                rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor]) -> torch.Tensor:
         seq_length = hidden_states.shape[0]
         # qkv proj
         qkv_states = self.qkv(hidden_states)
@@ -480,7 +492,7 @@ def forward(self,
                 hidden_states,
                 cu_seqlens,
                 rotary_pos_emb,
-                residual: Optional[torch.Tensor] = None) -> torch.Tensor:
+                residual: torch.Tensor | None = None) -> torch.Tensor:
         if residual is None:
             residual = hidden_states
             hidden_states = self.norm1(hidden_states)
@@ -638,7 +650,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         mrope_position_ids: torch.Tensor = None,
@@ -674,8 +686,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -729,7 +741,7 @@ def prepare_inputs_for_generation(
             image_mask=image_mask,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
@@ -876,8 +888,8 @@ def _update_model_meta_prefilling(self, context: StepContext):
         return new_model_metas
 
     def update_model_metas(self,
-                           past_key_values: List[List[torch.Tensor]],
-                           inputs_embeds: Optional[torch.Tensor] = None,
+                           past_key_values: list[list[torch.Tensor]],
+                           inputs_embeds: torch.Tensor | None = None,
                            context: StepContext = None):
         """Update model meta."""
         if context.is_decoding:
@@ -890,7 +902,7 @@ def get_input_processor(self) -> BaseModelInputProcessor:
         return self.input_processor
 
 
-InputMultiModalType = List[Dict[str, Any]]
+InputMultiModalType = list[dict[str, Any]]
 
 
 class Qwen2VLInputProcessor(BaseModelInputProcessor):
@@ -900,8 +912,8 @@ def __init__(self, config: PretrainedConfig) -> None:
         self.config = config
 
     def preprocess_input(self,
-                         input_ids: List[int],
-                         input_multimodals: List[Dict[str, Any]] = None,
+                         input_ids: list[int],
+                         input_multimodals: list[dict[str, Any]] = None,
                          **kwargs) -> PreprocessInputResult:
         """Prepare multimodal input."""
         if input_multimodals is None or len(input_multimodals) == 0:
diff --git a/lmdeploy/pytorch/models/qwen3.py b/lmdeploy/pytorch/models/qwen3.py
index e69efefda5..11326d3225 100644
--- a/lmdeploy/pytorch/models/qwen3.py
+++ b/lmdeploy/pytorch/models/qwen3.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -77,8 +78,8 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -207,9 +208,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
 
@@ -274,10 +275,10 @@ def __init__(self,
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """Rewrite of LlamaModel.forward."""
 
@@ -347,7 +348,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -368,8 +369,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -395,7 +396,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/qwen3_5.py b/lmdeploy/pytorch/models/qwen3_5.py
index 59f373f075..8930acf978 100644
--- a/lmdeploy/pytorch/models/qwen3_5.py
+++ b/lmdeploy/pytorch/models/qwen3_5.py
@@ -1,7 +1,8 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
+from collections.abc import Iterable
 from functools import lru_cache
-from typing import Any, Iterable, List, Tuple
+from typing import Any
 
 import numpy as np
 import torch
@@ -16,8 +17,13 @@
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, LayerNorm, RMSNorm, SiluAndMul
 from lmdeploy.pytorch.nn.gated_delta import CausalConv1d, GatedDelta, GatedDeltaMeta, build_rmsnorm_gated
-from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_merged_colwise_linear, build_o_proj, build_qkv_proj,
-                                        build_rowwise_linear)
+from lmdeploy.pytorch.nn.linear import (
+    build_colwise_linear,
+    build_merged_colwise_linear,
+    build_o_proj,
+    build_qkv_proj,
+    build_rowwise_linear,
+)
 from lmdeploy.pytorch.nn.rotary_embedding import get_rope_parameters
 from lmdeploy.pytorch.weight_loader.model_weight_loader import default_weight_loader, load_weight
 
@@ -243,7 +249,7 @@ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
         return rotary_pos_emb
 
     # copy from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/qwen3_vl.py#L474
-    def fast_pos_embed_interpolate(self, grid_thw: List[List[int]]) -> torch.Tensor:
+    def fast_pos_embed_interpolate(self, grid_thw: list[list[int]]) -> torch.Tensor:
         num_grid_per_side = self.num_grid_per_side
         m_size = self.spatial_merge_size
         hidden_dim = self.pos_embed.embedding_dim
@@ -486,14 +492,14 @@ def fix_zba_ordering(self, mixed_zba: torch.Tensor):
         z = z.unflatten(-1, (-1, self.head_v_dim))
         return z, b, a
 
-    def _load_state(self, past_key_value: Tuple[torch.Tensor, torch.Tensor], gated_delta_meta: GatedDeltaMeta):
+    def _load_state(self, past_key_value: tuple[torch.Tensor, torch.Tensor], gated_delta_meta: GatedDeltaMeta):
         """Load states from cache."""
         return gated_delta_util.load_state(past_key_value=past_key_value, gated_delta_meta=gated_delta_meta)
 
     def forward(
         self,
         hidden_states: torch.Tensor,
-        past_key_value: Tuple[torch.Tensor, torch.Tensor],
+        past_key_value: tuple[torch.Tensor, torch.Tensor],
         gated_delta_meta: GatedDeltaMeta,
     ):
         """forward."""
@@ -629,8 +635,8 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Tuple[torch.Tensor, torch.Tensor],
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor, torch.Tensor],
         attn_metadata: Any,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -727,8 +733,8 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: List[torch.FloatTensor],
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor],
         residual: torch.Tensor | None,
         attn_metadata: Any,
         gated_delta_meta: GatedDeltaMeta,
@@ -896,7 +902,7 @@ def forward(
         self,
         input_ids: torch.LongTensor,
         position_ids: torch.LongTensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any,
         state_ids: torch.Tensor,
         inputs_embeds: torch.Tensor | None = None,
@@ -970,7 +976,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any,
         state_ids: torch.Tensor,
         inputs_embeds: torch.Tensor | None = None,
@@ -1068,7 +1074,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any,
         state_ids: torch.Tensor,
         inputs_embeds: torch.Tensor | None = None,
@@ -1115,7 +1121,7 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         inputs_embeds: torch.Tensor | None = None,
         context: StepContext | None = None,
     ):
@@ -1188,7 +1194,7 @@ def prepare_inputs_for_generation(
             pos_embeds=pos_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
 
         def __skip_layers(name):
@@ -1374,7 +1380,7 @@ def _update_model_meta_prefilling(self, context: StepContext):
 
         return new_model_metas
 
-    def update_model_metas(self, past_key_values: List[List[torch.Tensor]], inputs_embeds: torch.Tensor | None,
+    def update_model_metas(self, past_key_values: list[list[torch.Tensor]], inputs_embeds: torch.Tensor | None,
                            context: StepContext):
         """Update model meta."""
         if context.is_decoding:
diff --git a/lmdeploy/pytorch/models/qwen3_5_moe.py b/lmdeploy/pytorch/models/qwen3_5_moe.py
index c475b873a5..3dca50cdc8 100644
--- a/lmdeploy/pytorch/models/qwen3_5_moe.py
+++ b/lmdeploy/pytorch/models/qwen3_5_moe.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Dict, Iterable, List, Tuple
+from collections.abc import Iterable
 
 import torch
 import torch.distributed as dist
@@ -16,8 +16,16 @@
 
 from .patch import add_prefix, get_build_model_context
 from .qwen2_5_vl import Qwen2_5_VLInputProcessor as Qwen3_5MoeInputProcessor
-from .qwen3_5 import (Qwen3_5Attention, Qwen3_5DecoderLayer, Qwen3_5ForConditionalGeneration, Qwen3_5GatedDeltaNet,
-                      Qwen3_5MLP, Qwen3_5Model, Qwen3_5TextModel, Qwen3_5TextRotaryEmbedding)
+from .qwen3_5 import (
+    Qwen3_5Attention,
+    Qwen3_5DecoderLayer,
+    Qwen3_5ForConditionalGeneration,
+    Qwen3_5GatedDeltaNet,
+    Qwen3_5MLP,
+    Qwen3_5Model,
+    Qwen3_5TextModel,
+    Qwen3_5TextRotaryEmbedding,
+)
 from .qwen3_5 import Qwen3_5VisionModel as Qwen3_5MoeVisionModel
 
 
@@ -265,8 +273,8 @@ def __init__(self,
         bm_ctx = get_build_model_context()
         self.enable_return_routed_experts = bm_ctx.enable_return_routed_experts
 
-    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter],
-                             expert_params_mapping: List):
+    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter],
+                             expert_params_mapping: list):
         """Load weight experts."""
         # this func is not used, but it has same layout with tranformers implementation
         # so I will keep it for now.
@@ -282,7 +290,7 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di
             param = params_dict[name]
             load_weight(param, loaded_weight)
 
-    def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter]):
+    def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter]):
         """Load weight of fused expert weights."""
         num_experts = self.config.text_config.num_experts
         fused_gateup_name = 'gate_up_proj'
@@ -305,7 +313,7 @@ def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, par
                 w2 = loaded_weight[expert_id]
                 load_weight(param, w2, expert_id=expert_id, shard_id='down')
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
 
         def __skip_layers(name):
diff --git a/lmdeploy/pytorch/models/qwen3_moe.py b/lmdeploy/pytorch/models/qwen3_moe.py
index 5157199175..158cc5ecd3 100644
--- a/lmdeploy/pytorch/models/qwen3_moe.py
+++ b/lmdeploy/pytorch/models/qwen3_moe.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -94,8 +95,8 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -317,9 +318,9 @@ def __init__(
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
         all_routed_experts: torch.Tensor = None,
     ):
@@ -396,10 +397,10 @@ def __init__(self,
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
         all_routed_experts: torch.Tensor = None,
     ):
         """Rewrite of LlamaModel.forward."""
@@ -483,7 +484,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -518,8 +519,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -545,8 +546,8 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter],
-                             expert_params_mapping: List):
+    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter],
+                             expert_params_mapping: list):
         """Load weight experts."""
         # load fused weights
         if any([k in name for k in ['fused_w1w3', 'fused_w2']]):
@@ -563,7 +564,7 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di
             param = params_dict[name]
             load_weight(param, loaded_weight)
 
-    def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter]):
+    def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter]):
         """Load weight of fused expert weights."""
         num_experts = self.config.num_experts
         fused_gateup_name = 'fused_w1w3'
@@ -588,7 +589,7 @@ def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, par
                 w2 = loaded_weight.narrow(dim=0, start=chunk_size * expert_id, length=chunk_size)
                 load_weight(param, w2, expert_id=expert_id, shard_id='down')
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/qwen3_next.py b/lmdeploy/pytorch/models/qwen3_next.py
index 4c56c01aa3..3f70dc5cf9 100644
--- a/lmdeploy/pytorch/models/qwen3_next.py
+++ b/lmdeploy/pytorch/models/qwen3_next.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 import torch.nn.functional as F
@@ -13,8 +14,13 @@
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config
 from lmdeploy.pytorch.nn.gated_delta import CausalConv1d, GatedDelta, GatedDeltaMeta, build_rmsnorm_gated
-from lmdeploy.pytorch.nn.linear import (build_colwise_linear, build_merged_colwise_linear, build_o_proj, build_qkv_proj,
-                                        build_rowwise_linear)
+from lmdeploy.pytorch.nn.linear import (
+    build_colwise_linear,
+    build_merged_colwise_linear,
+    build_o_proj,
+    build_qkv_proj,
+    build_rowwise_linear,
+)
 from lmdeploy.pytorch.nn.moe import SoftmaxTopK, build_fused_moe
 from lmdeploy.pytorch.weight_loader.model_weight_loader import default_weight_loader, load_weight
 
@@ -142,14 +148,14 @@ def fix_query_key_value_ordering(self, mixed_qkvz: torch.Tensor, mixed_ba: torch
         a = a.float().flatten(-2, -1)
         return mixed_qkv, z, b, a
 
-    def _load_state(self, past_key_value: Tuple[torch.Tensor, torch.Tensor], gated_delta_meta: GatedDeltaMeta):
+    def _load_state(self, past_key_value: tuple[torch.Tensor, torch.Tensor], gated_delta_meta: GatedDeltaMeta):
         """Load states from cache."""
         return gated_delta_util.load_state(past_key_value=past_key_value, gated_delta_meta=gated_delta_meta)
 
     def forward(
         self,
         hidden_states: torch.Tensor,
-        past_key_value: Tuple[torch.Tensor, torch.Tensor],
+        past_key_value: tuple[torch.Tensor, torch.Tensor],
         gated_delta_meta: GatedDeltaMeta,
     ):
         """forward."""
@@ -268,8 +274,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -487,9 +493,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor],
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None,
         attn_metadata: Any,
         gated_delta_meta: GatedDeltaMeta,
     ):
@@ -556,10 +562,10 @@ def forward(
         self,
         input_ids: torch.LongTensor,
         position_ids: torch.LongTensor,
-        past_key_values: List[torch.FloatTensor],
+        past_key_values: list[torch.FloatTensor],
         attn_metadata: Any,
         state_ids: torch.Tensor,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """Rewrite of LlamaModel.forward."""
 
@@ -632,7 +638,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         state_ids: torch.Tensor = None,
@@ -655,8 +661,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -717,8 +723,8 @@ def fill_buffers_cudagraph(self, graph_meta: CudaGraphMeta, **kwargs):
 
         return new_inputs
 
-    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter],
-                             expert_params_mapping: List):
+    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter],
+                             expert_params_mapping: list):
         """Load weight experts."""
         # load fused weights
         for (param_name, weight_name, expert_id, shard_id) in expert_params_mapping:
@@ -732,7 +738,7 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di
             param = params_dict[name]
             load_weight(param, loaded_weight)
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
 
         def __skip_layers(name):
diff --git a/lmdeploy/pytorch/models/qwen3_vl.py b/lmdeploy/pytorch/models/qwen3_vl.py
index a6f694c6f2..227c1c71fe 100644
--- a/lmdeploy/pytorch/models/qwen3_vl.py
+++ b/lmdeploy/pytorch/models/qwen3_vl.py
@@ -1,7 +1,8 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
+from collections.abc import Iterable
 from functools import lru_cache
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from typing import Any
 
 import numpy as np
 import torch
@@ -116,14 +117,14 @@ def __init__(self,
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
         mrope_position_ids: torch.LongTensor = None,
         # args for deepstack
-        visual_pos_masks: Optional[torch.Tensor] = None,
-        deepstack_visual_embeds: Optional[list[torch.Tensor]] = None,
+        visual_pos_masks: torch.Tensor | None = None,
+        deepstack_visual_embeds: list[torch.Tensor] | None = None,
     ):
         """visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`,
         *optional*):
@@ -279,7 +280,7 @@ def __init__(
     def forward(self,
                 hidden_states: torch.Tensor,
                 cu_seqlens: torch.Tensor,
-                rotary_pos_emb: Optional[torch.Tensor] = None) -> torch.Tensor:
+                rotary_pos_emb: torch.Tensor | None = None) -> torch.Tensor:
         hidden_states = hidden_states + self.attn(
             self.norm1(hidden_states),
             cu_seqlens=cu_seqlens,
@@ -419,7 +420,7 @@ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
         return rotary_pos_emb
 
     # copy from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/qwen3_vl.py#L474
-    def fast_pos_embed_interpolate(self, grid_thw: List[List[int]]) -> torch.Tensor:
+    def fast_pos_embed_interpolate(self, grid_thw: list[list[int]]) -> torch.Tensor:
         num_grid_per_side = self.num_grid_per_side
         m_size = self.spatial_merge_size
         hidden_dim = self.pos_embed.embedding_dim
@@ -549,7 +550,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         mrope_position_ids: torch.Tensor = None,
@@ -609,8 +610,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -680,7 +681,7 @@ def rename_weight(cls, name: str) -> str:
             return name[len('model.'):]
         return name
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
@@ -827,8 +828,8 @@ def _update_model_meta_prefilling(self, context: StepContext):
         return new_model_metas
 
     def update_model_metas(self,
-                           past_key_values: List[List[torch.Tensor]],
-                           inputs_embeds: Optional[torch.Tensor] = None,
+                           past_key_values: list[list[torch.Tensor]],
+                           inputs_embeds: torch.Tensor | None = None,
                            context: StepContext = None):
         """Update model meta."""
         if context.is_decoding:
@@ -841,4 +842,4 @@ def get_input_processor(self) -> BaseModelInputProcessor:
         return self.input_processor
 
 
-InputMultiModalType = List[Dict[str, Any]]
+InputMultiModalType = list[dict[str, Any]]
diff --git a/lmdeploy/pytorch/models/qwen3_vl_moe.py b/lmdeploy/pytorch/models/qwen3_vl_moe.py
index 5810ab9b11..f6c0410840 100644
--- a/lmdeploy/pytorch/models/qwen3_vl_moe.py
+++ b/lmdeploy/pytorch/models/qwen3_vl_moe.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -35,14 +36,14 @@ def __init__(self,
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
         mrope_position_ids: torch.LongTensor = None,
         # args for deepstack
-        visual_pos_masks: Optional[torch.Tensor] = None,
-        deepstack_visual_embeds: Optional[list[torch.Tensor]] = None,
+        visual_pos_masks: torch.Tensor | None = None,
+        deepstack_visual_embeds: list[torch.Tensor] | None = None,
     ):
         """visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`,
         *optional*):
@@ -136,8 +137,8 @@ def __init__(
                                                   device=device,
                                                   prefix=add_prefix('language_model', prefix))
 
-    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter],
-                             expert_params_mapping: List):
+    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter],
+                             expert_params_mapping: list):
         """Load weight experts."""
 
         for (param_name, weight_name, expert_id, shard_id) in expert_params_mapping:
@@ -152,8 +153,8 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di
             load_weight(param, loaded_weight)
 
     # modify from vllm qwen3vlmoe fused expert loading
-    def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter],
-                                   fused_expert_params_mapping: List):
+    def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter],
+                                   fused_expert_params_mapping: list):
         """Load weight of fused expert weights."""
         num_experts = self.config.text_config.num_experts
 
@@ -176,7 +177,7 @@ def _load_weight_fused_experts(self, name: str, loaded_weight: torch.Tensor, par
                 for expert_id in range(num_experts):
                     load_weight(param, w2[expert_id], expert_id=expert_id, shard_id='down')
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/sdar.py b/lmdeploy/pytorch/models/sdar.py
index 6a624e40e4..7ed9016cae 100644
--- a/lmdeploy/pytorch/models/sdar.py
+++ b/lmdeploy/pytorch/models/sdar.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -8,8 +9,13 @@
 
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config
-from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj,
-                                        build_rowwise_linear)
+from lmdeploy.pytorch.nn.linear import (
+    build_down_linear,
+    build_gateup_linear,
+    build_o_proj,
+    build_qkv_proj,
+    build_rowwise_linear,
+)
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
 from .utils.cudagraph import CudaGraphMixin
@@ -68,8 +74,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -181,9 +187,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
 
@@ -238,10 +244,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """Rewrite of forward."""
 
@@ -315,7 +321,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -345,8 +351,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -372,7 +378,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/sdar_moe.py b/lmdeploy/pytorch/models/sdar_moe.py
index 3573ea9500..a52fb42de9 100644
--- a/lmdeploy/pytorch/models/sdar_moe.py
+++ b/lmdeploy/pytorch/models/sdar_moe.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -8,8 +9,13 @@
 
 from lmdeploy.pytorch.model_inputs import StepContext, StepContextManager
 from lmdeploy.pytorch.nn import ApplyRotaryEmb, Attention, RMSNorm, SiluAndMul, build_rotary_embedding_from_config
-from lmdeploy.pytorch.nn.linear import (build_down_linear, build_gateup_linear, build_o_proj, build_qkv_proj,
-                                        build_rowwise_linear)
+from lmdeploy.pytorch.nn.linear import (
+    build_down_linear,
+    build_gateup_linear,
+    build_o_proj,
+    build_qkv_proj,
+    build_rowwise_linear,
+)
 from lmdeploy.pytorch.nn.moe import SoftmaxTopK, build_fused_moe
 from lmdeploy.pytorch.weight_loader.model_weight_loader import load_weight
 
@@ -69,8 +75,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -250,9 +256,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
 
@@ -307,10 +313,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """Rewrite of forward."""
 
@@ -384,7 +390,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -414,8 +420,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -441,8 +447,8 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: Dict[str, nn.Parameter],
-                             expert_params_mapping: List):
+    def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_dict: dict[str, nn.Parameter],
+                             expert_params_mapping: list):
         """Load weight experts."""
         # load fused weights
         for (param_name, weight_name, expert_id, shard_id) in expert_params_mapping:
@@ -456,7 +462,7 @@ def _load_weight_experts(self, name: str, loaded_weight: torch.Tensor, params_di
             param = params_dict[name]
             load_weight(param, loaded_weight)
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/siglip.py b/lmdeploy/pytorch/models/siglip.py
index 274a6a10bb..2444b3f0e0 100644
--- a/lmdeploy/pytorch/models/siglip.py
+++ b/lmdeploy/pytorch/models/siglip.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
 import math
-from typing import Iterable, Set, Tuple, Union
+from collections.abc import Iterable
 
 import torch
 from torch import nn
@@ -206,7 +206,7 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-    ) -> Tuple[torch.Tensor, None]:
+    ) -> tuple[torch.Tensor, None]:
         residual = hidden_states
         hidden_states = self.layer_norm1(hidden_states)
         hidden_states, _ = self.self_attn(hidden_states=hidden_states)
@@ -242,7 +242,7 @@ def forward(
         self,
         inputs_embeds: torch.Tensor,
         **kwargs,
-    ) -> Union[torch.Tensor, list[torch.Tensor]]:
+    ) -> torch.Tensor | list[torch.Tensor]:
         hidden_states = inputs_embeds
 
         for encoder_layer in self.layers:
@@ -357,7 +357,7 @@ def forward(
             interpolate_pos_encoding=interpolate_pos_encoding,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ('qkv_proj', 'q_proj', 'q'),
@@ -365,7 +365,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
             ('qkv_proj', 'v_proj', 'v'),
         ]
         params_dict = dict(self.named_parameters())
-        loaded_params: Set[str] = set()
+        loaded_params: set[str] = set()
         layer_count = len(self.vision_model.encoder.layers)
 
         for name, loaded_weight in weights:
diff --git a/lmdeploy/pytorch/models/starcoder2.py b/lmdeploy/pytorch/models/starcoder2.py
index 509910a3e9..43c7c9bc6f 100644
--- a/lmdeploy/pytorch/models/starcoder2.py
+++ b/lmdeploy/pytorch/models/starcoder2.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Iterable, List, Optional, Tuple
+from collections.abc import Iterable
+from typing import Any
 
 import torch
 from torch import nn
@@ -62,8 +63,8 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: tuple[torch.Tensor] | None = None,
         attn_metadata: Any = None,
     ):
         """Rewrite of LlamaAttention.forward."""
@@ -171,9 +172,9 @@ def __init__(self,
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_pos_emb: Tuple[torch.FloatTensor, torch.FloatTensor],
-        past_key_value: Optional[List[torch.FloatTensor]],
-        residual: Optional[torch.Tensor] = None,
+        rotary_pos_emb: tuple[torch.FloatTensor, torch.FloatTensor],
+        past_key_value: list[torch.FloatTensor] | None,
+        residual: torch.Tensor | None = None,
         attn_metadata: Any = None,
     ):
         if residual is None:
@@ -227,10 +228,10 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
     def forward(
         self,
         input_ids: torch.LongTensor = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        position_ids: torch.LongTensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
         attn_metadata: Any = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
+        inputs_embeds: torch.FloatTensor | None = None,
     ):
         """Rewrite of LlamaModel.forward."""
 
@@ -299,7 +300,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -328,8 +329,8 @@ def get_input_embeddings(self):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
@@ -355,7 +356,7 @@ def prepare_inputs_for_generation(
             inputs_embeds=inputs_embeds,
         )
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         # modify from vllm
         stacked_params_mapping = [
diff --git a/lmdeploy/pytorch/models/utils/cudagraph.py b/lmdeploy/pytorch/models/utils/cudagraph.py
index 2b5a4dc8ad..7b93b9f633 100644
--- a/lmdeploy/pytorch/models/utils/cudagraph.py
+++ b/lmdeploy/pytorch/models/utils/cudagraph.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 import torch
 from torch import Tensor
@@ -8,7 +8,7 @@
 
 from lmdeploy.pytorch.model_inputs import StepContext, get_step_ctx_manager
 
-BuffType = Dict[str, Tensor]
+BuffType = dict[str, Tensor]
 
 
 def _get_meta_flashattn(
@@ -21,9 +21,9 @@ def _get_meta_flashattn(
         cache_seqlens: torch.Tensor,
         qkv_dtype=torch.bfloat16,
         headdim_v=None,
-        cu_seqlens_q: Optional[torch.Tensor] = None,
-        cu_seqlens_k_new: Optional[torch.Tensor] = None,
-        page_size: Optional[int] = None,
+        cu_seqlens_q: torch.Tensor | None = None,
+        cu_seqlens_k_new: torch.Tensor | None = None,
+        page_size: int | None = None,
         causal=True,
         window_size=(-1, -1),  # -1 means infinite context window
         num_splits=0,
@@ -77,7 +77,7 @@ class CudaGraphMeta:
     vocab_size: int = 1
     use_mla_fp8_cache: bool = False
     use_flash_mla: bool = False
-    mla_index_topk: Optional[int] = None
+    mla_index_topk: int | None = None
     decode_query_len: int = 1
     use_fa3_decoding: bool = False
 
@@ -89,7 +89,7 @@ def support_cuda_graph(
         self,
         input_ids: torch.Tensor,
         position_ids: torch.Tensor,
-        past_key_values: List[List[torch.Tensor]],
+        past_key_values: list[list[torch.Tensor]],
         attn_metadata: Any = None,
         inputs_embeds: torch.Tensor = None,
         **kwargs,
@@ -102,7 +102,7 @@ def make_output_buffers(self, output):
         if isinstance(output, torch.Tensor):
             output_buffers = dict(hidden_states=output)
         else:
-            assert isinstance(output, Dict)
+            assert isinstance(output, dict)
             output_buffers = output
         return output_buffers
 
@@ -138,7 +138,7 @@ def update_meta_flashattn(self, graph_meta: CudaGraphMeta, block_size: int, max_
         )
         return scheduler_metadata
 
-    def make_buffers_cudagraph(self, graph_meta: CudaGraphMeta, *args, past_key_values: List, **kwargs) -> BuffType:
+    def make_buffers_cudagraph(self, graph_meta: CudaGraphMeta, *args, past_key_values: list, **kwargs) -> BuffType:
         """Make cudagraph buffers from forward inputs."""
         max_batches = graph_meta.max_batchs
         max_tokens = graph_meta.max_tokens
@@ -194,8 +194,8 @@ def make_buffers_cudagraph(self, graph_meta: CudaGraphMeta, *args, past_key_valu
 
     @record_function('fill_buffers_cudagraph')
     def fill_buffers_cudagraph(self, graph_meta: CudaGraphMeta, input_ids: Tensor, position_ids: Tensor,
-                               past_key_values: List, attn_metadata: Any, inputs_embeds: Tensor,
-                               **kwargs) -> Dict[str, Tensor]:
+                               past_key_values: list, attn_metadata: Any, inputs_embeds: Tensor,
+                               **kwargs) -> dict[str, Tensor]:
         """Fill cudagraph buffers from forward inputs."""
 
         block_offsets: Tensor = attn_metadata.block_offsets
@@ -293,7 +293,7 @@ def update_context_cudagraph(self, graph_meta: CudaGraphMeta, context: StepConte
         context.kv_seqlens = input_buffers['kv_seqlens']
         context.q_start_loc = input_buffers['q_start_loc']
 
-    def get_outputs_cudagraph(self, output_buffers: Dict[str, torch.Tensor], input_ids: Tensor, **kwargs):
+    def get_outputs_cudagraph(self, output_buffers: dict[str, torch.Tensor], input_ids: Tensor, **kwargs):
         """Get outputs from buffers."""
         num_tokens = input_ids.size(-1)
         outputs = dict()
diff --git a/lmdeploy/pytorch/models/utils/model.py b/lmdeploy/pytorch/models/utils/model.py
index 7c63f3fea8..fe4a47802f 100644
--- a/lmdeploy/pytorch/models/utils/model.py
+++ b/lmdeploy/pytorch/models/utils/model.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import functools
-from typing import Iterable, List, Optional, Tuple
+from collections.abc import Iterable
 
 import torch
 
@@ -36,14 +36,14 @@ def forward(self, *args, **kwargs):
 
     def prepare_inputs_for_generation(
         self,
-        past_key_values: List[List[torch.Tensor]],
-        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: list[list[torch.Tensor]],
+        inputs_embeds: torch.Tensor | None = None,
         context: StepContext = None,
     ):
         """Prepare input."""
         raise NotImplementedError('Not Implemented')
 
-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         """Load weights."""
         raise NotImplementedError('Not Implemented')
 
@@ -61,8 +61,8 @@ def update_weights(self):
         pass
 
     def update_model_metas(self,
-                           past_key_values: List[List[torch.Tensor]],
-                           inputs_embeds: Optional[torch.Tensor] = None,
+                           past_key_values: list[list[torch.Tensor]],
+                           inputs_embeds: torch.Tensor | None = None,
                            context: StepContext = None):
         """Update model meta."""
         return None
@@ -132,8 +132,8 @@ def build_lm_head(self,
                       hidden_size: int,
                       vocab_size: int,
                       bias: bool = False,
-                      dtype: Optional[torch.dtype] = None,
-                      device: Optional[torch.device] = None,
+                      dtype: torch.dtype | None = None,
+                      device: torch.device | None = None,
                       **kwargs):
         """Build LM Head."""
         bm_ctx = get_build_model_context()
diff --git a/lmdeploy/pytorch/models/utils/multimodal.py b/lmdeploy/pytorch/models/utils/multimodal.py
index 699f88021f..34a7e0de7e 100644
--- a/lmdeploy/pytorch/models/utils/multimodal.py
+++ b/lmdeploy/pytorch/models/utils/multimodal.py
@@ -1,9 +1,8 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import List, Tuple
 
 from lmdeploy.pytorch.multimodal.data_type import MultiModalInputs
 
-PreparedInputs = Tuple[List[int], MultiModalInputs]
+PreparedInputs = tuple[list[int], MultiModalInputs]
 
 
 class MultiModalMixin:
diff --git a/lmdeploy/pytorch/multimodal/data_type.py b/lmdeploy/pytorch/multimodal/data_type.py
index dd3ec9a37d..c379984658 100644
--- a/lmdeploy/pytorch/multimodal/data_type.py
+++ b/lmdeploy/pytorch/multimodal/data_type.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from dataclasses import dataclass, fields
-from typing import Any, Dict, List, Union
+from typing import Any
 
 from torch import Tensor
 
@@ -9,9 +9,9 @@ class MultiModalData:
     pass
 
 
-MultiModalDataList = List[MultiModalData]
+MultiModalDataList = list[MultiModalData]
 
-NestedTensor = Union[Tensor, List[Tensor]]
+NestedTensor = Tensor | list[Tensor]
 
 
 @dataclass
@@ -20,7 +20,7 @@ class MultiModalTensor:
     start: int
     end: int = None
     encoder_len: int = None
-    meta: Dict[str, Any] = None
+    meta: dict[str, Any] = None
 
     def __post_init__(self):
         if self.end is None:
@@ -56,4 +56,4 @@ def to_device(self, device: str, non_blocking: bool = False):
         return MultiModalTensor(**out_dict)
 
 
-MultiModalInputs = Dict[str, List[MultiModalTensor]]
+MultiModalInputs = dict[str, list[MultiModalTensor]]
diff --git a/lmdeploy/pytorch/multimodal/image_type.py b/lmdeploy/pytorch/multimodal/image_type.py
index 19211a381f..0d9664d2b9 100644
--- a/lmdeploy/pytorch/multimodal/image_type.py
+++ b/lmdeploy/pytorch/multimodal/image_type.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from dataclasses import dataclass
-from typing import Any, ClassVar, Dict
+from typing import Any, ClassVar
 
 from PIL import Image
 
@@ -11,5 +11,5 @@
 class ImageData(MultiModalData):
     data: Image
     loc: int
-    meta: Dict[str, Any] = None
+    meta: dict[str, Any] = None
     type: ClassVar[str] = 'image'
diff --git a/lmdeploy/pytorch/nn/__init__.py b/lmdeploy/pytorch/nn/__init__.py
index 2c89fac7c4..bc40334f35 100644
--- a/lmdeploy/pytorch/nn/__init__.py
+++ b/lmdeploy/pytorch/nn/__init__.py
@@ -5,9 +5,11 @@
 from .attention import Attention, FlashAttention  # noqa: F401
 from .embedding import ParallelEmbedding  # noqa: F401
 from .norm import LayerNorm, RMSNorm  # noqa: F401
-from .rotary_embedding import ApplyRotaryEmb  # noqa: F401
-from .rotary_embedding import RopeType  # noqa: F401
-from .rotary_embedding import YarnParameters  # noqa: F401
-from .rotary_embedding import build_rotary_embedding  # noqa: F401
-from .rotary_embedding import build_rotary_embedding_from_config  # noqa: F401
-from .rotary_embedding import build_rotary_params  # noqa: F401
+from .rotary_embedding import (
+    ApplyRotaryEmb,  # noqa: F401
+    RopeType,  # noqa: F401
+    YarnParameters,  # noqa: F401
+    build_rotary_embedding,  # noqa: F401
+    build_rotary_embedding_from_config,  # noqa: F401
+    build_rotary_params,  # noqa: F401
+)
diff --git a/lmdeploy/pytorch/nn/gated_delta.py b/lmdeploy/pytorch/nn/gated_delta.py
index c61dcab6b5..e15be9c89f 100644
--- a/lmdeploy/pytorch/nn/gated_delta.py
+++ b/lmdeploy/pytorch/nn/gated_delta.py
@@ -1,6 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Any, Sequence, Tuple
+from collections.abc import Sequence
+from typing import Any
 
 import torch
 from torch import nn
@@ -198,7 +199,7 @@ def __init__(
         self,
         in_channels: int,
         out_channels: int,
-        kernel_size: int | Tuple[int],
+        kernel_size: int | tuple[int],
         split: Sequence[int],
         groups: int = 1,
         bias: bool = True,
@@ -232,7 +233,7 @@ def __init__(
     def make_weight(
         in_channels: int,
         out_channels: int,
-        kernel_size: int | Tuple[int],
+        kernel_size: int | tuple[int],
         groups: int = 1,
         bias: bool = True,
         device: str | torch.device | None = None,
@@ -273,6 +274,6 @@ def forward(self, x: torch.Tensor, conv_state: torch.Tensor, gated_delta_meta: G
 
 
 @record_function('gated_delta_load_state')
-def load_state(past_key_value: Tuple[torch.Tensor, torch.Tensor], gated_delta_meta: GatedDeltaMeta):
+def load_state(past_key_value: tuple[torch.Tensor, torch.Tensor], gated_delta_meta: GatedDeltaMeta):
     """Load states from cache."""
     return past_key_value[:2]
diff --git a/lmdeploy/pytorch/nn/linear/__init__.py b/lmdeploy/pytorch/nn/linear/__init__.py
index 7fda2087bf..3dd3df995b 100644
--- a/lmdeploy/pytorch/nn/linear/__init__.py
+++ b/lmdeploy/pytorch/nn/linear/__init__.py
@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 import torch
 from torch import nn
@@ -19,11 +19,11 @@ def build_linear(
     in_features: int,
     out_features: int,
     bias: bool,
-    dtype: Optional[torch.dtype] = None,
-    device: Optional[torch.device] = None,
+    dtype: torch.dtype | None = None,
+    device: torch.device | None = None,
     colwise: bool = True,
     is_tp: bool = False,
-    quant_config: Dict = None,
+    quant_config: dict = None,
     all_reduce: bool = True,
     tp_align_size: int = 1,
     dp_gather: bool = False,
@@ -104,11 +104,11 @@ def build_colwise_linear(
     in_features: int,
     out_features: int,
     bias: bool,
-    dtype: Optional[torch.dtype] = None,
-    device: Optional[torch.device] = None,
+    dtype: torch.dtype | None = None,
+    device: torch.device | None = None,
     is_tp: bool = False,
     tp_align_size: int = 1,
-    quant_config: Dict = None,
+    quant_config: dict = None,
     dp_disable_tp: bool = False,
     dp_gather: bool = False,
     check_dist: bool = True,
@@ -148,11 +148,11 @@ def build_rowwise_linear(
     in_features: int,
     out_features: int,
     bias: bool,
-    dtype: Optional[torch.dtype] = None,
-    device: Optional[torch.device] = None,
+    dtype: torch.dtype | None = None,
+    device: torch.device | None = None,
     is_tp: bool = False,
     tp_align_size: int = 1,
-    quant_config: Dict = None,
+    quant_config: dict = None,
     all_reduce: bool = True,
     dp_disable_tp: bool = False,
     check_dist: bool = True,
@@ -183,13 +183,13 @@ def build_rowwise_linear(
 
 def build_merged_colwise_linear(
     in_features: int,
-    all_out_features: List[int],
+    all_out_features: list[int],
     bias: bool,
-    dtype: Optional[torch.dtype] = None,
-    device: Optional[torch.device] = None,
-    quant_config: Dict = None,
+    dtype: torch.dtype | None = None,
+    device: torch.device | None = None,
+    quant_config: dict = None,
     is_tp: bool = True,
-    out_names: List[Any] = None,
+    out_names: list[Any] = None,
     dp_gather: bool = False,
     check_dist: bool = True,
     layer_type: str = 'attn',
@@ -261,9 +261,9 @@ def build_qkv_proj(in_features: int,
                    head_size: int,
                    head_size_v: int = None,
                    bias: bool = False,
-                   quant_config: Dict = None,
-                   dtype: Optional[torch.dtype] = None,
-                   device: Optional[torch.device] = None,
+                   quant_config: dict = None,
+                   dtype: torch.dtype | None = None,
+                   device: torch.device | None = None,
                    is_tp: bool = True,
                    num_replicate_kv_heads: int = 1,
                    prefix: str = ''):
@@ -335,11 +335,11 @@ def build_o_proj(
     in_features: int,
     out_features: int,
     bias: bool,
-    dtype: Optional[torch.dtype] = None,
-    device: Optional[torch.device] = None,
+    dtype: torch.dtype | None = None,
+    device: torch.device | None = None,
     is_tp: bool = False,
     tp_align_size: int = 1,
-    quant_config: Dict = None,
+    quant_config: dict = None,
     all_reduce: bool = True,
     prefix: str = '',
 ) -> nn.Module:
@@ -365,13 +365,13 @@ def build_o_proj(
 
 def build_gateup_linear(
     in_features: int,
-    all_out_features: List[int],
+    all_out_features: list[int],
     bias: bool,
-    dtype: Optional[torch.dtype] = None,
-    device: Optional[torch.device] = None,
-    quant_config: Dict = None,
+    dtype: torch.dtype | None = None,
+    device: torch.device | None = None,
+    quant_config: dict = None,
     is_tp: bool = True,
-    out_names: List[Any] = None,
+    out_names: list[Any] = None,
     dp_gather: bool = True,
     prefix: str = '',
 ):
@@ -401,11 +401,11 @@ def build_down_linear(
     in_features: int,
     out_features: int,
     bias: bool,
-    dtype: Optional[torch.dtype] = None,
-    device: Optional[torch.device] = None,
+    dtype: torch.dtype | None = None,
+    device: torch.device | None = None,
     is_tp: bool = False,
     tp_align_size: int = 1,
-    quant_config: Dict = None,
+    quant_config: dict = None,
     all_reduce: bool = True,
     prefix: str = '',
 ) -> nn.Module:
diff --git a/lmdeploy/pytorch/nn/linear/awq.py b/lmdeploy/pytorch/nn/linear/awq.py
index 5e24d93db7..aa6303ce46 100644
--- a/lmdeploy/pytorch/nn/linear/awq.py
+++ b/lmdeploy/pytorch/nn/linear/awq.py
@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Any, List, Optional
+from typing import Any
 
 import torch
 
@@ -21,7 +21,7 @@ def __init__(
         w_bit: int,
         group_size: int,
         bias: bool,
-        device: Optional[torch.device] = None,
+        device: torch.device | None = None,
         colwise: bool = True,
         is_tp: bool = False,
         all_reduce: bool = True,
@@ -68,7 +68,7 @@ def register_all_parameters(self,
                                 qweight: torch.Tensor,
                                 scales: torch.Tensor,
                                 qzeros: torch.Tensor,
-                                bias: Optional[torch.Tensor] = None):
+                                bias: torch.Tensor | None = None):
         """Register all parameters."""
         qweight = torch.nn.Parameter(qweight, requires_grad=False)
         scales = torch.nn.Parameter(scales, requires_grad=False)
@@ -173,13 +173,13 @@ class MergedAwqLinear(AwqLinear):
 
     def __init__(self,
                  in_features: int,
-                 all_out_features: List[int],
+                 all_out_features: list[int],
                  w_bit: int,
                  group_size: int,
                  bias: bool,
-                 device: Optional[torch.device] = None,
+                 device: torch.device | None = None,
                  is_tp: bool = True,
-                 out_names: Optional[List[int]] = None,
+                 out_names: list[int] | None = None,
                  layer_type: str = 'attn'):
         self.init_tp_args(is_tp, all_reduce=False, colwise=True, layer_type=layer_type)
 
@@ -225,7 +225,7 @@ def _get_io_features(self, in_features: int, out_features: int, w_bit: int, grou
         """Get io features."""
         return in_features, out_features
 
-    def _update_all_out_features(self, all_out_features: List[int], w_bit: int, group_size: int):
+    def _update_all_out_features(self, all_out_features: list[int], w_bit: int, group_size: int):
         """Update all out features."""
         world_size, rank = self.get_tp_world_rank()
         new_all_out_features = []
@@ -280,7 +280,7 @@ def __init__(self,
                  w_bit: int,
                  group_size: int,
                  bias: bool = False,
-                 device: Optional[torch.device] = None,
+                 device: torch.device | None = None,
                  is_tp: bool = True,
                  num_replicate_kv_heads: int = 1):
         self.init_tp_args(is_tp, all_reduce=False, colwise=True, layer_type='attn')
@@ -309,7 +309,7 @@ def __init__(self,
                          out_names=out_names,
                          layer_type='attn')
 
-    def _update_all_out_features(self, all_out_features: List[int], w_bit: int, group_size: int):
+    def _update_all_out_features(self, all_out_features: list[int], w_bit: int, group_size: int):
         """Update all out features."""
         return all_out_features
 
diff --git a/lmdeploy/pytorch/nn/linear/base.py b/lmdeploy/pytorch/nn/linear/base.py
index 53bd6f6083..e1bff38cf9 100644
--- a/lmdeploy/pytorch/nn/linear/base.py
+++ b/lmdeploy/pytorch/nn/linear/base.py
@@ -1,13 +1,18 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Callable, List, Optional
+from collections.abc import Callable
 
 import torch
 import torch.distributed as dist
 from torch import nn
 
 from lmdeploy.pytorch.config import TPMode
-from lmdeploy.pytorch.distributed import (gather_by_tp_sizes, get_dist_group, get_dist_manager, get_tp_world_rank,
-                                          reduce_scatter_by_tp_sizes)
+from lmdeploy.pytorch.distributed import (
+    gather_by_tp_sizes,
+    get_dist_group,
+    get_dist_manager,
+    get_tp_world_rank,
+    reduce_scatter_by_tp_sizes,
+)
 from lmdeploy.pytorch.model_inputs import get_step_ctx_manager
 
 from .utils import update_tp_args
@@ -30,12 +35,12 @@ def __init__(self, gemm_func: Callable, max_tokens_per_round: int = 8192):
         self.tp_group = tp_group.gpu_group
         self.max_tokens_per_round = max_tokens_per_round * self.attn_tp // self.tp // 2
 
-    def all_gather(self, hidden_states: torch.Tensor, tp_sizes: List[int]):
+    def all_gather(self, hidden_states: torch.Tensor, tp_sizes: list[int]):
         """All gather."""
         hidden_states, handle = dist.gather_by_tp_sizes(hidden_states, tp_sizes, group=self.gather_group, async_op=True)
         return hidden_states, handle
 
-    def reduce_scatter(self, hidden_states: torch.Tensor, out_states: torch.Tensor, tp_sizes: List[int]):
+    def reduce_scatter(self, hidden_states: torch.Tensor, out_states: torch.Tensor, tp_sizes: list[int]):
         """Reduce scatter."""
         hidden_states_list = list(hidden_states.split(tp_sizes, -2))
         cur_out_states = hidden_states_list[self.gather_rank]
@@ -45,7 +50,7 @@ def reduce_scatter(self, hidden_states: torch.Tensor, out_states: torch.Tensor,
         handle = dist.reduce_scatter(out_states, hidden_states_list, group=self.tp_group, async_op=True)
         return out_states, handle
 
-    def _gemm_and_reduce_scatter(self, hidden_states: torch.Tensor, output_states: torch.Tensor, tp_sizes: List[int],
+    def _gemm_and_reduce_scatter(self, hidden_states: torch.Tensor, output_states: torch.Tensor, tp_sizes: list[int],
                                  handle: dist.Work):
         """Gemm and reduce scatter."""
         handle.wait()
@@ -108,8 +113,8 @@ class LinearBase(nn.Module):
 
     def __init__(
         self,
-        dtype: Optional[torch.dtype] = None,
-        device: Optional[torch.device] = None,
+        dtype: torch.dtype | None = None,
+        device: torch.device | None = None,
         colwise: bool = True,
         is_tp: bool = False,
         all_reduce: bool = True,
@@ -177,11 +182,11 @@ def update_weights(self):
         """Update weights."""
         raise NotImplementedError('This method should be implemented in subclasses.')
 
-    def _forward_default(self, x, all_reduce: bool, tp_sizes: List[int]):
+    def _forward_default(self, x, all_reduce: bool, tp_sizes: list[int]):
         """Default forward implement."""
         raise NotImplementedError('This method should be implemented in subclasses.')
 
-    def _forward_lora(self, x, tp_sizes: List[int] = None):
+    def _forward_lora(self, x, tp_sizes: list[int] = None):
         """Forward with LoRA."""
         out = self._forward_default(x, False, tp_sizes)
 
diff --git a/lmdeploy/pytorch/nn/linear/blocked_fp8.py b/lmdeploy/pytorch/nn/linear/blocked_fp8.py
index 04d3c03e1e..bcadeef79c 100644
--- a/lmdeploy/pytorch/nn/linear/blocked_fp8.py
+++ b/lmdeploy/pytorch/nn/linear/blocked_fp8.py
@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Any, List, Optional
+from typing import Any
 
 import torch
 
@@ -21,10 +21,10 @@ def __init__(
         in_features: int,
         out_features: int,
         bias: bool,
-        dtype: Optional[torch.dtype] = None,
-        device: Optional[torch.device] = None,
+        dtype: torch.dtype | None = None,
+        device: torch.device | None = None,
         fp8_dtype: torch.dtype = torch.float8_e4m3fn,
-        scale_fmt: Optional[str] = None,
+        scale_fmt: str | None = None,
         colwise: bool = True,
         is_tp: bool = False,
         all_reduce: bool = True,
@@ -66,7 +66,7 @@ def setup_loaders(self):
     def register_all_parameters(self,
                                 weight: torch.Tensor,
                                 weight_scale_inv: torch.Tensor,
-                                bias: Optional[torch.Tensor] = None):
+                                bias: torch.Tensor | None = None):
         """Register all parameters."""
         weight = torch.nn.Parameter(weight, requires_grad=False)
         weight_scale_inv = torch.nn.Parameter(weight_scale_inv, requires_grad=False)
@@ -167,15 +167,15 @@ class MergedBlockedF8Linear(BlockedF8Linear):
 
     def __init__(self,
                  in_features: int,
-                 all_out_features: List[int],
+                 all_out_features: list[int],
                  bias: bool,
                  fp8_dtype: torch.dtype = torch.float8_e4m3fn,
-                 scale_fmt: Optional[str] = None,
-                 replicate: Optional[List[bool]] = None,
-                 dtype: Optional[torch.dtype] = None,
-                 device: Optional[torch.device] = None,
+                 scale_fmt: str | None = None,
+                 replicate: list[bool] | None = None,
+                 dtype: torch.dtype | None = None,
+                 device: torch.device | None = None,
                  is_tp: bool = True,
-                 out_names: Optional[List[int]] = None,
+                 out_names: list[int] | None = None,
                  dp_gather: bool = False,
                  layer_type: str = 'attn'):
         self.init_tp_args(is_tp, all_reduce=False, colwise=True, layer_type=layer_type)
@@ -222,7 +222,7 @@ def _get_io_features(self, in_features: int, out_features: int, colwise: bool):
         """Get io features."""
         return in_features, out_features
 
-    def _update_all_out_features(self, all_out_features: List[int], replicate: Optional[List[bool]]):
+    def _update_all_out_features(self, all_out_features: list[int], replicate: list[bool] | None):
         """Update all out features."""
         world_size, rank = self.get_tp_world_rank()
         new_all_out_features = []
@@ -281,9 +281,9 @@ def __init__(self,
                  head_size_v: int,
                  bias: bool = False,
                  fp8_dtype: torch.dtype = torch.float8_e4m3fn,
-                 scale_fmt: Optional[str] = None,
-                 dtype: Optional[torch.dtype] = None,
-                 device: Optional[torch.device] = None,
+                 scale_fmt: str | None = None,
+                 dtype: torch.dtype | None = None,
+                 device: torch.device | None = None,
                  is_tp: bool = True,
                  dp_gather: bool = False,
                  num_replicate_kv_heads: int = 1):
@@ -313,7 +313,7 @@ def __init__(self,
                          dp_gather=dp_gather,
                          layer_type='attn')
 
-    def _update_all_out_features(self, all_out_features: List[int], replicate: Optional[List[bool]]):
+    def _update_all_out_features(self, all_out_features: list[int], replicate: list[bool] | None):
         """Update all out features."""
         return all_out_features
 
diff --git a/lmdeploy/pytorch/nn/linear/default.py b/lmdeploy/pytorch/nn/linear/default.py
index a3f8a31a2c..e17f50d76b 100644
--- a/lmdeploy/pytorch/nn/linear/default.py
+++ b/lmdeploy/pytorch/nn/linear/default.py
@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Any, List, Optional
+from typing import Any
 
 import torch
 
@@ -20,8 +20,8 @@ def __init__(
         in_features: int,
         out_features: int,
         bias: bool,
-        dtype: Optional[torch.dtype] = None,
-        device: Optional[torch.device] = None,
+        dtype: torch.dtype | None = None,
+        device: torch.device | None = None,
         colwise: bool = True,
         is_tp: bool = False,
         all_reduce: bool = True,
@@ -53,7 +53,7 @@ def setup_loaders(self):
         if self.bias is not None:
             self.bias.weight_loader = self.weight_loader
 
-    def register_all_parameters(self, weight: torch.Tensor, bias: Optional[torch.Tensor] = None):
+    def register_all_parameters(self, weight: torch.Tensor, bias: torch.Tensor | None = None):
         """Register all parameters."""
         weight = torch.nn.Parameter(weight, requires_grad=False)
         if bias is not None:
@@ -135,12 +135,12 @@ class MergedBaseLinear(BaseLinear):
 
     def __init__(self,
                  in_features: int,
-                 all_out_features: List[int],
+                 all_out_features: list[int],
                  bias: bool,
-                 dtype: Optional[torch.dtype] = None,
-                 device: Optional[torch.device] = None,
+                 dtype: torch.dtype | None = None,
+                 device: torch.device | None = None,
                  is_tp: bool = True,
-                 out_names: Optional[List[int]] = None,
+                 out_names: list[int] | None = None,
                  dp_gather: bool = False,
                  layer_type: str = 'attn'):
         self.init_tp_args(is_tp, all_reduce=False, colwise=True, layer_type=layer_type)
@@ -175,7 +175,7 @@ def _get_io_features(self, in_features: int, out_features: int, colwise: bool):
         """Get io features."""
         return in_features, out_features
 
-    def _update_all_out_features(self, all_out_features: List[int]):
+    def _update_all_out_features(self, all_out_features: list[int]):
         """Update all out features."""
         world_size, rank = self.get_tp_world_rank()
         new_all_out_features = []
@@ -210,8 +210,8 @@ def __init__(self,
                  head_size: int,
                  head_size_v: int,
                  bias: bool = False,
-                 dtype: Optional[torch.dtype] = None,
-                 device: Optional[torch.device] = None,
+                 dtype: torch.dtype | None = None,
+                 device: torch.device | None = None,
                  is_tp: bool = True,
                  num_replicate_kv_heads: int = 1):
         self.init_tp_args(is_tp, all_reduce=False, colwise=True, layer_type='attn')
@@ -236,7 +236,7 @@ def __init__(self,
                          out_names=out_names,
                          layer_type='attn')
 
-    def _update_all_out_features(self, all_out_features: List[int]):
+    def _update_all_out_features(self, all_out_features: list[int]):
         """Update all out features."""
         return all_out_features
 
diff --git a/lmdeploy/pytorch/nn/linear/w8a8.py b/lmdeploy/pytorch/nn/linear/w8a8.py
index c9105e5599..ad4ec74d73 100644
--- a/lmdeploy/pytorch/nn/linear/w8a8.py
+++ b/lmdeploy/pytorch/nn/linear/w8a8.py
@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Any, List, Optional
+from typing import Any
 
 import torch
 
@@ -18,12 +18,12 @@ def __init__(self,
                  in_features: int,
                  out_features: int,
                  bias: bool,
-                 dtype: Optional[torch.dtype] = None,
-                 device: Optional[torch.device] = None,
+                 dtype: torch.dtype | None = None,
+                 device: torch.device | None = None,
                  colwise: bool = True,
                  is_tp: bool = False,
                  all_reduce: bool = True,
-                 quant_dtype: Optional[torch.dtype] = torch.int8,
+                 quant_dtype: torch.dtype | None = torch.int8,
                  layer_type: str = 'attn'):
         super().__init__(dtype=torch.float16,
                          device=device,
@@ -53,7 +53,7 @@ def setup_loaders(self):
         if self.bias is not None:
             self.bias.weight_loader = self.weight_loader
 
-    def register_all_parameters(self, weight: torch.Tensor, scale: torch.Tensor, bias: Optional[torch.Tensor] = None):
+    def register_all_parameters(self, weight: torch.Tensor, scale: torch.Tensor, bias: torch.Tensor | None = None):
         """Register all parameters."""
         weight = torch.nn.Parameter(weight, requires_grad=False)
         scale = torch.nn.Parameter(scale, requires_grad=False)
@@ -131,12 +131,12 @@ class MergedW8A8Linear(W8A8Linear):
 
     def __init__(self,
                  in_features: int,
-                 all_out_features: List[int],
+                 all_out_features: list[int],
                  bias: bool,
-                 dtype: Optional[torch.dtype] = None,
-                 device: Optional[torch.device] = None,
+                 dtype: torch.dtype | None = None,
+                 device: torch.device | None = None,
                  is_tp: bool = True,
-                 out_names: Optional[List[int]] = None,
+                 out_names: list[int] | None = None,
                  quant_dtype: torch.dtype = torch.int8,
                  layer_type: str = 'attn'):
         self.init_tp_args(is_tp, all_reduce=False, colwise=True, layer_type=layer_type)
@@ -173,7 +173,7 @@ def _get_io_features(self, in_features: int, out_features: int, colwise: bool):
         """Get io features."""
         return in_features, out_features
 
-    def _update_all_out_features(self, all_out_features: List[int]):
+    def _update_all_out_features(self, all_out_features: list[int]):
         """Update all out features."""
         world_size, rank = self.get_tp_world_rank()
         new_all_out_features = []
@@ -208,8 +208,8 @@ def __init__(self,
                  head_size: int,
                  head_size_v: int,
                  bias: bool = False,
-                 dtype: Optional[torch.dtype] = None,
-                 device: Optional[torch.device] = None,
+                 dtype: torch.dtype | None = None,
+                 device: torch.device | None = None,
                  is_tp: bool = True,
                  num_replicate_kv_heads: int = 1,
                  quant_dtype: torch.dtype = torch.int8):
@@ -236,7 +236,7 @@ def __init__(self,
                          quant_dtype=quant_dtype,
                          layer_type='attn')
 
-    def _update_all_out_features(self, all_out_features: List[int]):
+    def _update_all_out_features(self, all_out_features: list[int]):
         """Update all out features."""
         return all_out_features
 
diff --git a/lmdeploy/pytorch/nn/moe/__init__.py b/lmdeploy/pytorch/nn/moe/__init__.py
index cb8725a581..f0e1fe103c 100644
--- a/lmdeploy/pytorch/nn/moe/__init__.py
+++ b/lmdeploy/pytorch/nn/moe/__init__.py
@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Callable, Dict, Optional
+from collections.abc import Callable
 
 import torch
 
@@ -15,11 +15,11 @@ def build_fused_moe(
     top_k: int,
     bias: bool = False,
     renormalize: bool = False,
-    dtype: Optional[torch.dtype] = None,
-    device: Optional[torch.device] = None,
+    dtype: torch.dtype | None = None,
+    device: torch.device | None = None,
     all_reduce: bool = True,
     enable_ep: bool = False,
-    quant_config: Dict = None,
+    quant_config: dict = None,
     layer_idx: int = 0,
     act_func: Callable = None,
     prefix: str = '',
diff --git a/lmdeploy/pytorch/nn/moe/base.py b/lmdeploy/pytorch/nn/moe/base.py
index 484dbbe492..76f1927d46 100644
--- a/lmdeploy/pytorch/nn/moe/base.py
+++ b/lmdeploy/pytorch/nn/moe/base.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+from collections.abc import Callable
 from dataclasses import dataclass
 from enum import Enum, auto
-from typing import Callable, Dict, List, Optional
 
 import torch
 import torch.nn as nn
@@ -51,7 +51,7 @@ def split_size(size: int, world_size: int, align: int):
     return split_size
 
 
-def moe_gather_inputs(hidden_states, topk_weights, topk_ids, group: Optional[dist.ProcessGroup] = None):
+def moe_gather_inputs(hidden_states, topk_weights, topk_ids, group: dist.ProcessGroup | None = None):
     dist_config = get_dist_manager().current_config()
     tp = dist_config.moe_tp
     if tp == 1:
@@ -73,7 +73,7 @@ def moe_gather_inputs(hidden_states, topk_weights, topk_ids, group: Optional[dis
     return hidden_states, topk_weights, topk_ids
 
 
-def moe_reduce(ret, rank: int, tp_mode: TPMode, group: Optional[dist.ProcessGroup] = None):
+def moe_reduce(ret, rank: int, tp_mode: TPMode, group: dist.ProcessGroup | None = None):
     dist_config = get_dist_manager().current_config()
     if dist_config.moe_tp == 1:
         return ret
@@ -109,14 +109,14 @@ def __init__(self, gemm_func: Callable, max_tokens_per_round: int = 8192):
         self.max_tokens_per_round = max_tokens_per_round * self.attn_tp // self.tp
 
     def all_gather(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor,
-                   tp_sizes: List[int]):
+                   tp_sizes: list[int]):
         """All gather."""
         hidden_states, h0 = dist.gather_by_tp_sizes(hidden_states, tp_sizes, group=self.gather_group, async_op=True)
         topk_weights, h1 = dist.gather_by_tp_sizes(topk_weights, tp_sizes, group=self.gather_group, async_op=True)
         topk_ids, h2 = dist.gather_by_tp_sizes(topk_ids, tp_sizes, group=self.gather_group, async_op=True)
         return hidden_states, topk_weights, topk_ids, (h0, h1, h2)
 
-    def reduce_scatter(self, hidden_states: torch.Tensor, out_states: torch.Tensor, tp_sizes: List[int]):
+    def reduce_scatter(self, hidden_states: torch.Tensor, out_states: torch.Tensor, tp_sizes: list[int]):
         """Reduce scatter."""
         hidden_states_list = list(hidden_states.split(tp_sizes, -2))
         cur_out_states = hidden_states_list[self.gather_rank]
@@ -127,7 +127,7 @@ def reduce_scatter(self, hidden_states: torch.Tensor, out_states: torch.Tensor,
         return out_states, handle
 
     def _gemm_and_reduce_scatter(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor,
-                                 output_states: torch.Tensor, tp_sizes: List[int], handles: List[dist.Work]):
+                                 output_states: torch.Tensor, tp_sizes: list[int], handles: list[dist.Work]):
         """Gemm and reduce scatter."""
         for handle in handles:
             handle.wait()
@@ -210,7 +210,7 @@ class DispatchInputs:
     moe_type: MoeType = MoeType.Default
 
     @classmethod
-    def from_dict(cls, input: Dict):
+    def from_dict(cls, input: dict):
         """From dict."""
         assert ['hidden_states', 'topk_weights', 'topk_idx'] in input
         moe_type = input.get('moe_type', MoeType.Default)
@@ -221,7 +221,7 @@ def from_dict(cls, input: Dict):
             moe_type=moe_type,
         )
 
-    def to_dict(self) -> Dict:
+    def to_dict(self) -> dict:
         """To dict."""
         return {
             'hidden_states': self.hidden_states,
@@ -275,19 +275,19 @@ def before_dispatch(self, state: DispatchInputs):
         """Before dispatch."""
         raise NotImplementedError
 
-    def dispatch(self, state: Dict):
+    def dispatch(self, state: dict):
         """dispatch."""
         raise NotImplementedError
 
-    def gemm(self, state: Dict):
+    def gemm(self, state: dict):
         """gemm."""
         raise NotImplementedError
 
-    def combine(self, state: Dict):
+    def combine(self, state: dict):
         """combine."""
         raise NotImplementedError
 
-    def wait(self, state: Dict):
+    def wait(self, state: dict):
         """wait."""
         raise NotImplementedError
 
diff --git a/lmdeploy/pytorch/nn/moe/blocked_fp8.py b/lmdeploy/pytorch/nn/moe/blocked_fp8.py
index 8880b72571..807833212e 100644
--- a/lmdeploy/pytorch/nn/moe/blocked_fp8.py
+++ b/lmdeploy/pytorch/nn/moe/blocked_fp8.py
@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Callable, Dict, List, Optional
+from collections.abc import Callable
 
 import torch
 
@@ -25,8 +25,8 @@ def __init__(self,
                  dtype: torch.dtype,
                  device: torch.device,
                  bias: bool = False,
-                 expert_list: List[int] = None,
-                 scale_fmt: Optional[str] = None):
+                 expert_list: list[int] = None,
+                 scale_fmt: str | None = None):
         super().__init__(num_experts=num_experts,
                          in_features=in_features,
                          out_features=out_features,
@@ -150,9 +150,9 @@ def __init__(self,
                  bias: bool = False,
                  renormalize: bool = False,
                  fp8_dtype: torch.dtype = torch.float8_e4m3fn,
-                 scale_fmt: Optional[str] = None,
-                 dtype: Optional[torch.dtype] = None,
-                 device: Optional[torch.device] = None,
+                 scale_fmt: str | None = None,
+                 dtype: torch.dtype | None = None,
+                 device: torch.device | None = None,
                  all_reduce: bool = True,
                  layer_idx: int = 0,
                  act_func: Callable = None):
@@ -239,7 +239,7 @@ def update_weights(self):
 
     def before_dispatch(self, state: DispatchInputs):
         """Before dispatch."""
-        if not isinstance(state, Dict):
+        if not isinstance(state, dict):
             state = state.to_dict()
 
         moe_type = state['moe_type']
@@ -252,7 +252,7 @@ def before_dispatch(self, state: DispatchInputs):
             state['previous_event'] = previous_event
         return state
 
-    def dispatch(self, state: Dict):
+    def dispatch(self, state: dict):
         moe_type = state['moe_type']
         if moe_type == MoeType.DSAsyncPrefill:
             fusedmoe = state['fusedmoe']
@@ -315,7 +315,7 @@ def dispatch(self, state: Dict):
             }
         return recv_state
 
-    def gemm(self, state: Dict):
+    def gemm(self, state: dict):
         moe_type = state['moe_type']
         if moe_type == MoeType.DSAsyncPrefill:
             if (state['recv_hidden_states'][0]
@@ -364,7 +364,7 @@ def gemm(self, state: Dict):
                 gemm_state = {'hidden_states': hidden_states, 'moe_type': state['moe_type']}
         return gemm_state
 
-    def combine(self, state: Dict):
+    def combine(self, state: dict):
         moe_type = state['moe_type']
         if moe_type == MoeType.DSAsyncPrefill:
             fusedmoe = state['fusedmoe']
diff --git a/lmdeploy/pytorch/nn/moe/default.py b/lmdeploy/pytorch/nn/moe/default.py
index 0633aa001a..efb5f4483c 100644
--- a/lmdeploy/pytorch/nn/moe/default.py
+++ b/lmdeploy/pytorch/nn/moe/default.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from collections import defaultdict
-from typing import Callable, Dict, List, Optional
+from collections.abc import Callable
 
 import torch
 from torch import nn
@@ -22,7 +22,7 @@ def __init__(self,
                  dtype: torch.dtype,
                  device: torch.device,
                  bias: bool = False,
-                 expert_list: Optional[List[int]] = None):
+                 expert_list: list[int] | None = None):
         super().__init__()
         weight = torch.empty((num_experts, out_features, in_features), dtype=dtype, device=device)
         weight = torch.nn.Parameter(weight, requires_grad=False)
@@ -115,8 +115,8 @@ def __init__(self,
                  top_k: int,
                  bias: bool = False,
                  renormalize: bool = False,
-                 dtype: Optional[torch.dtype] = None,
-                 device: Optional[torch.device] = None,
+                 dtype: torch.dtype | None = None,
+                 device: torch.device | None = None,
                  all_reduce: bool = True,
                  layer_idx: int = 0,
                  act_func: Callable = None):
@@ -188,7 +188,7 @@ def update_weights(self):
 
     def before_dispatch(self, state: DispatchInputs):
         """Before dispatch."""
-        if not isinstance(state, Dict):
+        if not isinstance(state, dict):
             state = state.to_dict()
 
         moe_type = state['moe_type']
@@ -199,7 +199,7 @@ def before_dispatch(self, state: DispatchInputs):
             state['previous_event'] = previous_event
         return state
 
-    def dispatch(self, state: Dict):
+    def dispatch(self, state: dict):
         """dispatch."""
         moe_type = state['moe_type']
         if moe_type == MoeType.DSAsyncPrefill:
@@ -265,7 +265,7 @@ def dispatch(self, state: Dict):
             raise NotImplementedError(f'Not supported moe type: {moe_type}')
         return recv_state
 
-    def gemm(self, state: Dict):
+    def gemm(self, state: dict):
         """gemm."""
         moe_type = state['moe_type']
         if moe_type == MoeType.DSAsyncPrefill:
@@ -311,7 +311,7 @@ def gemm(self, state: Dict):
             gemm_state = {'hidden_states': hidden_states, 'moe_type': state['moe_type']}
         return gemm_state
 
-    def combine(self, state: Dict):
+    def combine(self, state: dict):
         """combine."""
         moe_type = state['moe_type']
         if moe_type == MoeType.DSAsyncPrefill:
@@ -355,7 +355,7 @@ def combine(self, state: Dict):
             raise NotImplementedError(f'Not supported moe type: {moe_type}')
         return out_state
 
-    def wait(self, state: Dict):
+    def wait(self, state: dict):
         """wait."""
         if state.get('event', None) is not None:
             state['fusedmoe'].wait(state['event'])
diff --git a/lmdeploy/pytorch/nn/moe/route.py b/lmdeploy/pytorch/nn/moe/route.py
index 320f483149..f71fa5a55c 100644
--- a/lmdeploy/pytorch/nn/moe/route.py
+++ b/lmdeploy/pytorch/nn/moe/route.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Tuple
 
 import torch
 
@@ -34,6 +33,6 @@ def __init__(
         )
 
     def forward(self, router_logits: torch.Tensor,
-                e_score_correction_bias: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+                e_score_correction_bias: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
         """Router forward."""
         return self.impl.forward(router_logits, e_score_correction_bias)
diff --git a/lmdeploy/pytorch/nn/moe/w8a8.py b/lmdeploy/pytorch/nn/moe/w8a8.py
index cf31b98610..62ca258bde 100644
--- a/lmdeploy/pytorch/nn/moe/w8a8.py
+++ b/lmdeploy/pytorch/nn/moe/w8a8.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict, List, Optional
 
 import torch
 
@@ -19,7 +18,7 @@ def __init__(self,
                  out_features: int,
                  weight_type: str,
                  device: torch.device,
-                 expert_list: List[int] = None,
+                 expert_list: list[int] = None,
                  quant_dtype: torch.dtype = torch.int8):
         super().__init__(
             num_experts=num_experts,
@@ -75,9 +74,9 @@ def __init__(self,
                  num_experts: int,
                  top_k: int,
                  renormalize: bool = False,
-                 dtype: Optional[torch.dtype] = None,
-                 quant_dtype: Optional[torch.dtype] = torch.int8,
-                 device: Optional[torch.device] = None,
+                 dtype: torch.dtype | None = None,
+                 quant_dtype: torch.dtype | None = torch.int8,
+                 device: torch.device | None = None,
                  all_reduce: bool = True):
 
         device = device or torch.device('cpu')
@@ -133,7 +132,7 @@ def update_weights(self):
         self.gate_up.update_weight(gate_up_weights, gate_up_scale)
         self.down.update_weight(down_weights, down_scale)
 
-    def dispatch(self, state: Dict):
+    def dispatch(self, state: dict):
         """dispatch."""
         moe_type = state['moe_type']
         if moe_type == MoeType.Default:
@@ -151,7 +150,7 @@ def dispatch(self, state: Dict):
             raise NotImplementedError(f'Not supported moe type: {moe_type}')
         return recv_state
 
-    def gemm(self, state: Dict):
+    def gemm(self, state: dict):
         """gemm."""
         hidden_states = state['hidden_states']
         topk_weights = state['topk_weights']
@@ -161,7 +160,7 @@ def gemm(self, state: Dict):
                                 self.down.weight, self.down.scale, self.expert_list)
         return dict(hidden_states=ret, moe_type=state['moe_type'])
 
-    def combine(self, state: Dict):
+    def combine(self, state: dict):
         """combine."""
         moe_type = state['moe_type']
         if moe_type == MoeType.Default:
@@ -175,6 +174,6 @@ def combine(self, state: Dict):
             raise NotImplementedError(f'Not supported moe type: {moe_type}')
         return out_state
 
-    def wait(self, state: Dict):
+    def wait(self, state: dict):
         """wait."""
         raise NotImplementedError
diff --git a/lmdeploy/pytorch/nn/norm.py b/lmdeploy/pytorch/nn/norm.py
index 7e39ed4312..770493f9ab 100644
--- a/lmdeploy/pytorch/nn/norm.py
+++ b/lmdeploy/pytorch/nn/norm.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict
 
 import torch
 from torch import nn
@@ -20,7 +19,7 @@ def __init__(
         eps: float = 1e-6,
         dtype: torch.dtype | None = None,
         device: torch.device | None = None,
-        quant_config: Dict | None = None,
+        quant_config: dict | None = None,
         tp: bool = False,
         align: int = 1,
         prefix: str = '',
diff --git a/lmdeploy/pytorch/nn/rotary_embedding.py b/lmdeploy/pytorch/nn/rotary_embedding.py
index 740a39ed2e..3fa8cfda81 100644
--- a/lmdeploy/pytorch/nn/rotary_embedding.py
+++ b/lmdeploy/pytorch/nn/rotary_embedding.py
@@ -7,8 +7,13 @@
 from transformers import PretrainedConfig
 
 from ..backends import OpType, get_backend
-from ..backends.rotary_embedding import (FopeParameters, Llama3Parameters, LongRoPEScalingParameters, RopeType,
-                                         YarnParameters)
+from ..backends.rotary_embedding import (
+    FopeParameters,
+    Llama3Parameters,
+    LongRoPEScalingParameters,
+    RopeType,
+    YarnParameters,
+)
 
 
 def get_rope_parameters(config: PretrainedConfig):
diff --git a/lmdeploy/pytorch/paging/block_manager/base_block_manager.py b/lmdeploy/pytorch/paging/block_manager/base_block_manager.py
index 1aeeaed031..d1f59b66a6 100644
--- a/lmdeploy/pytorch/paging/block_manager/base_block_manager.py
+++ b/lmdeploy/pytorch/paging/block_manager/base_block_manager.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import time
-from typing import Dict
 
 import numpy as np
 
@@ -213,7 +212,7 @@ def __init__(self, num_gpu_blocks: int, num_cpu_blocks: int, num_gpu_reserved: i
 
         self.allocator = LogicalAllocator(num_cpu_blocks, num_gpu_blocks, num_gpu_reserved)
 
-        self.block_tables: Dict[int, BlockTable] = {}
+        self.block_tables: dict[int, BlockTable] = {}
 
     @classmethod
     def num_required_blocks(cls, obj: SchedulerSequence, prealloc_size: int = 0):
diff --git a/lmdeploy/pytorch/paging/block_trie.py b/lmdeploy/pytorch/paging/block_trie.py
index 2244f3f1b5..d20aa665d2 100644
--- a/lmdeploy/pytorch/paging/block_trie.py
+++ b/lmdeploy/pytorch/paging/block_trie.py
@@ -1,7 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import heapq
 from dataclasses import dataclass
-from typing import Dict, Set
 
 import numpy as np
 
@@ -33,8 +32,8 @@ def __init__(self, hash_key: int, block: int, tokens: np.ndarray, num_matched: i
         self.block = block
         self.tokens = tokens
         self.num_matched = num_matched
-        self.children: Dict[int, 'Node'] = dict()
-        self._parent: 'Node' = None
+        self.children: dict[int, Node] = dict()
+        self._parent: Node = None
 
     @property
     def parent(self):
@@ -67,8 +66,8 @@ def __init__(self, cache_config: CacheConfig, block_manager: BaseBlockManager):
         self.enable = self.cache_config.enable_prefix_caching
 
         # caches with different adapter should not be shared.
-        self._roots: Dict[str, Node] = dict()
-        self.leaves: Set[Node] = set()
+        self._roots: dict[str, Node] = dict()
+        self.leaves: set[Node] = set()
         self.stats = PrefixCacheStats()
 
     def hit_rate(self):
diff --git a/lmdeploy/pytorch/paging/eviction_helper/base_eviction_helper.py b/lmdeploy/pytorch/paging/eviction_helper/base_eviction_helper.py
index 3799d60a42..f075748f70 100644
--- a/lmdeploy/pytorch/paging/eviction_helper/base_eviction_helper.py
+++ b/lmdeploy/pytorch/paging/eviction_helper/base_eviction_helper.py
@@ -1,10 +1,9 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import List
 
 from ...messages import SchedulerSequence
 from ..scheduler import Scheduler
 
-SeqList = List[SchedulerSequence]
+SeqList = list[SchedulerSequence]
 
 
 class BaseEvictionHelper:
@@ -21,6 +20,6 @@ def need_swap_in(self, seq: SchedulerSequence):
         """Sequence need swap in."""
         raise NotImplementedError('Not implemented.')
 
-    def evict_for_seq(self, seq: SchedulerSequence, evictable_seqs: List[SchedulerSequence], prealloc_size: int):
+    def evict_for_seq(self, seq: SchedulerSequence, evictable_seqs: list[SchedulerSequence], prealloc_size: int):
         """Evict seqs."""
         raise NotImplementedError('Not implemented.')
diff --git a/lmdeploy/pytorch/paging/eviction_helper/recompute_eviction_helper.py b/lmdeploy/pytorch/paging/eviction_helper/recompute_eviction_helper.py
index bdded115dd..4976eddb20 100644
--- a/lmdeploy/pytorch/paging/eviction_helper/recompute_eviction_helper.py
+++ b/lmdeploy/pytorch/paging/eviction_helper/recompute_eviction_helper.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import List
 
 from ...messages import SchedulerSequence
 from ..scheduler import Scheduler
@@ -17,7 +16,7 @@ def __init__(self, scheduler: Scheduler):
         else:
             self.evict_for_seq = self._evict_for_ssm
 
-    def _evict_for_seq_default(self, seq: SchedulerSequence, evictable_seqs: List[SchedulerSequence],
+    def _evict_for_seq_default(self, seq: SchedulerSequence, evictable_seqs: list[SchedulerSequence],
                                prealloc_size: int):
         """Evict seqs."""
         block_manager = self.block_manager
@@ -56,7 +55,7 @@ def _evict_for_seq_default(self, seq: SchedulerSequence, evictable_seqs: List[Sc
 
         return success
 
-    def _evict_for_ssm(self, seq: SchedulerSequence, evictable_seqs: List[SchedulerSequence], prealloc_size: int):
+    def _evict_for_ssm(self, seq: SchedulerSequence, evictable_seqs: list[SchedulerSequence], prealloc_size: int):
         """Evict seqs."""
         block_manager = self.block_manager
         state_manager = self.state_manager
diff --git a/lmdeploy/pytorch/paging/scheduler.py b/lmdeploy/pytorch/paging/scheduler.py
index 9208b7cdf2..de42faca7d 100644
--- a/lmdeploy/pytorch/paging/scheduler.py
+++ b/lmdeploy/pytorch/paging/scheduler.py
@@ -4,7 +4,6 @@
 from collections import OrderedDict
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Dict, List
 
 from torch.profiler import record_function
 
@@ -20,8 +19,8 @@
 
 logger = get_logger('lmdeploy')
 
-MapType = Dict[int, int]
-SeqList = List[SchedulerSequence]
+MapType = dict[int, int]
+SeqList = list[SchedulerSequence]
 
 
 @dataclass
@@ -50,10 +49,10 @@ def __init__(
     ) -> None:
         self.scheduler_config = scheduler_config
         self.cache_config = cache_config
-        self.sessions: Dict[int, SchedulerSession] = OrderedDict()
+        self.sessions: dict[int, SchedulerSession] = OrderedDict()
 
         # For Disaggregation
-        self.locked_sessions: Dict[int, SchedulerSession] = OrderedDict()
+        self.locked_sessions: dict[int, SchedulerSession] = OrderedDict()
 
         self.block_manager = build_block_manager(cache_config)
         self.block_trie = BlockTrie(self.cache_config, self.block_manager)
diff --git a/lmdeploy/pytorch/ray.py b/lmdeploy/pytorch/ray.py
index 6f9261f317..bb575df98d 100644
--- a/lmdeploy/pytorch/ray.py
+++ b/lmdeploy/pytorch/ray.py
@@ -1,7 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import os
 import time
-from typing import Dict, List
 
 import ray
 from ray.util.placement_group import PlacementGroup
@@ -28,7 +27,7 @@ def get_device_str(device_type: str = None) -> str:
     return device_type
 
 
-def get_resource_kwargs(device_str: str, resource_used: float = 0.01) -> Dict[str, float]:
+def get_resource_kwargs(device_str: str, resource_used: float = 0.01) -> dict[str, float]:
     """Get resource kwargs."""
     if device_str == 'GPU':
         resource_kwargs = {'num_gpus': resource_used}
@@ -124,7 +123,7 @@ def init_ray_cluster(world_size: int, ray_address: str = None, dp: int = 1, devi
                 'The number of required %ss exceeds the total '
                 'number of available %ss in the placement group.', device_str, device_str)
         # Create a new placement group
-        placement_group_specs: List[Dict[str, float]] = ([{device_str: 1.0} for _ in range(world_size)])
+        placement_group_specs: list[dict[str, float]] = ([{device_str: 1.0} for _ in range(world_size)])
 
         # Pin at least one bundle to the local node.
         # This helps multi-node DP keep each dp_rank process's workers co-located with
diff --git a/lmdeploy/pytorch/spec_decode/base.py b/lmdeploy/pytorch/spec_decode/base.py
index 113f6b6ead..3ecfab5f82 100644
--- a/lmdeploy/pytorch/spec_decode/base.py
+++ b/lmdeploy/pytorch/spec_decode/base.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Dict
 
 import torch
 
@@ -52,7 +51,7 @@ def reset_graph_runner(self):
         'reset graph runner'
         pass
 
-    def update_main_model_outputs(self, output: Dict[str, torch.Tensor], model_inputs: ModelInputs):
+    def update_main_model_outputs(self, output: dict[str, torch.Tensor], model_inputs: ModelInputs):
         """Update outputs of main model."""
         if not self.is_enabled():
             hidden_states = output.pop('hidden_states')
diff --git a/lmdeploy/pytorch/spec_decode/proposers/base.py b/lmdeploy/pytorch/spec_decode/proposers/base.py
index aaac4e40ec..32bee78c6f 100644
--- a/lmdeploy/pytorch/spec_decode/proposers/base.py
+++ b/lmdeploy/pytorch/spec_decode/proposers/base.py
@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 import torch
 from mmengine import Registry
@@ -23,8 +23,8 @@
 def draft_model_forward(
     model: torch.nn.Module,
     inputs: ModelInputs,
-    model_config: Optional[ModelConfig] = None,
-    cache_engine: Optional[CacheEngine] = None,
+    model_config: ModelConfig | None = None,
+    cache_engine: CacheEngine | None = None,
 ):
     """Perform model forward."""
     stream = torch.cuda.current_stream()
@@ -86,7 +86,7 @@ def build_model(self, empty_init: bool, target_model: torch.nn.Module = None, bu
         self.target_model = target_model
 
     def get_outputs(self,
-                    model_outputs: Dict[str, torch.Tensor],
+                    model_outputs: dict[str, torch.Tensor],
                     model_inputs: ModelInputs,
                     extra_inputs: ExtraInputs = None):
         """Get outputs."""
@@ -103,7 +103,7 @@ def _forward(self, model_inputs: ModelInputs, cache_engine: CacheEngine = None):
         )
 
     def update_inputs_decoding(self, model_inputs: ModelInputs, extra_inputs: ExtraInputs, next_input_ids: torch.Tensor,
-                               target_hidden_states: torch.Tensor, model_metas: List[Any]):
+                               target_hidden_states: torch.Tensor, model_metas: list[Any]):
         """Update to decoding inputs."""
         model_inputs.is_decoding = True
         batch_size = model_inputs.seq_length.size(0)
diff --git a/lmdeploy/pytorch/spec_decode/proposers/deepseek_mtp.py b/lmdeploy/pytorch/spec_decode/proposers/deepseek_mtp.py
index de19beb761..09e4c08591 100644
--- a/lmdeploy/pytorch/spec_decode/proposers/deepseek_mtp.py
+++ b/lmdeploy/pytorch/spec_decode/proposers/deepseek_mtp.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict
 
 import torch
 
@@ -16,7 +15,7 @@
 class DeepseekMTP(BaseSpecProposer):
 
     def get_outputs(self,
-                    model_outputs: Dict[str, torch.Tensor],
+                    model_outputs: dict[str, torch.Tensor],
                     model_inputs: ModelInputs,
                     extra_inputs: ARSpecExtraInputs = None):
         """Get outputs."""
diff --git a/lmdeploy/pytorch/spec_decode/proposers/eagle3.py b/lmdeploy/pytorch/spec_decode/proposers/eagle3.py
index f032496f8f..db1011727e 100644
--- a/lmdeploy/pytorch/spec_decode/proposers/eagle3.py
+++ b/lmdeploy/pytorch/spec_decode/proposers/eagle3.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict
 
 import torch
 
@@ -32,7 +31,7 @@ def get_target_hidden_size(self, model_config: ModelConfig):
         return hidden_size * 3
 
     def get_outputs(self,
-                    model_outputs: Dict[str, torch.Tensor],
+                    model_outputs: dict[str, torch.Tensor],
                     model_inputs: ModelInputs,
                     extra_inputs: ExtraInputs = None):
         """Get outputs."""
diff --git a/lmdeploy/pytorch/spec_decode/reject_sampler.py b/lmdeploy/pytorch/spec_decode/reject_sampler.py
index b2c4e34946..8bccd258c6 100644
--- a/lmdeploy/pytorch/spec_decode/reject_sampler.py
+++ b/lmdeploy/pytorch/spec_decode/reject_sampler.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import enum
-from typing import Optional
 
 import torch
 from torch import LongTensor, Tensor, nn
@@ -24,7 +23,7 @@ def forward(
         target_logits: Tensor,
         draft_token_ids: LongTensor,
         bonus_token_ids: LongTensor,
-        draft_probs: Optional[Tensor] = None,
+        draft_probs: Tensor | None = None,
     ):
         """forward
         Args:
@@ -49,7 +48,7 @@ def rejection_sample(
     draft_token_ids: LongTensor,
     bonus_token_ids: LongTensor,
     sample_policy: SamplePolicy = SamplePolicy.ALL_GREEDY,
-    draft_probs: Optional[Tensor] = None,
+    draft_probs: Tensor | None = None,
 ):
     """rejection sample
     Args:
diff --git a/lmdeploy/pytorch/spec_decode/spec_agent.py b/lmdeploy/pytorch/spec_decode/spec_agent.py
index 51739d05d5..16addc9c98 100644
--- a/lmdeploy/pytorch/spec_decode/spec_agent.py
+++ b/lmdeploy/pytorch/spec_decode/spec_agent.py
@@ -131,7 +131,7 @@ async def _async_forward(self, inputs: ModelInputs):
         """Model forward.
 
         Args:
-            inputs (Dict): The input data comes from _make_inputs.
+            inputs (ModelInputs): The input data comes from _make_inputs.
         """
         output = self._forward_impl(inputs)
         await asyncio.sleep(0)
@@ -142,7 +142,7 @@ async def _async_model_forward(self, inputs: ModelInputs, extra_inputs: ARSpecEx
         """Model forward.
 
         Args:
-            inputs (Dict): The input data comes from _make_inputs.
+            inputs (ModelInputs): The input data comes from _make_inputs.
         """
         outputs = await self._async_forward(inputs)
         if inputs.is_chunk:
diff --git a/lmdeploy/pytorch/strategies/ar/__init__.py b/lmdeploy/pytorch/strategies/ar/__init__.py
index b593107c2e..d18cf4e952 100644
--- a/lmdeploy/pytorch/strategies/ar/__init__.py
+++ b/lmdeploy/pytorch/strategies/ar/__init__.py
@@ -5,12 +5,12 @@
 from lmdeploy.pytorch.strategies.base.sequence import SequenceStrategy
 
 if TYPE_CHECKING:
+    from lmdeploy.pytorch.config import CacheConfig, SchedulerConfig
     from lmdeploy.pytorch.strategies.base.cudagraph import CudagraphStrategy
+    from lmdeploy.pytorch.strategies.base.engine import EngineStrategy
+    from lmdeploy.pytorch.strategies.base.model_agent import ModelAgentStrategy
     from lmdeploy.pytorch.strategies.base.model_inputs import ModelInputsStrategy
     from lmdeploy.pytorch.strategies.base.sampling import SamplingStrategy
-    from lmdeploy.pytorch.strategies.base.model_agent import ModelAgentStrategy
-    from lmdeploy.pytorch.strategies.base.engine import EngineStrategy
-    from lmdeploy.pytorch.config import CacheConfig, SchedulerConfig
 
 from ..base import StrategyFactoryBase
 
diff --git a/lmdeploy/pytorch/strategies/ar/model_agent.py b/lmdeploy/pytorch/strategies/ar/model_agent.py
index 9c7abb5887..df11ac15a7 100644
--- a/lmdeploy/pytorch/strategies/ar/model_agent.py
+++ b/lmdeploy/pytorch/strategies/ar/model_agent.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any
 
 import torch
 import torch.distributed as dist
@@ -14,7 +14,7 @@
 
 from ..base.model_agent import ExtraInputs, ExtraOutputs, ModelAgentStrategy, StoppingCriteria
 
-SeqList = List[SchedulerSequence]
+SeqList = list[SchedulerSequence]
 
 
 def get_model_inputs_next_decoding(inputs: ModelInputs, input_ids: torch.Tensor, max_q_seqlen: int,
@@ -74,8 +74,8 @@ def update(self, delta: ModelInputsDelta):
     def step(self,
              token_ids: torch.Tensor,
              stop_words: torch.Tensor,
-             inputs: Optional[ModelInputs] = None,
-             extra_inputs: Optional[ARExtraInputs] = None):
+             inputs: ModelInputs | None = None,
+             extra_inputs: ARExtraInputs | None = None):
         """Check whether to stop generation."""
         num_appendable_ids = self.num_appendable_ids - 1
         stopped = num_appendable_ids <= 0
@@ -105,7 +105,7 @@ def slice_outputs(self, inputs: torch.Tensor, seq_length: torch.LongTensor) -> t
         return inputs[last_idx]
 
     def slice_extra_inputs(self, extra_inputs: ARExtraInputs, model_inputs: ModelInputs,
-                           model_outputs: Dict[str, torch.Tensor], **kwargs) -> ARExtraInputs:
+                           model_outputs: dict[str, torch.Tensor], **kwargs) -> ARExtraInputs:
         """Slice outputs."""
         return extra_inputs
 
@@ -145,7 +145,7 @@ def update_prefill_for_next_step(
         next_token_ids: torch.Tensor,
         model_metas: Any,
         extra_outputs: ARExtraOutputs,
-    ) -> Tuple['ModelInputs', ARExtraInputs]:
+    ) -> tuple['ModelInputs', ARExtraInputs]:
         """Step next decoding."""
         inputs = get_model_inputs_next_decoding(model_inputs, next_token_ids, max_q_seqlen=1, model_metas=model_metas)
         return inputs, extra_inputs
diff --git a/lmdeploy/pytorch/strategies/ar/model_inputs.py b/lmdeploy/pytorch/strategies/ar/model_inputs.py
index 7c1910311a..e44a73b628 100644
--- a/lmdeploy/pytorch/strategies/ar/model_inputs.py
+++ b/lmdeploy/pytorch/strategies/ar/model_inputs.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Optional
 
 import numpy as np
 import torch
@@ -93,10 +92,10 @@ def index_select(inputs: ModelInputs,
                      indices: torch.Tensor,
                      indice_cpu: np.ndarray = None,
                      block_offsets: torch.Tensor = None,
-                     max_q_seqlen: Optional[int] = None,
-                     max_kv_seqlen: Optional[int] = None,
-                     sum_kv_seqlen: Optional[int] = None,
-                     num_ignored_history: Optional[torch.Tensor] = None):
+                     max_q_seqlen: int | None = None,
+                     max_kv_seqlen: int | None = None,
+                     sum_kv_seqlen: int | None = None,
+                     num_ignored_history: torch.Tensor | None = None):
         """Index select."""
         assert inputs.is_decoding, 'Only support index_select in decoding.'
 
diff --git a/lmdeploy/pytorch/strategies/ar/sequence.py b/lmdeploy/pytorch/strategies/ar/sequence.py
index b9b277f961..affb3205d2 100644
--- a/lmdeploy/pytorch/strategies/ar/sequence.py
+++ b/lmdeploy/pytorch/strategies/ar/sequence.py
@@ -1,20 +1,28 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import time
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 import numpy as np
 from torch import Tensor
 
 from lmdeploy.pytorch.disagg.conn.protocol import MigrationRequest
 from lmdeploy.pytorch.engine.model_agent import BatchedOutputs
-from lmdeploy.pytorch.messages import (InputEmbeddings, MessageStatus, MultiModalInputs, SamplingParam,
-                                       SchedulerSequence, SchedulerSession, UpdateTokenMode, _to_ndarray)
+from lmdeploy.pytorch.messages import (
+    InputEmbeddings,
+    MessageStatus,
+    MultiModalInputs,
+    SamplingParam,
+    SchedulerSequence,
+    SchedulerSession,
+    UpdateTokenMode,
+    _to_ndarray,
+)
 from lmdeploy.pytorch.model_inputs import ModelInputs, ModelInputsDelta
 
 from ..base.sequence import SequenceStrategy
 
-SeqList = List[SchedulerSequence]
+SeqList = list[SchedulerSequence]
 
 
 @dataclass
@@ -23,8 +31,8 @@ class SchedulerSequenceDefault(SchedulerSequence):
     def update_token_ids(self,
                          token_ids: Tensor,
                          multimodals: MultiModalInputs = None,
-                         embeddings: List[InputEmbeddings] = None,
-                         model_meta: Dict[str, Any] = None,
+                         embeddings: list[InputEmbeddings] = None,
+                         model_meta: dict[str, Any] = None,
                          mode: UpdateTokenMode = UpdateTokenMode.INPUTS,
                          routed_experts: np.ndarray = None,
                          **kwargs):
@@ -85,7 +93,7 @@ def make_sequence(self,
                       session: 'SchedulerSession',
                       sampling_param: 'SamplingParam' = None,
                       adapter_name: str = None,
-                      migration_request: Optional[MigrationRequest] = None,
+                      migration_request: MigrationRequest | None = None,
                       resp_cache: bool = False,
                       preserve_cache: bool = False) -> 'SchedulerSequence':
         """Make sequence."""
diff --git a/lmdeploy/pytorch/strategies/ar_spec/__init__.py b/lmdeploy/pytorch/strategies/ar_spec/__init__.py
index 416d20460c..5f692e33a0 100644
--- a/lmdeploy/pytorch/strategies/ar_spec/__init__.py
+++ b/lmdeploy/pytorch/strategies/ar_spec/__init__.py
@@ -5,12 +5,12 @@
 from lmdeploy.pytorch.strategies.base.sequence import SequenceStrategy
 
 if TYPE_CHECKING:
+    from lmdeploy.pytorch.config import CacheConfig, SchedulerConfig
     from lmdeploy.pytorch.strategies.base.cudagraph import CudagraphStrategy
+    from lmdeploy.pytorch.strategies.base.engine import EngineStrategy
+    from lmdeploy.pytorch.strategies.base.model_agent import ModelAgentStrategy
     from lmdeploy.pytorch.strategies.base.model_inputs import ModelInputsStrategy
     from lmdeploy.pytorch.strategies.base.sampling import SamplingStrategy
-    from lmdeploy.pytorch.strategies.base.model_agent import ModelAgentStrategy
-    from lmdeploy.pytorch.strategies.base.engine import EngineStrategy
-    from lmdeploy.pytorch.config import CacheConfig, SchedulerConfig
 
 from ..base import StrategyFactoryBase
 
diff --git a/lmdeploy/pytorch/strategies/ar_spec/model_agent.py b/lmdeploy/pytorch/strategies/ar_spec/model_agent.py
index eeb2e5934e..08914072cc 100644
--- a/lmdeploy/pytorch/strategies/ar_spec/model_agent.py
+++ b/lmdeploy/pytorch/strategies/ar_spec/model_agent.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any
 
 import torch
 import torch.distributed as dist
@@ -15,7 +15,7 @@
 from ..ar.model_agent import ARStoppingCriteria, get_model_inputs_next_decoding
 from ..base.model_agent import ExtraInputs, ExtraOutputs, ModelAgentStrategy
 
-SeqList = List[SchedulerSequence]
+SeqList = list[SchedulerSequence]
 
 
 @dataclass
@@ -84,8 +84,8 @@ def update(self, delta: ModelInputsDelta):
     def step(self,
              next_token_ids: torch.Tensor,
              stop_words: torch.Tensor,
-             inputs: Optional[ModelInputs] = None,
-             extra_inputs: Optional[ARSpecExtraInputs] = None):
+             inputs: ModelInputs | None = None,
+             extra_inputs: ARSpecExtraInputs | None = None):
         """Check whether to stop generation."""
         token_ids = extra_inputs.output_token_ids
 
@@ -128,7 +128,7 @@ def slice_outputs(self, inputs: torch.Tensor, seq_length: torch.LongTensor) -> t
         return inputs[last_idx]
 
     def slice_extra_inputs(self, extra_inputs: ARSpecExtraInputs, model_inputs: ModelInputs,
-                           model_outputs: Dict[str, torch.Tensor], **kwargs) -> ARSpecExtraInputs:
+                           model_outputs: dict[str, torch.Tensor], **kwargs) -> ARSpecExtraInputs:
         """Slice outputs."""
         extra_inputs = ARSpecExtraInputs()
         extra_inputs.target_hidden_states = model_outputs.get('hidden_states')
@@ -178,7 +178,7 @@ def update_prefill_for_next_step(
         next_token_ids: torch.Tensor,
         model_metas: Any,
         extra_outputs: ARSpecExtraOutputs,
-    ) -> Tuple['ModelInputs', ARSpecExtraInputs]:
+    ) -> tuple['ModelInputs', ARSpecExtraInputs]:
         """Step next decoding."""
         next_token_ids = next_token_ids[:, None]
         next_token_ids = torch.cat([next_token_ids, extra_outputs.draft_token_ids], dim=-1)
diff --git a/lmdeploy/pytorch/strategies/ar_spec/sequence.py b/lmdeploy/pytorch/strategies/ar_spec/sequence.py
index 7089bce3d0..ecb095db72 100644
--- a/lmdeploy/pytorch/strategies/ar_spec/sequence.py
+++ b/lmdeploy/pytorch/strategies/ar_spec/sequence.py
@@ -1,20 +1,27 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import time
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 import numpy as np
 from torch import Tensor
 
 from lmdeploy.pytorch.disagg.conn.protocol import MigrationRequest
 from lmdeploy.pytorch.engine.model_agent import BatchedOutputs
-from lmdeploy.pytorch.messages import (InputEmbeddings, MessageStatus, MultiModalInputs, SamplingParam,
-                                       SchedulerSession, UpdateTokenMode, _to_ndarray)
+from lmdeploy.pytorch.messages import (
+    InputEmbeddings,
+    MessageStatus,
+    MultiModalInputs,
+    SamplingParam,
+    SchedulerSession,
+    UpdateTokenMode,
+    _to_ndarray,
+)
 from lmdeploy.pytorch.model_inputs import ModelInputs, ModelInputsDelta
 
 from ..ar.sequence import ARSequenceStrategy, SchedulerSequenceDefault
 
-SeqList = List['SchedulerSequenceARSpec']
+SeqList = list['SchedulerSequenceARSpec']
 
 
 @dataclass
@@ -110,8 +117,8 @@ def _update_token_ids_decode(self, token_ids: np.ndarray, draft_token_ids: np.nd
     def update_token_ids(self,
                          token_ids: Tensor,
                          multimodals: MultiModalInputs = None,
-                         embeddings: List[InputEmbeddings] = None,
-                         model_meta: Dict[str, Any] = None,
+                         embeddings: list[InputEmbeddings] = None,
+                         model_meta: dict[str, Any] = None,
                          draft_token_ids: Tensor = None,
                          mode: UpdateTokenMode = UpdateTokenMode.INPUTS,
                          **kwargs):
@@ -144,7 +151,7 @@ def make_sequence(self,
                       session: 'SchedulerSession',
                       sampling_param: 'SamplingParam' = None,
                       adapter_name: str = None,
-                      migration_request: Optional[MigrationRequest] = None,
+                      migration_request: MigrationRequest | None = None,
                       resp_cache: bool = False,
                       preserve_cache: bool = False) -> 'SchedulerSequenceARSpec':
         """Make sequence."""
diff --git a/lmdeploy/pytorch/strategies/base/model_agent.py b/lmdeploy/pytorch/strategies/base/model_agent.py
index 471f6e5a66..1a7796fff0 100644
--- a/lmdeploy/pytorch/strategies/base/model_agent.py
+++ b/lmdeploy/pytorch/strategies/base/model_agent.py
@@ -2,7 +2,7 @@
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from dataclasses import dataclass, fields
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Optional
 
 import numpy as np
 import torch
@@ -12,7 +12,7 @@
     from lmdeploy.pytorch.engine.logits_process import SamplingInputs
     from lmdeploy.pytorch.messages import SchedulerSequence
     from lmdeploy.pytorch.model_inputs import ModelInputs, ModelInputsDelta
-    SeqList = List[SchedulerSequence]
+    SeqList = list[SchedulerSequence]
 
 
 def to_device(self, device: str, non_blocking: bool = False):
@@ -103,7 +103,7 @@ def step(self,
              token_ids: torch.Tensor,
              stop_words: torch.Tensor,
              inputs: Optional['ModelInputs'] = None,
-             extra_inputs: Optional[ExtraInputs] = None):
+             extra_inputs: ExtraInputs | None = None):
         """Check whether to stop generation."""
         pass
 
@@ -122,7 +122,7 @@ def slice_outputs(self, inputs: torch.Tensor, seq_length: torch.LongTensor) -> t
 
     @abstractmethod
     def slice_extra_inputs(self, extra_inputs: ExtraInputs, model_inputs: 'ModelInputs',
-                           model_outputs: Dict[str, torch.Tensor], **kwargs) -> ExtraInputs:
+                           model_outputs: dict[str, torch.Tensor], **kwargs) -> ExtraInputs:
         """Slice outputs."""
         pass
 
@@ -163,14 +163,14 @@ def update_prefill_for_next_step(
         next_token_ids: torch.Tensor,
         model_metas: Any,
         extra_outputs: ExtraOutputs,
-    ) -> Tuple['ModelInputs', ExtraInputs]:
+    ) -> tuple['ModelInputs', ExtraInputs]:
         """Step next decoding."""
         pass
 
     @abstractmethod
     def update_decoding_for_next_step(self, model_inputs: 'ModelInputs', next_token_ids: torch.Tensor, model_metas: Any,
                                       extra_inputs: ExtraInputs,
-                                      extra_outputs: ExtraOutputs) -> Tuple['ModelInputs', ExtraInputs]:
+                                      extra_outputs: ExtraOutputs) -> tuple['ModelInputs', ExtraInputs]:
         """Step next inputs."""
         pass
 
diff --git a/lmdeploy/pytorch/strategies/base/sampling.py b/lmdeploy/pytorch/strategies/base/sampling.py
index 2948627870..bf6c4aac78 100644
--- a/lmdeploy/pytorch/strategies/base/sampling.py
+++ b/lmdeploy/pytorch/strategies/base/sampling.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from abc import ABC, abstractmethod
-from typing import List
 
 import torch
 
@@ -10,7 +9,7 @@
 
 from .model_agent import ExtraInputs
 
-SeqList = List[SchedulerSequence]
+SeqList = list[SchedulerSequence]
 
 
 class SamplingStrategy(ABC):
diff --git a/lmdeploy/pytorch/strategies/base/sequence.py b/lmdeploy/pytorch/strategies/base/sequence.py
index 8a19e69356..46fec916af 100644
--- a/lmdeploy/pytorch/strategies/base/sequence.py
+++ b/lmdeploy/pytorch/strategies/base/sequence.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING
 
 from lmdeploy.pytorch.disagg.conn.protocol import MigrationRequest
 
@@ -8,7 +8,7 @@
     from lmdeploy.pytorch.engine.model_agent import BatchedOutputs
     from lmdeploy.pytorch.messages import SamplingParam, SchedulerSequence, SchedulerSession
     from lmdeploy.pytorch.model_inputs import ModelInputs, ModelInputsDelta
-    SeqList = List[SchedulerSequence]
+    SeqList = list[SchedulerSequence]
 
 
 class SequenceStrategy(ABC):
@@ -19,7 +19,7 @@ def make_sequence(self,
                       session: 'SchedulerSession',
                       sampling_param: 'SamplingParam' = None,
                       adapter_name: str = None,
-                      migration_request: Optional[MigrationRequest] = None,
+                      migration_request: MigrationRequest | None = None,
                       resp_cache: bool = False,
                       preserve_cache: bool = False) -> 'SchedulerSequence':
         """Make sequence."""
diff --git a/lmdeploy/pytorch/strategies/dllm/__init__.py b/lmdeploy/pytorch/strategies/dllm/__init__.py
index dc0395a017..e1c9b9adcd 100644
--- a/lmdeploy/pytorch/strategies/dllm/__init__.py
+++ b/lmdeploy/pytorch/strategies/dllm/__init__.py
@@ -6,12 +6,12 @@
 from lmdeploy.utils import get_logger
 
 if TYPE_CHECKING:
+    from lmdeploy.pytorch.config import CacheConfig, SchedulerConfig
     from lmdeploy.pytorch.strategies.base.cudagraph import CudagraphStrategy
+    from lmdeploy.pytorch.strategies.base.engine import EngineStrategy
+    from lmdeploy.pytorch.strategies.base.model_agent import ModelAgentStrategy
     from lmdeploy.pytorch.strategies.base.model_inputs import ModelInputsStrategy
     from lmdeploy.pytorch.strategies.base.sampling import SamplingStrategy
-    from lmdeploy.pytorch.strategies.base.model_agent import ModelAgentStrategy
-    from lmdeploy.pytorch.strategies.base.engine import EngineStrategy
-    from lmdeploy.pytorch.config import CacheConfig, SchedulerConfig
 
 from ..base import StrategyFactoryBase
 
diff --git a/lmdeploy/pytorch/strategies/dllm/model_agent.py b/lmdeploy/pytorch/strategies/dllm/model_agent.py
index e1588300a2..3371997341 100644
--- a/lmdeploy/pytorch/strategies/dllm/model_agent.py
+++ b/lmdeploy/pytorch/strategies/dllm/model_agent.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any
 
 import numpy as np
 import torch
@@ -18,7 +18,7 @@
 from ..base.model_agent import ExtraInputs, ExtraOutputs, ModelAgentStrategy, StoppingCriteria
 from .unmasking import UnmaskingProcessor
 
-SeqList = List[SchedulerSequence]
+SeqList = list[SchedulerSequence]
 
 
 def get_model_inputs_next_decoding(inputs: ModelInputs, input_ids: torch.Tensor, max_q_seqlen,
@@ -121,8 +121,8 @@ def update(self, delta: 'ModelInputsDelta') -> 'DLLMStoppingCriteria':
     def step(self,
              token_ids: torch.Tensor,
              stop_words: torch.Tensor,
-             inputs: Optional[ModelInputs] = None,
-             extra_inputs: Optional[DLLMExtraInputs] = None):
+             inputs: ModelInputs | None = None,
+             extra_inputs: DLLMExtraInputs | None = None):
         """Check whether to stop generation."""
         num_appendable_ids = self.num_appendable_ids
         output_start_pos = self.output_start_pos
@@ -198,7 +198,7 @@ def slice_outputs(self, inputs: torch.Tensor, seq_length: torch.LongTensor) -> t
         return inputs
 
     def slice_extra_inputs(self, extra_inputs: DLLMExtraInputs, model_inputs: ModelInputs,
-                           model_outputs: Dict[str, torch.Tensor], **kwargs) -> DLLMExtraInputs:
+                           model_outputs: dict[str, torch.Tensor], **kwargs) -> DLLMExtraInputs:
         """Slice outputs."""
         dllm_mask = self.slice_outputs(extra_inputs.dllm_mask, model_inputs.seq_length)
         return DLLMExtraInputs(dllm_mask=dllm_mask)
@@ -269,7 +269,7 @@ def update_prefill_for_next_step(
         next_token_ids: torch.Tensor,
         model_metas: Any,
         extra_outputs: DLLMExtraOutputs,
-    ) -> Tuple['ModelInputs', DLLMExtraInputs]:
+    ) -> tuple['ModelInputs', DLLMExtraInputs]:
         """Step next decoding."""
         dllm_mask = extra_outputs.dllm_mask
         next_token_ids, dllm_mask, step_seqlens = self._update_dllm(next_token_ids, dllm_mask, model_inputs.seq_length)
diff --git a/lmdeploy/pytorch/strategies/dllm/sampling.py b/lmdeploy/pytorch/strategies/dllm/sampling.py
index d7c8bc4716..ab5174f017 100644
--- a/lmdeploy/pytorch/strategies/dllm/sampling.py
+++ b/lmdeploy/pytorch/strategies/dllm/sampling.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import List
 
 import numpy as np
 import torch
@@ -12,7 +11,7 @@
 from ..ar.sampling import ARSamplingStrategy
 from .model_agent import DLLMExtraInputs
 
-SeqList = List[SchedulerSequence]
+SeqList = list[SchedulerSequence]
 
 
 class DLLMSamplingStrategy(ARSamplingStrategy):
diff --git a/lmdeploy/pytorch/strategies/dllm/sequence.py b/lmdeploy/pytorch/strategies/dllm/sequence.py
index 03ad19e75d..4b6ac470b4 100644
--- a/lmdeploy/pytorch/strategies/dllm/sequence.py
+++ b/lmdeploy/pytorch/strategies/dllm/sequence.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import time
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 import numpy as np
 from torch import Tensor
@@ -9,14 +9,22 @@
 from lmdeploy.pytorch import consts
 from lmdeploy.pytorch.disagg.conn.protocol import MigrationRequest
 from lmdeploy.pytorch.engine.model_agent import BatchedOutputs
-from lmdeploy.pytorch.messages import (HistoryTokenIds, InputEmbeddings, MessageStatus, MultiModalInputs, SamplingParam,
-                                       SchedulerSession, UpdateTokenMode, _to_ndarray)
+from lmdeploy.pytorch.messages import (
+    HistoryTokenIds,
+    InputEmbeddings,
+    MessageStatus,
+    MultiModalInputs,
+    SamplingParam,
+    SchedulerSession,
+    UpdateTokenMode,
+    _to_ndarray,
+)
 from lmdeploy.pytorch.model_inputs import ModelInputs, ModelInputsDelta
 
 from ..ar.sequence import SchedulerSequenceDefault
 from ..base.sequence import SequenceStrategy
 
-SeqList = List['SchedulerSequenceDLLM']
+SeqList = list['SchedulerSequenceDLLM']
 
 DLLM_MASKED = consts.DLLM_MASKED
 DLLM_UNMASKED = consts.DLLM_UNMASKED
@@ -165,8 +173,8 @@ def _update_token_ids_prefill(self, token_ids: np.ndarray, dllm_mask: np.ndarray
     def update_token_ids(self,
                          token_ids: Tensor,
                          multimodals: MultiModalInputs = None,
-                         embeddings: List[InputEmbeddings] = None,
-                         model_meta: Dict[str, Any] = None,
+                         embeddings: list[InputEmbeddings] = None,
+                         model_meta: dict[str, Any] = None,
                          dllm_mask: Tensor = None,
                          mode: UpdateTokenMode = UpdateTokenMode.INPUTS,
                          **kwargs):
@@ -216,7 +224,7 @@ def make_sequence(self,
                       session: 'SchedulerSession',
                       sampling_param: 'SamplingParam' = None,
                       adapter_name: str = None,
-                      migration_request: Optional[MigrationRequest] = None,
+                      migration_request: MigrationRequest | None = None,
                       resp_cache: bool = False,
                       preserve_cache: bool = False) -> 'SchedulerSequenceDLLM':
         """Make sequence."""
diff --git a/lmdeploy/pytorch/tools/utils.py b/lmdeploy/pytorch/tools/utils.py
index 6d0c9d836c..8b71f75859 100644
--- a/lmdeploy/pytorch/tools/utils.py
+++ b/lmdeploy/pytorch/tools/utils.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from contextlib import contextmanager
-from typing import List
 
 
 class Timer:
@@ -187,7 +186,7 @@ def _print_meta(out: Response):
     # Main loop
     print(colored('━' * term_size, border_color))
 
-    outputs: List[Response] = outputs
+    outputs: list[Response] = outputs
     for idx, out in enumerate(outputs):
         header = f'OUTPUT [{idx + 1}/{len(outputs)}]'
         header_formatted = colored(f'✦ {header}', 'light_magenta', attrs=['bold'])
diff --git a/lmdeploy/pytorch/transformers/__init__.py b/lmdeploy/pytorch/transformers/__init__.py
index bfafdb1899..994d8ffd0d 100644
--- a/lmdeploy/pytorch/transformers/__init__.py
+++ b/lmdeploy/pytorch/transformers/__init__.py
@@ -6,7 +6,7 @@
 from lmdeploy.utils import get_logger
 
 
-@lru_cache()
+@lru_cache
 def register_config(model_type: str):
     if model_type == 'deepseek_v32':
         from lmdeploy.pytorch.transformers.configuration_deepseek_v32 import DeepseekV32Config
diff --git a/lmdeploy/pytorch/utils.py b/lmdeploy/pytorch/utils.py
index c72aafe7da..9c60d2d001 100644
--- a/lmdeploy/pytorch/utils.py
+++ b/lmdeploy/pytorch/utils.py
@@ -2,9 +2,10 @@
 # modify from: https://github.com/vllm-project/vllm
 import asyncio
 import inspect
+from collections.abc import Sequence
 from contextlib import contextmanager
 from inspect import Parameter, Signature
-from typing import Dict, Generic, Optional, Sequence, TypeVar
+from typing import Generic, TypeVar
 
 import psutil
 
@@ -26,7 +27,7 @@ def get_cpu_memory() -> int:
     return psutil.virtual_memory().total
 
 
-def bind_sigature(input_names: str, args: Sequence, kwargs: Dict):
+def bind_sigature(input_names: str, args: Sequence, kwargs: dict):
     """Bind args and kwargs to given input names."""
     kind = inspect._ParameterKind.POSITIONAL_OR_KEYWORD
 
@@ -59,14 +60,14 @@ def get_instance(*args, **kwargs):
 class CtxMgrBase(Generic[T]):
     """Context manager base class."""
 
-    def __init__(self, default: Optional[T] = None):
+    def __init__(self, default: T | None = None):
         self._context = default
 
-    def current_context(self) -> Optional[T]:
+    def current_context(self) -> T | None:
         """Get current context."""
         return self._context
 
-    def set_context(self, context: Optional[T]):
+    def set_context(self, context: T | None):
         """Set current context."""
         self._context = context
 
diff --git a/lmdeploy/pytorch/weight_loader/model_weight_loader.py b/lmdeploy/pytorch/weight_loader/model_weight_loader.py
index 4b6d040a8d..cf2adde982 100644
--- a/lmdeploy/pytorch/weight_loader/model_weight_loader.py
+++ b/lmdeploy/pytorch/weight_loader/model_weight_loader.py
@@ -68,7 +68,7 @@ def _get_weight_map(model_path: str, weight_type: str):
     else:
         raise RuntimeError(f'Unsupported weight type: {weight_type}.')
 
-    with open(load_index, mode='r', encoding='utf-8') as f:
+    with open(load_index, encoding='utf-8') as f:
         index = json.load(f)
 
     weight_map = index['weight_map']
diff --git a/lmdeploy/serve/core/async_engine.py b/lmdeploy/serve/core/async_engine.py
index 05c3485fdb..4a4f466d92 100644
--- a/lmdeploy/serve/core/async_engine.py
+++ b/lmdeploy/serve/core/async_engine.py
@@ -6,19 +6,29 @@
 import random
 from contextlib import asynccontextmanager
 from copy import deepcopy
-from typing import Any, Dict, List, Literal
+from typing import Any, Literal
 
 import torch
 
 from lmdeploy.archs import get_model_arch
 from lmdeploy.logger import RequestLogger
-from lmdeploy.messages import (EngineOutput, GenerationConfig, PytorchEngineConfig, Response, ResponseType,
-                               SpeculativeConfig, TurbomindEngineConfig)
+from lmdeploy.messages import (
+    EngineOutput,
+    GenerationConfig,
+    PytorchEngineConfig,
+    Response,
+    ResponseType,
+    SpeculativeConfig,
+    TurbomindEngineConfig,
+)
 from lmdeploy.metrics.metrics_processor import metrics_processor
 from lmdeploy.metrics.stats import IterationStats, RequestStats, SpeculativeDecodingStats
 from lmdeploy.model import ChatTemplateConfig, get_chat_template
-from lmdeploy.pytorch.disagg.conn.protocol import (DistServeConnectionRequest, DistServeDropConnectionRequest,
-                                                   DistServeInitRequest)
+from lmdeploy.pytorch.disagg.conn.protocol import (
+    DistServeConnectionRequest,
+    DistServeDropConnectionRequest,
+    DistServeInitRequest,
+)
 from lmdeploy.serve.managers import Session, SessionManager
 from lmdeploy.serve.processors import MultimodalProcessor
 from lmdeploy.tokenizer import DetokenizeState, Tokenizer
@@ -37,11 +47,11 @@ class GenOut:
     input_token_len: int
     generate_token_len: int
     finish_reason: Literal['stop', 'length', 'error'] | None = None
-    token_ids: List[int] | None = None
-    logprobs: List[Dict[int, float]] | None = None
+    token_ids: list[int] | None = None
+    logprobs: list[dict[int, float]] | None = None
     logits: Any = None
     last_hidden_state: Any = None
-    cache_block_ids: List[int] | None = None  # for disaggregation
+    cache_block_ids: list[int] | None = None  # for disaggregation
     routed_experts: Any = None  # for RL router replay
 
     def to_response(self, index: int = 0) -> Response:
@@ -218,7 +228,7 @@ def sleep(self, level: int = 1):
         self.sleeping_tags = {'weights', 'kv_cache'}
         self.is_sleeping = True
 
-    def wakeup(self, tags: List[str] | None = None):
+    def wakeup(self, tags: list[str] | None = None):
         """Wake up the model.
 
         Args:
@@ -282,7 +292,7 @@ async def generate(
             messages,
             session_id: int | Session,
             gen_config: GenerationConfig | None = None,
-            tools: List[object] | None = None,
+            tools: list[object] | None = None,
             reasoning_effort: Literal['low', 'medium', 'high'] | None = None,
             stream_response: bool = True,
             sequence_start: bool = True,
@@ -291,10 +301,10 @@ async def generate(
             do_preprocess: bool = True,
             adapter_name: str | None = None,
             rewind_stop_tokens: bool = False,
-            input_ids: List | None = None,
+            input_ids: list | None = None,
             enable_thinking: bool | None = None,
-            chat_template_kwargs: Dict | None = None,
-            mm_processor_kwargs: Dict[str, Any] | None = None,
+            chat_template_kwargs: dict | None = None,
+            mm_processor_kwargs: dict[str, Any] | None = None,
             **kwargs):
         """Generate responses.
 
@@ -577,21 +587,21 @@ def free_cache(self, session_id: int):
     def p2p_initialize(self, init_request: DistServeInitRequest):
         return self.engine.p2p_initialize(init_request)
 
-    def p2p_connect(self, conn_request: List[DistServeConnectionRequest]):
+    def p2p_connect(self, conn_request: list[DistServeConnectionRequest]):
         return self.engine.p2p_connect(conn_request)
 
-    def p2p_drop_connect(self, drop_conn_request: List[DistServeDropConnectionRequest]):
+    def p2p_drop_connect(self, drop_conn_request: list[DistServeDropConnectionRequest]):
         return self.engine.p2p_drop_connect(drop_conn_request)
 
     """ DistServe Async Engine API End """
 
-    async def async_get_reward_score(self, input_ids: List) -> List[float]:
+    async def async_get_reward_score(self, input_ids: list) -> list[float]:
         """Async version of get_reward_score."""
         supported_reward_models = ['InternLM2ForRewardModel', 'Qwen2ForRewardModel']
         if self.arch not in supported_reward_models:
             raise ValueError(f'{self.arch} is not in reward model list: {supported_reward_models}')
-        assert isinstance(input_ids, List)
-        assert all(isinstance(x, int) for x in input_ids) or all(isinstance(x, List) for x in input_ids)
+        assert isinstance(input_ids, list)
+        assert all(isinstance(x, int) for x in input_ids) or all(isinstance(x, list) for x in input_ids)
         # Make input_ids a list of token_id list
         input_ids = [input_ids] if isinstance(input_ids[0], int) else input_ids
 
@@ -603,10 +613,10 @@ async def async_get_reward_score(self, input_ids: List) -> List[float]:
 
     async def async_get_logits(self,
                                input_ids,
-                               sessions: List['Session'] | None = None,
+                               sessions: list['Session'] | None = None,
                                sequence_start: bool = True,
-                               sequence_end: bool = True) -> List[torch.Tensor]:
-        assert input_ids and all(isinstance(_, List) for _ in input_ids)
+                               sequence_end: bool = True) -> list[torch.Tensor]:
+        assert input_ids and all(isinstance(_, list) for _ in input_ids)
         assert sessions is None or (len(sessions) == len(input_ids))
 
         logits = [None] * len(input_ids)
diff --git a/lmdeploy/serve/managers/session_manager.py b/lmdeploy/serve/managers/session_manager.py
index 7dfefc767c..0ac7e1465f 100644
--- a/lmdeploy/serve/managers/session_manager.py
+++ b/lmdeploy/serve/managers/session_manager.py
@@ -5,7 +5,7 @@
 import itertools
 import weakref
 from contextlib import asynccontextmanager
-from typing import Any, List, Tuple
+from typing import Any
 
 from lmdeploy.messages import GenerationConfig, Response
 from lmdeploy.serve.core.exceptions import SafeRunException
@@ -21,7 +21,7 @@ def __init__(self, session_id: int, session_mgr: SessionManager, **kwargs):
         self.session_id = session_id
         self.prompt: Any = None
         self.response: Response | None = None
-        self.history: List[Tuple[Any, str]] = []
+        self.history: list[tuple[Any, str]] = []
         self.gen_config: GenerationConfig | None = None
         self.step: int = 0
         # event to wait for the session to be active
diff --git a/lmdeploy/serve/openai/api_client.py b/lmdeploy/serve/openai/api_client.py
index b03161f261..fbe5eaf8b8 100644
--- a/lmdeploy/serve/openai/api_client.py
+++ b/lmdeploy/serve/openai/api_client.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import json
-from typing import Any, Dict, List, Optional, Union
+from typing import Any
 
 import requests
 
@@ -45,7 +45,7 @@ class APIClient:
             api key will be used.
     """
 
-    def __init__(self, api_server_url: str, api_key: Optional[str] = None, **kwargs):
+    def __init__(self, api_server_url: str, api_key: str | None = None, **kwargs):
         self.api_server_url = api_server_url
         self.chat_completions_v1_url = f'{api_server_url}/v1/chat/completions'
         self.completions_v1_url = f'{api_server_url}/v1/completions'
@@ -66,13 +66,13 @@ def available_models(self):
         return self._available_models
 
     def encode(self,
-               input: Union[str, List[str]],
-               do_preprocess: Optional[bool] = False,
-               add_bos: Optional[bool] = True):
+               input: str | list[str],
+               do_preprocess: bool | None = False,
+               add_bos: bool | None = True):
         """Encode prompts.
 
         Args:
-            input: the prompt to be encoded. In str or List[str] format.
+            input: the prompt to be encoded. In str or list[str] format.
             do_preprocess: whether do preprocess or not. Default to False.
             add_bos: True when it is the beginning of a conversation. False
                 when it is not. Default to True.
@@ -90,28 +90,28 @@ def encode(self,
     def chat_completions_v1(
         self,
         model: str,
-        messages: Union[str, List[Dict[str, str]]],
-        temperature: Optional[float] = 0.7,
-        top_p: Optional[float] = 1.0,
-        logprobs: Optional[bool] = False,
-        top_logprobs: Optional[int] = 0,
-        n: Optional[int] = 1,
-        max_completion_tokens: Optional[int] = None,
-        max_tokens: Optional[int] = None,
-        stop: Optional[Union[str, List[str]]] = None,
-        stream: Optional[bool] = False,
-        presence_penalty: Optional[float] = 0.0,
-        frequency_penalty: Optional[float] = 0.0,
-        user: Optional[str] = None,
-        repetition_penalty: Optional[float] = 1.0,
-        ignore_eos: Optional[bool] = False,
-        skip_special_tokens: Optional[bool] = True,
-        spaces_between_special_tokens: Optional[bool] = True,
+        messages: str | list[dict[str, str]],
+        temperature: float | None = 0.7,
+        top_p: float | None = 1.0,
+        logprobs: bool | None = False,
+        top_logprobs: int | None = 0,
+        n: int | None = 1,
+        max_completion_tokens: int | None = None,
+        max_tokens: int | None = None,
+        stop: str | list[str] | None = None,
+        stream: bool | None = False,
+        presence_penalty: float | None = 0.0,
+        frequency_penalty: float | None = 0.0,
+        user: str | None = None,
+        repetition_penalty: float | None = 1.0,
+        ignore_eos: bool | None = False,
+        skip_special_tokens: bool | None = True,
+        spaces_between_special_tokens: bool | None = True,
         top_k: int = 40,
-        min_new_tokens: Optional[int] = None,
+        min_new_tokens: int | None = None,
         min_p: float = 0.0,
-        logit_bias: Optional[Dict[str, float]] = None,
-        stream_options: Optional[Dict] = None,
+        logit_bias: dict[str, float] | None = None,
+        stream_options: dict | None = None,
         **kwargs,
     ):
         """Chat completion v1.
@@ -130,7 +130,7 @@ def chat_completions_v1(
             max_completion_tokens (int | None): output token nums. Default to None.
             max_tokens (int | None): output token nums. Default to None.
                 Deprecated: Use max_completion_tokens instead.
-            stop (str | List[str] | None): To stop generating further
+            stop (str | list[str] | None): To stop generating further
               tokens. Only accept stop words that's encoded to one token idex.
             repetition_penalty (float): The parameter for repetition penalty.
                 1.0 means no penalty
@@ -148,7 +148,7 @@ def chat_completions_v1(
                 0 and 1. Typical values are in the 0.01-0.2 range, comparably
                 selective as setting `top_p` in the 0.99-0.8 range (use the
                 opposite of normal `top_p` values)
-            logit_bias (Dict): Bias to logits. Only supported in pytorch engine.
+            logit_bias (dict): Bias to logits. Only supported in pytorch engine.
             stream_options: Options for streaming response. Only set this when you
                 set stream: true.
 
@@ -175,23 +175,23 @@ def chat_completions_v1(
     def completions_v1(
         self,
         model: str,
-        prompt: Union[str, List[Any]],
-        suffix: Optional[str] = None,
-        temperature: Optional[float] = 0.7,
-        n: Optional[int] = 1,
-        max_completion_tokens: Optional[int] = 16,
-        max_tokens: Optional[int] = 16,
-        stream: Optional[bool] = False,
-        stop: Optional[Union[str, List[str]]] = None,
-        top_p: Optional[float] = 1.0,
-        top_k: Optional[int] = 40,
-        user: Optional[str] = None,
+        prompt: str | list[Any],
+        suffix: str | None = None,
+        temperature: float | None = 0.7,
+        n: int | None = 1,
+        max_completion_tokens: int | None = 16,
+        max_tokens: int | None = 16,
+        stream: bool | None = False,
+        stop: str | list[str] | None = None,
+        top_p: float | None = 1.0,
+        top_k: int | None = 40,
+        user: str | None = None,
         # additional argument of lmdeploy
-        repetition_penalty: Optional[float] = 1.0,
-        ignore_eos: Optional[bool] = False,
-        skip_special_tokens: Optional[bool] = True,
-        spaces_between_special_tokens: Optional[bool] = True,
-        stream_options: Optional[Dict] = None,
+        repetition_penalty: float | None = 1.0,
+        ignore_eos: bool | None = False,
+        skip_special_tokens: bool | None = True,
+        spaces_between_special_tokens: bool | None = True,
+        stream_options: dict | None = None,
         **kwargs,
     ):
         """Chat completion v1.
@@ -213,7 +213,7 @@ def completions_v1(
             n (int): How many chat completion choices to generate for each
                 input message. Only support one here.
             stream: whether to stream the results or not. Default to false.
-            stop (str | List[str] | None): To stop generating further
+            stop (str | list[str] | None): To stop generating further
               tokens. Only accept stop words that's encoded to one token idex.
             repetition_penalty (float): The parameter for repetition penalty.
                 1.0 means no penalty
diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
index 7b8b2cd9db..6b02fc600f 100644
--- a/lmdeploy/serve/openai/api_server.py
+++ b/lmdeploy/serve/openai/api_server.py
@@ -6,10 +6,11 @@
 import os
 import re
 import time
+from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
 from functools import partial
 from http import HTTPStatus
-from typing import AsyncGenerator, Literal
+from typing import Literal
 
 import uvicorn
 from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request, status
@@ -21,26 +22,58 @@
 from starlette.routing import Mount
 
 from lmdeploy.archs import get_task
-from lmdeploy.messages import (GenerationConfig, LogitsProcessor, PytorchEngineConfig, SpeculativeConfig,
-                               TurbomindEngineConfig)
+from lmdeploy.messages import (
+    GenerationConfig,
+    LogitsProcessor,
+    PytorchEngineConfig,
+    SpeculativeConfig,
+    TurbomindEngineConfig,
+)
 from lmdeploy.metrics.metrics_processor import metrics_processor
 from lmdeploy.model import ChatTemplateConfig
 from lmdeploy.pytorch.disagg.config import DistServeEngineConfig
-from lmdeploy.pytorch.disagg.conn.protocol import (DistServeCacheFreeRequest, DistServeConnectionRequest,
-                                                   DistServeDropConnectionRequest, DistServeInitRequest,
-                                                   MigrationRequest)
+from lmdeploy.pytorch.disagg.conn.protocol import (
+    DistServeCacheFreeRequest,
+    DistServeConnectionRequest,
+    DistServeDropConnectionRequest,
+    DistServeInitRequest,
+    MigrationRequest,
+)
 from lmdeploy.serve.core import AsyncEngine
 from lmdeploy.serve.openai.harmony_utils import GptOssChatParser
-from lmdeploy.serve.openai.protocol import ChatCompletionResponse  # noqa: E501
-from lmdeploy.serve.openai.protocol import (AbortRequest, ChatCompletionRequest, ChatCompletionResponseChoice,
-                                            ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse,
-                                            ChatCompletionTokenLogprob, ChatMessage, ChoiceLogprobs, CompletionRequest,
-                                            CompletionResponse, CompletionResponseChoice,
-                                            CompletionResponseStreamChoice, CompletionStreamResponse, DeltaMessage,
-                                            EmbeddingsRequest, EncodeRequest, EncodeResponse, ErrorResponse,
-                                            GenerateReqInput, GenerateReqMetaOutput, GenerateReqOutput, LogProbs,
-                                            ModelCard, ModelList, ModelPermission, PoolingRequest, PoolingResponse,
-                                            TopLogprob, UpdateParamsRequest, UsageInfo)
+from lmdeploy.serve.openai.protocol import (
+    AbortRequest,
+    ChatCompletionRequest,
+    ChatCompletionResponse,  # noqa: E501
+    ChatCompletionResponseChoice,
+    ChatCompletionResponseStreamChoice,
+    ChatCompletionStreamResponse,
+    ChatCompletionTokenLogprob,
+    ChatMessage,
+    ChoiceLogprobs,
+    CompletionRequest,
+    CompletionResponse,
+    CompletionResponseChoice,
+    CompletionResponseStreamChoice,
+    CompletionStreamResponse,
+    DeltaMessage,
+    EmbeddingsRequest,
+    EncodeRequest,
+    EncodeResponse,
+    ErrorResponse,
+    GenerateReqInput,
+    GenerateReqMetaOutput,
+    GenerateReqOutput,
+    LogProbs,
+    ModelCard,
+    ModelList,
+    ModelPermission,
+    PoolingRequest,
+    PoolingResponse,
+    TopLogprob,
+    UpdateParamsRequest,
+    UsageInfo,
+)
 from lmdeploy.serve.openai.reasoning_parser.reasoning_parser import ReasoningParser, ReasoningParserManager
 from lmdeploy.serve.openai.tool_parser.tool_parser import ToolParser, ToolParserManager
 from lmdeploy.serve.utils.server_utils import validate_json_request
@@ -159,8 +192,8 @@ def _create_completion_logprobs(tokenizer: Tokenizer,
 
     Args:
         tokenizer (Tokenizer): tokenizer.
-        token_ids (List[int]): output token ids.
-        logprobs (List[Dict[int, float]]): the top logprobs for each output
+        token_ids (list[int]): output token ids.
+        logprobs (list[dict[int, float]]): the top logprobs for each output
             position.
         skip_special_tokens (bool): Whether or not to remove special tokens
             in the decoding. Default to be True.
@@ -213,8 +246,8 @@ def _create_chat_completion_logprobs(tokenizer: Tokenizer,
 
     Args:
         tokenizer (Tokenizer): tokenizer.
-        token_ids (List[int]): output token ids.
-        logprobs (List[Dict[int, float]]): the top logprobs for each output
+        token_ids (list[int]): output token ids.
+        logprobs (list[dict[int, float]]): the top logprobs for each output
             position.
     Returns:
         ChoiceLogprobs: logprob result.
@@ -318,7 +351,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
       Deprecated: Use max_completion_tokens instead.
     - **repetition_penalty** (float): The parameter for repetition penalty.
       1.0 means no penalty
-    - **stop** (str | List[str] | None): To stop generating further
+    - **stop** (str | list[str] | None): To stop generating further
       tokens. Only accept stop words that's encoded to one token idex.
     - **response_format** (dict | None): To generate response according to given
       schema. Examples:
@@ -1413,13 +1446,13 @@ def serve(model_path: str,
         server_name (str): host ip for serving
         server_port (int): server port
         tp (int): tensor parallel
-        allow_origins (List[str]): a list of allowed origins for CORS
+        allow_origins (list[str]): a list of allowed origins for CORS
         allow_credentials (bool): whether to allow credentials for CORS
-        allow_methods (List[str]): a list of allowed HTTP methods for CORS
-        allow_headers (List[str]): a list of allowed HTTP headers for CORS
+        allow_methods (list[str]): a list of allowed HTTP methods for CORS
+        allow_headers (list[str]): a list of allowed HTTP headers for CORS
         log_level(str): set log level whose value among [CRITICAL, ERROR,
             WARNING, INFO, DEBUG]
-        api_keys (List[str] | str | None): Optional list of API keys. Accepts
+        api_keys (list[str] | str | None): Optional list of API keys. Accepts
             string type as a single api_key. Default to None, which means no
             api key applied.
         ssl (bool): Enable SSL. Requires OS Environment variables
diff --git a/lmdeploy/serve/openai/harmony_utils.py b/lmdeploy/serve/openai/harmony_utils.py
index ebd28ebb6f..2810725c0f 100644
--- a/lmdeploy/serve/openai/harmony_utils.py
+++ b/lmdeploy/serve/openai/harmony_utils.py
@@ -1,12 +1,17 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 # Modified from https://github.com/vllm-project/vllm/blob/v0.10.2rc1/vllm/entrypoints/harmony_utils.py
-from typing import List
 
 import shortuuid
 from openai_harmony import HarmonyEncodingName, Role, StreamableParser, load_harmony_encoding
 
-from lmdeploy.serve.openai.protocol import (ChatMessage, DeltaFunctionCall, DeltaMessage, DeltaToolCall, FunctionCall,
-                                            ToolCall)
+from lmdeploy.serve.openai.protocol import (
+    ChatMessage,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    FunctionCall,
+    ToolCall,
+)
 
 _harmony_encoding = None
 
@@ -27,7 +32,7 @@ class GptOssChatParser:
     def __init__(self):
         self.parser = get_streamable_parser_for_assistant()
 
-    def parse_streaming(self, tokens: List[int]) -> DeltaMessage:
+    def parse_streaming(self, tokens: list[int]) -> DeltaMessage:
         parser = self.parser
         delta_message = DeltaMessage(role='assistant')
         content = ''
@@ -76,7 +81,7 @@ def parse_streaming(self, tokens: List[int]) -> DeltaMessage:
         delta_message.tool_calls = tool_calls
         return delta_message
 
-    def parse_full(self, tokens: List[int]) -> ChatMessage:
+    def parse_full(self, tokens: list[int]) -> ChatMessage:
         delta_message = self.parse_streaming(tokens)
         tool_calls = []
         for delta_tool_call in delta_message.tool_calls:
diff --git a/lmdeploy/serve/openai/launch_server.py b/lmdeploy/serve/openai/launch_server.py
index 2d2fd56c3f..011180902d 100644
--- a/lmdeploy/serve/openai/launch_server.py
+++ b/lmdeploy/serve/openai/launch_server.py
@@ -6,7 +6,6 @@
 import signal
 import socket
 import sys
-from typing import List, Union
 
 from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig
 from lmdeploy.utils import get_logger
@@ -16,7 +15,7 @@
 logger = get_logger('lmdeploy')
 
 
-def find_available_ports(num: int) -> List[int]:
+def find_available_ports(num: int) -> list[int]:
     """Find available port."""
 
     def __is_port_ok(port: int):
@@ -47,7 +46,7 @@ def get_host_ip():
         return ip
 
 
-def _run_server(gpu_ids: List[int], model_path: str, **kwargs):
+def _run_server(gpu_ids: list[int], model_path: str, **kwargs):
     """Launch a server process."""
     cuda_visible_devices = ','.join([str(_) for _ in gpu_ids])
     os.setpgrp()
@@ -56,7 +55,7 @@ def _run_server(gpu_ids: List[int], model_path: str, **kwargs):
     serve(model_path, **kwargs)
 
 
-def cleanup_processes(processes: List[mp.Process]):
+def cleanup_processes(processes: list[mp.Process]):
     """Clean up server process."""
     for process in processes:
         logger.info(f'Terminating process group {process.pid}')
@@ -83,7 +82,7 @@ def cleanup_processes(processes: List[mp.Process]):
 def launch_server(num_nodes: int,
                   node_rank: int,
                   model_path: str,
-                  backend_config: Union[PytorchEngineConfig, TurbomindEngineConfig],
+                  backend_config: PytorchEngineConfig | TurbomindEngineConfig,
                   proxy_url: str = None,
                   **kwargs):
     """Run multiple server processes in dp mode."""
diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py
index 5f3d252e7b..b6014b6a79 100644
--- a/lmdeploy/serve/openai/protocol.py
+++ b/lmdeploy/serve/openai/protocol.py
@@ -2,7 +2,7 @@
 # Modified from
 # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
 import time
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Literal
 
 import shortuuid
 from pydantic import BaseModel, ConfigDict, Field
@@ -13,7 +13,7 @@ class ErrorResponse(BaseModel):
     message: str
     type: str
     code: int
-    param: Optional[str] = None
+    param: str | None = None
     object: str = 'error'
 
 
@@ -29,7 +29,7 @@ class ModelPermission(BaseModel):
     allow_view: bool = True
     allow_fine_tuning: bool = False
     organization: str = '*'
-    group: Optional[str] = None
+    group: str | None = None
     is_blocking: bool = False
 
 
@@ -39,29 +39,29 @@ class ModelCard(BaseModel):
     object: str = 'model'
     created: int = Field(default_factory=lambda: int(time.time()))
     owned_by: str = 'lmdeploy'
-    root: Optional[str] = None
-    parent: Optional[str] = None
-    permission: List[ModelPermission] = []
+    root: str | None = None
+    parent: str | None = None
+    permission: list[ModelPermission] = []
 
 
 class ModelList(BaseModel):
     """Model list consists of model cards."""
     object: str = 'list'
-    data: List[ModelCard] = []
+    data: list[ModelCard] = []
 
 
 class UsageInfo(BaseModel):
     """Usage information."""
     prompt_tokens: int = 0
     total_tokens: int = 0
-    completion_tokens: Optional[int] = 0
+    completion_tokens: int | None = 0
 
 
 class Function(BaseModel):
     """Function descriptions."""
-    description: Optional[str] = Field(default=None, examples=[None])
+    description: str | None = Field(default=None, examples=[None])
     name: str
-    parameters: Optional[Dict[str, Any]] = None
+    parameters: dict[str, Any] | None = None
 
 
 class Tool(BaseModel):
@@ -83,82 +83,82 @@ class ToolChoice(BaseModel):
 
 class StreamOptions(BaseModel):
     """The stream options."""
-    include_usage: Optional[bool] = False
+    include_usage: bool | None = False
 
 
 class JsonSchema(BaseModel):
     name: str
     # description is not used since it depends on model
-    description: Optional[str] = None
+    description: str | None = None
     # `schema` is a reserved field in Pydantic BaseModel
     # use alias since pydantic does not support the OpenAI key `schema`
-    json_schema: Optional[Dict[str, Any]] = Field(default=None, alias='schema', examples=[None])
+    json_schema: dict[str, Any] | None = Field(default=None, alias='schema', examples=[None])
     # strict is not used
-    strict: Optional[bool] = False
+    strict: bool | None = False
     model_config = ConfigDict(serialize_by_alias=True)
 
 
 class ResponseFormat(BaseModel):
     # regex_schema is extended by lmdeploy to support regex output
     type: Literal['text', 'json_object', 'json_schema', 'regex_schema']
-    json_schema: Optional[JsonSchema] = None
-    regex_schema: Optional[str] = None
+    json_schema: JsonSchema | None = None
+    regex_schema: str | None = None
 
 
 class ChatCompletionRequest(BaseModel):
     """Chat completion request."""
     model: str
 
-    messages: Union[str, List[Dict[str, Any]]] = Field(examples=[[{'role': 'user', 'content': 'hi'}]])
-    temperature: Optional[float] = 0.7
-    top_p: Optional[float] = 1.0
-    tools: Optional[List[Tool]] = Field(default=None, examples=[None])
-    tool_choice: Union[ToolChoice, Literal['auto', 'required', 'none']] = Field(default='auto', examples=['none'])
-    logprobs: Optional[bool] = False
-    top_logprobs: Optional[int] = None
-    n: Optional[int] = 1
-    logit_bias: Optional[Dict[str, float]] = Field(default=None, examples=[None])
-    max_completion_tokens: Optional[int] = Field(
+    messages: str | list[dict[str, Any]] = Field(examples=[[{'role': 'user', 'content': 'hi'}]])
+    temperature: float | None = 0.7
+    top_p: float | None = 1.0
+    tools: list[Tool] | None = Field(default=None, examples=[None])
+    tool_choice: ToolChoice | Literal['auto', 'required', 'none'] = Field(default='auto', examples=['none'])
+    logprobs: bool | None = False
+    top_logprobs: int | None = None
+    n: int | None = 1
+    logit_bias: dict[str, float] | None = Field(default=None, examples=[None])
+    max_completion_tokens: int | None = Field(
         default=None,
         examples=[None],
         description=('An upper bound for the number of tokens that can be generated for a completion, '
                      'including visible output tokens and reasoning tokens'),
     )
-    max_tokens: Optional[int] = Field(
+    max_tokens: int | None = Field(
         default=None,
         examples=[None],
         deprecated='max_tokens is deprecated in favor of the max_completion_tokens field',
     )
-    stop: Optional[Union[str, List[str]]] = Field(default=None, examples=[None])
-
-    stream: Optional[bool] = False
-    stream_options: Optional[StreamOptions] = Field(default=None, examples=[None])
-    presence_penalty: Optional[float] = 0.0
-    frequency_penalty: Optional[float] = 0.0
-    user: Optional[str] = None
-    reasoning_effort: Optional[Literal['low', 'medium', 'high']] = None
-    response_format: Optional[ResponseFormat] = Field(default=None, examples=[None])
+    stop: str | list[str] | None = Field(default=None, examples=[None])
+
+    stream: bool | None = False
+    stream_options: StreamOptions | None = Field(default=None, examples=[None])
+    presence_penalty: float | None = 0.0
+    frequency_penalty: float | None = 0.0
+    user: str | None = None
+    reasoning_effort: Literal['low', 'medium', 'high'] | None = None
+    response_format: ResponseFormat | None = Field(default=None, examples=[None])
     # additional argument of lmdeploy
-    do_preprocess: Optional[bool] = True
-    repetition_penalty: Optional[float] = 1.0
-    session_id: Optional[int] = -1
-    ignore_eos: Optional[bool] = False
-    skip_special_tokens: Optional[bool] = True
-    spaces_between_special_tokens: Optional[bool] = True
-    top_k: Optional[int] = 40
-    seed: Optional[int] = None
-    min_new_tokens: Optional[int] = Field(default=None, examples=[None])
+    do_preprocess: bool | None = True
+    repetition_penalty: float | None = 1.0
+    session_id: int | None = -1
+    ignore_eos: bool | None = False
+    skip_special_tokens: bool | None = True
+    spaces_between_special_tokens: bool | None = True
+    top_k: int | None = 40
+    seed: int | None = None
+    min_new_tokens: int | None = Field(default=None, examples=[None])
     min_p: float = 0.0
-    enable_thinking: Optional[bool] = None  # will be deprecated in the future
-    return_token_ids: Optional[bool] = False
-    include_stop_str_in_output: Optional[bool] = False
+    enable_thinking: bool | None = None  # will be deprecated in the future
+    return_token_ids: bool | None = False
+    include_stop_str_in_output: bool | None = False
     chat_template_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=('Additional keyword args to pass to the template renderer. '
                      'Will be accessible by the chat template.'),
     )
     # kwargs for hf processor
-    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+    mm_processor_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=('Additional kwargs to pass to the HF processor'),
     )
@@ -182,51 +182,51 @@ class ExtractedToolCallInformation(BaseModel):
     # indicate if tools were called
     tools_called: bool
     # extracted tool calls
-    tool_calls: List[ToolCall]
+    tool_calls: list[ToolCall]
     # content - per OpenAI spec, content AND tool calls can be returned rarely
     # But some models will do this intentionally
-    content: Optional[str] = None
+    content: str | None = None
 
 
 class ChatMessage(BaseModel):
     """Chat messages."""
     role: str
-    content: Optional[str] = None
-    gen_tokens: Optional[List[int]] = None
-    reasoning_content: Optional[str] = Field(default=None, examples=[None])
-    tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])
+    content: str | None = None
+    gen_tokens: list[int] | None = None
+    reasoning_content: str | None = Field(default=None, examples=[None])
+    tool_calls: list[ToolCall] | None = Field(default=None, examples=[None])
 
 
 class LogProbs(BaseModel):
-    text_offset: List[int] = Field(default_factory=list)
-    token_logprobs: List[Optional[float]] = Field(default_factory=list)
-    tokens: List[str] = Field(default_factory=list)
-    top_logprobs: Optional[List[Optional[Dict[str, float]]]] = None
+    text_offset: list[int] = Field(default_factory=list)
+    token_logprobs: list[float | None] = Field(default_factory=list)
+    tokens: list[str] = Field(default_factory=list)
+    top_logprobs: list[dict[str, float] | None] | None = None
 
 
 class TopLogprob(BaseModel):
     token: str
-    bytes: Optional[List[int]] = None
+    bytes: list[int] | None = None
     logprob: float
 
 
 class ChatCompletionTokenLogprob(BaseModel):
     token: str
-    bytes: Optional[List[int]] = None
+    bytes: list[int] | None = None
     logprob: float
-    top_logprobs: List[TopLogprob]
+    top_logprobs: list[TopLogprob]
 
 
 class ChoiceLogprobs(BaseModel):
-    content: Optional[List[ChatCompletionTokenLogprob]] = None
+    content: list[ChatCompletionTokenLogprob] | None = None
 
 
 class ChatCompletionResponseChoice(BaseModel):
     """Chat completion response choices."""
     index: int
     message: ChatMessage
-    logprobs: Optional[ChoiceLogprobs] = None
-    finish_reason: Optional[Literal['stop', 'length', 'tool_calls', 'error', 'abort']] = None
+    logprobs: ChoiceLogprobs | None = None
+    finish_reason: Literal['stop', 'length', 'tool_calls', 'error', 'abort'] | None = None
 
 
 class ChatCompletionResponse(BaseModel):
@@ -235,13 +235,13 @@ class ChatCompletionResponse(BaseModel):
     object: str = 'chat.completion'
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
-    choices: List[ChatCompletionResponseChoice]
+    choices: list[ChatCompletionResponseChoice]
     usage: UsageInfo
 
 
 class DeltaFunctionCall(BaseModel):
-    name: Optional[str] = None
-    arguments: Optional[str] = None
+    name: str | None = None
+    arguments: str | None = None
 
 
 # a tool call delta where everything is optional
@@ -249,24 +249,24 @@ class DeltaToolCall(BaseModel):
     id: str = Field(default_factory=lambda: f'chatcmpl-tool-{shortuuid.random()}')
     type: Literal['function'] = 'function'
     index: int
-    function: Optional[DeltaFunctionCall] = None
+    function: DeltaFunctionCall | None = None
 
 
 class DeltaMessage(BaseModel):
     """Delta messages."""
-    role: Optional[str] = None
-    content: Optional[str] = None
-    reasoning_content: Optional[str] = None
-    gen_tokens: Optional[List[int]] = None
-    tool_calls: List[DeltaToolCall] = Field(default_factory=list)
+    role: str | None = None
+    content: str | None = None
+    reasoning_content: str | None = None
+    gen_tokens: list[int] | None = None
+    tool_calls: list[DeltaToolCall] = Field(default_factory=list)
 
 
 class ChatCompletionResponseStreamChoice(BaseModel):
     """Chat completion response stream choice."""
     index: int
     delta: DeltaMessage
-    logprobs: Optional[ChoiceLogprobs] = None
-    finish_reason: Optional[Literal['stop', 'length', 'tool_calls', 'error', 'abort']] = None
+    logprobs: ChoiceLogprobs | None = None
+    finish_reason: Literal['stop', 'length', 'tool_calls', 'error', 'abort'] | None = None
 
 
 class ChatCompletionStreamResponse(BaseModel):
@@ -275,56 +275,56 @@ class ChatCompletionStreamResponse(BaseModel):
     object: str = 'chat.completion.chunk'
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
-    choices: List[ChatCompletionResponseStreamChoice]
-    usage: Optional[UsageInfo] = None
+    choices: list[ChatCompletionResponseStreamChoice]
+    usage: UsageInfo | None = None
 
 
 class CompletionRequest(BaseModel):
     """Completion request."""
     model: str
-    prompt: Union[str, List[Any]]
-    suffix: Optional[str] = None
-    temperature: Optional[float] = 0.7
-    n: Optional[int] = 1
-    logprobs: Optional[int] = None
-    max_completion_tokens: Optional[int] = Field(
+    prompt: str | list[Any]
+    suffix: str | None = None
+    temperature: float | None = 0.7
+    n: int | None = 1
+    logprobs: int | None = None
+    max_completion_tokens: int | None = Field(
         default=None,
         examples=[None],
         description=('An upper bound for the number of tokens that can be generated for a completion, '
                      'including visible output tokens and reasoning tokens'),
     )
-    max_tokens: Optional[int] = Field(
+    max_tokens: int | None = Field(
         default=16,
         examples=[16],
         deprecated='max_tokens is deprecated in favor of the max_completion_tokens field',
     )
-    stop: Optional[Union[str, List[str]]] = Field(default=None, examples=[None])
-    stream: Optional[bool] = False
-    stream_options: Optional[StreamOptions] = Field(default=None, examples=[None])
-    top_p: Optional[float] = 1.0
-    echo: Optional[bool] = False
-    presence_penalty: Optional[float] = 0.0
-    frequency_penalty: Optional[float] = 0.0
-    user: Optional[str] = None
+    stop: str | list[str] | None = Field(default=None, examples=[None])
+    stream: bool | None = False
+    stream_options: StreamOptions | None = Field(default=None, examples=[None])
+    top_p: float | None = 1.0
+    echo: bool | None = False
+    presence_penalty: float | None = 0.0
+    frequency_penalty: float | None = 0.0
+    user: str | None = None
     # additional argument of lmdeploy
-    repetition_penalty: Optional[float] = 1.0
-    session_id: Optional[int] = -1
-    ignore_eos: Optional[bool] = False
-    skip_special_tokens: Optional[bool] = True
-    spaces_between_special_tokens: Optional[bool] = True
-    top_k: Optional[int] = 40  # for opencompass
-    seed: Optional[int] = None
+    repetition_penalty: float | None = 1.0
+    session_id: int | None = -1
+    ignore_eos: bool | None = False
+    skip_special_tokens: bool | None = True
+    spaces_between_special_tokens: bool | None = True
+    top_k: int | None = 40  # for opencompass
+    seed: int | None = None
     min_p: float = 0.0
-    return_token_ids: Optional[bool] = False
+    return_token_ids: bool | None = False
 
 
 class CompletionResponseChoice(BaseModel):
     """Completion response choices."""
     index: int
     text: str
-    logprobs: Optional[LogProbs] = None
-    gen_tokens: Optional[List[int]] = None
-    finish_reason: Optional[Literal['stop', 'length', 'tool_calls', 'error', 'abort']] = None
+    logprobs: LogProbs | None = None
+    gen_tokens: list[int] | None = None
+    finish_reason: Literal['stop', 'length', 'tool_calls', 'error', 'abort'] | None = None
 
 
 class CompletionResponse(BaseModel):
@@ -333,7 +333,7 @@ class CompletionResponse(BaseModel):
     object: str = 'text_completion'
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
-    choices: List[CompletionResponseChoice]
+    choices: list[CompletionResponseChoice]
     usage: UsageInfo
 
 
@@ -341,9 +341,9 @@ class CompletionResponseStreamChoice(BaseModel):
     """Completion response stream choice."""
     index: int
     text: str
-    logprobs: Optional[LogProbs] = None
-    gen_tokens: Optional[List[int]] = None
-    finish_reason: Optional[Literal['stop', 'length', 'tool_calls', 'error', 'abort']] = None
+    logprobs: LogProbs | None = None
+    gen_tokens: list[int] | None = None
+    finish_reason: Literal['stop', 'length', 'tool_calls', 'error', 'abort'] | None = None
 
 
 class CompletionStreamResponse(BaseModel):
@@ -352,21 +352,21 @@ class CompletionStreamResponse(BaseModel):
     object: str = 'text_completion'
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
-    choices: List[CompletionResponseStreamChoice]
-    usage: Optional[UsageInfo] = None
+    choices: list[CompletionResponseStreamChoice]
+    usage: UsageInfo | None = None
 
 
 class EmbeddingsRequest(BaseModel):
     """Embedding request."""
     model: str = None
-    input: Union[str, List[str]]
-    user: Optional[str] = None
+    input: str | list[str]
+    user: str | None = None
 
 
 class EmbeddingsResponse(BaseModel):
     """Embedding response."""
     object: str = 'list'
-    data: List[Dict[str, Any]]
+    data: list[dict[str, Any]]
     model: str
     usage: UsageInfo
 
@@ -381,11 +381,11 @@ class PoolingRequest(BaseModel):
     https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/protocol.py#L1174
     https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/entrypoints/http_server.py#L383
     """
-    model: Optional[str] = None
-    input: Union[List[int], List[List[int]], str, List[str]]
+    model: str | None = None
+    input: list[int] | list[list[int]] | str | list[str]
     encoding_format: Literal['float', 'base64'] = 'float'
-    dimensions: Optional[int] = None
-    user: Optional[str] = None
+    dimensions: int | None = None
+    user: str | None = None
 
 
 class PoolingResponse(BaseModel):
@@ -394,21 +394,21 @@ class PoolingResponse(BaseModel):
     object: str = 'list'
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str = None
-    data: List[Dict[str, Any]]
+    data: list[dict[str, Any]]
     usage: UsageInfo
 
 
 class EncodeRequest(BaseModel):
     """Encode request."""
-    input: Union[str, List[str]]
-    do_preprocess: Optional[bool] = False
-    add_bos: Optional[bool] = True
+    input: str | list[str]
+    do_preprocess: bool | None = False
+    add_bos: bool | None = True
 
 
 class EncodeResponse(BaseModel):
     """Encode response."""
-    input_ids: Union[List[int], List[List[int]]]
-    length: Union[int, List[int]]
+    input_ids: list[int] | list[list[int]]
+    length: int | list[int]
 
 
 class GenerateResponse(BaseModel):
@@ -417,63 +417,63 @@ class GenerateResponse(BaseModel):
     tokens: int
     input_tokens: int
     history_tokens: int
-    finish_reason: Optional[Literal['stop', 'length', 'tool_calls', 'error', 'abort']] = None
+    finish_reason: Literal['stop', 'length', 'tool_calls', 'error', 'abort'] | None = None
 
 
 class UpdateParamsRequest(BaseModel):
     """Update weights request."""
-    serialized_named_tensors: Union[str, List[str], Dict]
-    load_format: Optional[str] = None  # 'flattened_bucket' or None
+    serialized_named_tensors: str | list[str] | dict
+    load_format: str | None = None  # 'flattened_bucket' or None
     finished: bool = False
 
 
 # str for url/base64, base64 should be data:image/jpeg;base64, dict should be {'url': url/base64, 'options': ...}
-ImageDataInputItem = Union[str, Dict]
-ImageDataFormat = Union[ImageDataInputItem, List[ImageDataInputItem]]
+ImageDataInputItem = str | dict
+ImageDataFormat = ImageDataInputItem | list[ImageDataInputItem]
 
 
 # /generate input
 class GenerateReqInput(BaseModel):
-    session_id: Optional[int] = -1
-    prompt: Optional[str] = None
-    input_ids: Optional[List[int]] = None
-    image_data: Optional[ImageDataFormat] = None
-    return_logprob: Optional[bool] = None
+    session_id: int | None = -1
+    prompt: str | None = None
+    input_ids: list[int] | None = None
+    image_data: ImageDataFormat | None = None
+    return_logprob: bool | None = None
     max_tokens: int = 128
-    stop: Optional[Union[str, List[str]]] = None
-    stop_token_ids: Optional[List[int]] = None
-    stream: Optional[bool] = False
+    stop: str | list[str] | None = None
+    stop_token_ids: list[int] | None = None
+    stream: bool | None = False
     temperature: float = 1.0
-    repetition_penalty: Optional[float] = 1.0
-    ignore_eos: Optional[bool] = False
+    repetition_penalty: float | None = 1.0
+    ignore_eos: bool | None = False
     top_p: float = 1.0
     top_k: int = 0
     min_p: float = 0.0
-    skip_special_tokens: Optional[bool] = True
-    spaces_between_special_tokens: Optional[bool] = True
-    include_stop_str_in_output: Optional[bool] = False
-    return_routed_experts: Optional[bool] = False
+    skip_special_tokens: bool | None = True
+    spaces_between_special_tokens: bool | None = True
+    include_stop_str_in_output: bool | None = False
+    return_routed_experts: bool | None = False
     repetition_ngram_size: int = 0
     repetition_ngram_threshold: int = 0
     # kwargs for hf processor
-    mm_processor_kwargs: Optional[dict[str, Any]] = Field(
+    mm_processor_kwargs: dict[str, Any] | None = Field(
         default=None,
         description=('Additional kwargs to pass to the HF processor'),
     )
 
 
 class GenerateReqMetaOutput(BaseModel):
-    prompt_tokens: Optional[int] = None
-    completion_tokens: Optional[int] = None
-    finish_reason: Optional[Dict[str, Any]] = None
-    output_token_logprobs: Optional[List[tuple[float, int]]] = None  # (logprob, token_id)
-    routed_experts: Optional[Union[List[List[List[int]]], str]] = None  # (num_token, num_layer, topk_expert)
+    prompt_tokens: int | None = None
+    completion_tokens: int | None = None
+    finish_reason: dict[str, Any] | None = None
+    output_token_logprobs: list[tuple[float, int]] | None = None  # (logprob, token_id)
+    routed_experts: list[list[list[int]]] | str | None = None  # (num_token, num_layer, topk_expert)
 
 
 # /generate output
 class GenerateReqOutput(BaseModel):
     text: str
-    output_ids: List[int]
+    output_ids: list[int]
     meta_info: GenerateReqMetaOutput
 
 
@@ -481,7 +481,7 @@ class AbortRequest(BaseModel):
     # Whether to abort all requests
     abort_all: bool = False
     # The finished reason data
-    finished_reason: Optional[Dict[str, Any]] = None
-    abort_message: Optional[str] = None
+    finished_reason: dict[str, Any] | None = None
+    abort_message: str | None = None
     # The session ID to abort. If `abort_all` is True, this field is ignored.
-    session_id: Optional[int] = -1
+    session_id: int | None = -1
diff --git a/lmdeploy/serve/openai/reasoning_parser/deepseek_r1_reasoning_parser.py b/lmdeploy/serve/openai/reasoning_parser/deepseek_r1_reasoning_parser.py
index a6b7e3a602..d2392648e4 100644
--- a/lmdeploy/serve/openai/reasoning_parser/deepseek_r1_reasoning_parser.py
+++ b/lmdeploy/serve/openai/reasoning_parser/deepseek_r1_reasoning_parser.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 # modified from https://github.com/vllm-project/vllm/tree/v0.7.3/vllm/entrypoints/openai/reasoning_parsers
 import re
-from typing import Optional, Sequence, Tuple, Union
+from collections.abc import Sequence
 
 from lmdeploy.serve.openai.protocol import ChatCompletionRequest, DeltaMessage
 
@@ -42,7 +42,7 @@ def extract_reasoning_content_streaming(
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         **kwargs,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """Instance method that should be implemented for extracting reasoning
         from an incomplete response; for use when handling reasoning calls and
         streaming.
@@ -105,7 +105,7 @@ def extract_reasoning_content_streaming(
                 return DeltaMessage(reasoning_content=delta_text)
 
     def extract_reasoning_content(self, model_output: str, request: ChatCompletionRequest,
-                                  **kwargs) -> Tuple[Optional[str], Optional[str]]:
+                                  **kwargs) -> tuple[str | None, str | None]:
         """Extract reasoning content from a complete model-generated string.
 
         Used for non-streaming responses where we have the entire model response
diff --git a/lmdeploy/serve/openai/reasoning_parser/qwen_qwq_reasoning_parser.py b/lmdeploy/serve/openai/reasoning_parser/qwen_qwq_reasoning_parser.py
index 3d5b792dc1..63f35d76e6 100644
--- a/lmdeploy/serve/openai/reasoning_parser/qwen_qwq_reasoning_parser.py
+++ b/lmdeploy/serve/openai/reasoning_parser/qwen_qwq_reasoning_parser.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import re
-from typing import Optional, Sequence, Tuple, Union
+from collections.abc import Sequence
 
 from lmdeploy.serve.openai.protocol import ChatCompletionRequest, DeltaMessage
 
@@ -35,7 +35,7 @@ def extract_reasoning_content_streaming(
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         **kwargs,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """Instance method that should be implemented for extracting reasoning
         from an incomplete response; for use when handling reasoning calls and
         streaming.
@@ -95,7 +95,7 @@ def extract_reasoning_content_streaming(
                 return DeltaMessage(reasoning_content=delta_text)
 
     def extract_reasoning_content(self, model_output: str, request: ChatCompletionRequest,
-                                  **kwargs) -> Tuple[Optional[str], Optional[str]]:
+                                  **kwargs) -> tuple[str | None, str | None]:
         """Extract reasoning content from a complete model-generated string.
 
         Used for non-streaming responses where we have the entire model response
diff --git a/lmdeploy/serve/openai/reasoning_parser/reasoning_parser.py b/lmdeploy/serve/openai/reasoning_parser/reasoning_parser.py
index f224dba0a5..7abb62069d 100644
--- a/lmdeploy/serve/openai/reasoning_parser/reasoning_parser.py
+++ b/lmdeploy/serve/openai/reasoning_parser/reasoning_parser.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 # modified from https://github.com/vllm-project/vllm/tree/v0.7.3/vllm/entrypoints/openai/reasoning_parsers
+from collections.abc import Sequence
 from functools import cached_property
-from typing import Dict, Optional, Sequence, Tuple, Union
 
 from mmengine import Registry
 
@@ -16,7 +16,7 @@ def __init__(self, tokenizer: object):
         self.model_tokenizer = tokenizer
 
     @cached_property
-    def vocab(self) -> Dict[str, int]:
+    def vocab(self) -> dict[str, int]:
         # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab
         # whereas all tokenizers have .get_vocab()
         return self.model_tokenizer.get_vocab()
@@ -30,7 +30,7 @@ def extract_reasoning_content_streaming(
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         **kwargs,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """Instance method that should be implemented for extracting reasoning
         from an incomplete response; for use when handling reasoning calls and
         streaming.
@@ -42,7 +42,7 @@ def extract_reasoning_content_streaming(
                                   'has not been implemented!')
 
     def extract_reasoning_content(self, model_output: str, request: ChatCompletionRequest,
-                                  **kwargs) -> Tuple[Optional[str], Optional[str]]:
+                                  **kwargs) -> tuple[str | None, str | None]:
         """Extract reasoning content from a complete model-generated string.
 
         Used for non-streaming responses where we have the entire model response
diff --git a/lmdeploy/serve/openai/tool_parser/internlm2_parser.py b/lmdeploy/serve/openai/tool_parser/internlm2_parser.py
index e104511d76..89e2bb471e 100644
--- a/lmdeploy/serve/openai/tool_parser/internlm2_parser.py
+++ b/lmdeploy/serve/openai/tool_parser/internlm2_parser.py
@@ -1,14 +1,21 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 # modified from https://github.com/vllm-project/vllm/tree/v0.7.3/vllm/entrypoints/openai/tool_parsers
 import json
-from typing import Dict, Sequence, Union
+from collections.abc import Sequence
 
 import partial_json_parser
 import shortuuid
 from partial_json_parser.core.options import Allow
 
-from lmdeploy.serve.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall,
-                                            ExtractedToolCallInformation, FunctionCall, ToolCall)
+from lmdeploy.serve.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from lmdeploy.utils import get_logger
 
 from .tool_parser import ToolParser, ToolParserManager
@@ -48,7 +55,7 @@ def extract_tool_calls_streaming(
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         if '<|action_start|>' not in current_text:
             self.position = len(current_text)
             return DeltaMessage(content=delta_text)
@@ -84,7 +91,7 @@ def extract_tool_calls_streaming(
             # tool calls are generated in an object in inernlm2
             # it's not support parallel tool calls
             try:
-                tool_call_arr: Dict = partial_json_parser.loads(parsable_arr, flags)
+                tool_call_arr: dict = partial_json_parser.loads(parsable_arr, flags)
             except partial_json_parser.core.exceptions.MalformedJSON:
                 logger.debug('not enough tokens to parse into JSON yet')
                 return None
diff --git a/lmdeploy/serve/openai/tool_parser/llama3_parser.py b/lmdeploy/serve/openai/tool_parser/llama3_parser.py
index 1c4eaf35d6..445cad312f 100644
--- a/lmdeploy/serve/openai/tool_parser/llama3_parser.py
+++ b/lmdeploy/serve/openai/tool_parser/llama3_parser.py
@@ -1,14 +1,21 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import json
 import re
-from typing import Dict, List, Sequence, Union
+from collections.abc import Sequence
 
 import partial_json_parser
 import shortuuid
 from partial_json_parser.core.options import Allow
 
-from lmdeploy.serve.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall,
-                                            ExtractedToolCallInformation, FunctionCall, ToolCall)
+from lmdeploy.serve.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from lmdeploy.utils import get_logger
 
 from .tool_parser import ToolParser, ToolParserManager
@@ -30,10 +37,10 @@ def __init__(self, tokenizer: object):
 
         # initialize properties used for state when parsing tool calls in
         # streaming mode
-        self.prev_tool_call_arr: List[Dict] = []
+        self.prev_tool_call_arr: list[dict] = []
         self.current_tool_id: int = -1
         self.current_tool_name_sent: bool = False
-        self.streamed_args_for_tool: List[str] = []  # map what has been streamed for each tool so far to a list
+        self.streamed_args_for_tool: list[str] = []  # map what has been streamed for each tool so far to a list
         self.bot_token = '<|python_tag|>'
         self.bot_token_id = tokenizer.encode(self.bot_token, add_special_tokens=False)[0]
         self.tool_call_regex = re.compile(r'\[{.*?}\]', re.DOTALL)
@@ -48,7 +55,7 @@ def extract_tool_calls(self, model_output: str, request: ChatCompletionRequest)
             name = action.split('<function=')[1].split('>{')[0]
             call_info_list = [(name, parameters)]
 
-            tool_calls: List[ToolCall] = [
+            tool_calls: list[ToolCall] = [
                 ToolCall(type='function', function=FunctionCall(name=name, arguments=arguments))
                 for name, arguments in call_info_list
             ]
@@ -71,7 +78,7 @@ def extract_tool_calls_streaming(
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
 
         if not (current_text.startswith(self.bot_token) or current_text.startswith('{')):
             return DeltaMessage(content=delta_text)
@@ -105,7 +112,7 @@ def extract_tool_calls_streaming(
                 return None
 
             # select as the current tool call the one we're on the state at
-            current_tool_call: Dict = tool_call_arr[self.current_tool_id] \
+            current_tool_call: dict = tool_call_arr[self.current_tool_id] \
                 if len(tool_call_arr) > 0 else {}
 
             # case -- if no tokens have been streamed for the tool, e.g.
diff --git a/lmdeploy/serve/openai/tool_parser/qwen2d5_parser.py b/lmdeploy/serve/openai/tool_parser/qwen2d5_parser.py
index 9cd68b04e4..eb87d1f97a 100644
--- a/lmdeploy/serve/openai/tool_parser/qwen2d5_parser.py
+++ b/lmdeploy/serve/openai/tool_parser/qwen2d5_parser.py
@@ -1,14 +1,21 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import json
 import re
-from typing import Dict, Sequence, Union
+from collections.abc import Sequence
 
 import partial_json_parser
 import shortuuid
 from partial_json_parser.core.options import Allow
 
-from lmdeploy.serve.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall,
-                                            ExtractedToolCallInformation, FunctionCall, ToolCall)
+from lmdeploy.serve.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from lmdeploy.utils import get_logger
 
 from .tool_parser import ToolParser, ToolParserManager
@@ -43,7 +50,7 @@ def extract_tool_calls_streaming(
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         if self.tool_start_token not in current_text:
             self.position = len(current_text)
             return DeltaMessage(content=delta_text)
@@ -79,7 +86,7 @@ def extract_tool_calls_streaming(
             # tool calls are generated in an object in inernlm2
             # it's not support parallel tool calls
             try:
-                tool_call_arr: Dict = partial_json_parser.loads(parsable_arr, flags)
+                tool_call_arr: dict = partial_json_parser.loads(parsable_arr, flags)
             except partial_json_parser.core.exceptions.MalformedJSON:
                 logger.debug('not enough tokens to parse into JSON yet')
                 return None
diff --git a/lmdeploy/serve/openai/tool_parser/qwen3_parser.py b/lmdeploy/serve/openai/tool_parser/qwen3_parser.py
index f1a9635d6c..4b04410461 100644
--- a/lmdeploy/serve/openai/tool_parser/qwen3_parser.py
+++ b/lmdeploy/serve/openai/tool_parser/qwen3_parser.py
@@ -1,13 +1,20 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import json
 import re
+from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Dict, Optional, Sequence, Union
 
 import shortuuid
 
-from lmdeploy.serve.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall,
-                                            ExtractedToolCallInformation, FunctionCall, ToolCall)
+from lmdeploy.serve.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from lmdeploy.utils import get_logger
 
 from .tool_parser import ToolParser, ToolParserManager
@@ -16,7 +23,7 @@
 
 
 @dataclass
-class ParserState(object):
+class ParserState:
     """Maintains the state of parsing during tool call extraction."""
     position: int = 0  # Current position in the text being parsed
     current_index: int = -1  # Index of the current tool call
@@ -77,14 +84,14 @@ def _split(self, parser_state: ParserState, parsing_content: str):
         parser_state.position += (end_idx - start_idx) + len(self.tool_end_token)
         return parsing_content[:start_idx], parsing_content[start_idx + len(self.tool_start_token):end_idx], True
 
-    def _parse_delta_tool_call(self, parser_state: ParserState, tool_content: str) -> Optional[DeltaToolCall]:
+    def _parse_delta_tool_call(self, parser_state: ParserState, tool_content: str) -> DeltaToolCall | None:
         """Parse tool content into a DeltaToolCall object.
 
         This method handles parsing tool calls only when it's a valid tool
         """
         parsable_arr = tool_content.strip()
         try:
-            tool_call_arr: Dict = json.loads(parsable_arr)
+            tool_call_arr: dict = json.loads(parsable_arr)
         except json.JSONDecodeError:
             logger.debug('cannot parse into JSON yet')
             return
@@ -119,7 +126,7 @@ def extract_tool_calls_streaming(
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """Extract tool calls from streaming model output.
 
         This method processes incremental model output to extract tool calls, reasoning content, and regular text
diff --git a/lmdeploy/serve/openai/tool_parser/qwen3coder_parser.py b/lmdeploy/serve/openai/tool_parser/qwen3coder_parser.py
index 24ee53c7a8..fad17871fd 100644
--- a/lmdeploy/serve/openai/tool_parser/qwen3coder_parser.py
+++ b/lmdeploy/serve/openai/tool_parser/qwen3coder_parser.py
@@ -1,13 +1,21 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import json
 import re
+from collections.abc import Sequence
 from dataclasses import dataclass
-from typing import Any, Dict, Optional, Sequence, Tuple, Union
+from typing import Any
 
 import shortuuid
 
-from lmdeploy.serve.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall,
-                                            ExtractedToolCallInformation, FunctionCall, ToolCall)
+from lmdeploy.serve.openai.protocol import (
+    ChatCompletionRequest,
+    DeltaFunctionCall,
+    DeltaMessage,
+    DeltaToolCall,
+    ExtractedToolCallInformation,
+    FunctionCall,
+    ToolCall,
+)
 from lmdeploy.utils import get_logger
 
 from .tool_parser import ToolParser, ToolParserManager
@@ -16,7 +24,7 @@
 
 
 @dataclass
-class ParserState(object):
+class ParserState:
     """Maintains the state of parsing during tool call extraction."""
     position: int = 0  # Current position in the text being parsed
     current_index: int = -1  # Index of the current tool call
@@ -48,7 +56,7 @@ def __init__(self, tokenizer: object):
 
         self.tool_call_pat = re.compile(r'\n*<tool_call>(.*?)</tool_call>', re.DOTALL)
 
-    def _split(self, parser_state: ParserState, parsing_content: str) -> Tuple[str, str, bool]:
+    def _split(self, parser_state: ParserState, parsing_content: str) -> tuple[str, str, bool]:
         """Split content into tuple: (text_content, tool_content, has_tool_end)"""
         try:
             start_idx = parsing_content.index(self.tool_start_token)
@@ -66,7 +74,7 @@ def _split(self, parser_state: ParserState, parsing_content: str) -> Tuple[str,
         parser_state.position += rem + len(self.tool_end_token)
         return parsing_content[:start_idx], parsing_content[start_idx:end_idx + len(self.tool_end_token)], True
 
-    def _extract_params(self, content: str) -> Tuple[Optional[str], Dict[str, Any], bool]:
+    def _extract_params(self, content: str) -> tuple[str | None, dict[str, Any], bool]:
         """Parse XML tool content into components."""
         content = content.replace(self.tool_start_token, '').replace(self.tool_end_token, '').strip()
 
@@ -126,7 +134,7 @@ def extract_tool_calls_streaming(
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
 
         parser_state = getattr(request, '_tool_parser_state', None)
         if parser_state is None:
diff --git a/lmdeploy/serve/openai/tool_parser/tool_parser.py b/lmdeploy/serve/openai/tool_parser/tool_parser.py
index 89ed8091ce..f919d33ef7 100644
--- a/lmdeploy/serve/openai/tool_parser/tool_parser.py
+++ b/lmdeploy/serve/openai/tool_parser/tool_parser.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 # modified from https://github.com/vllm-project/vllm/tree/v0.7.3/vllm/entrypoints/openai/tool_parsers
+from collections.abc import Sequence
 from functools import cached_property
-from typing import Dict, List, Sequence, Union
 
 from mmengine import Registry
 
@@ -19,16 +19,16 @@ class ToolParser:
     """
 
     def __init__(self, tokenizer: object):
-        self.prev_tool_call_arr: List[Dict] = []
+        self.prev_tool_call_arr: list[dict] = []
         # the index of the tool call that is currently being parsed
         self.current_tool_id: int = -1
         self.current_tool_name_sent: bool = False
-        self.streamed_args_for_tool: List[str] = []
+        self.streamed_args_for_tool: list[str] = []
 
         self.model_tokenizer = tokenizer
 
     @cached_property
-    def vocab(self) -> Dict[str, int]:
+    def vocab(self) -> dict[str, int]:
         # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab
         # whereas all tokenizers have .get_vocab()
         return self.model_tokenizer.get_vocab()
@@ -55,7 +55,7 @@ def extract_tool_calls_streaming(
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
         request: ChatCompletionRequest,
-    ) -> Union[DeltaMessage, None]:
+    ) -> DeltaMessage | None:
         """Instance method that should be implemented for extracting tool calls
         from an incomplete response; for use when handling tool calls and
         streaming.
diff --git a/lmdeploy/serve/openai/tool_parser/utils.py b/lmdeploy/serve/openai/tool_parser/utils.py
index a97dc393aa..bee4728d8c 100644
--- a/lmdeploy/serve/openai/tool_parser/utils.py
+++ b/lmdeploy/serve/openai/tool_parser/utils.py
@@ -3,7 +3,7 @@
 
 import json
 from json import JSONDecodeError, JSONDecoder
-from typing import Any, List, Tuple
+from typing import Any
 
 import partial_json_parser
 from partial_json_parser.core.options import Allow
@@ -77,7 +77,7 @@ def extract_intermediate_diff(curr: str, old: str) -> str:
     return diff
 
 
-def find_all_indices(string: str, substring: str) -> List[int]:
+def find_all_indices(string: str, substring: str) -> list[int]:
     """Find all (starting) indices of a substring in a given string.
 
     Useful for tool call extraction
@@ -94,7 +94,7 @@ def find_all_indices(string: str, substring: str) -> List[int]:
 
 # partial_json_parser doesn't support extra data and
 # JSONDecorder.raw_decode doesn't support partial JSON
-def partial_json_loads(input_str: str, flags: Allow) -> Tuple[Any, int]:
+def partial_json_loads(input_str: str, flags: Allow) -> tuple[Any, int]:
     try:
         return (partial_json_parser.loads(input_str, flags), len(input_str))
     except JSONDecodeError as e:
diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py
index 2ea599e687..8505448d1b 100644
--- a/lmdeploy/serve/processors/multimodal.py
+++ b/lmdeploy/serve/processors/multimodal.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import asyncio
-from typing import Any, Dict, List, Literal, Tuple
+from typing import Any, Literal
 
 import PIL
 
@@ -34,7 +34,7 @@ def __init__(self,
         self.backend = backend
 
     @staticmethod
-    def merge_message_content(msg: Dict) -> Dict:
+    def merge_message_content(msg: dict) -> dict:
         """Merge multimodal content blocks and ensure content field exists.
 
         This function normalizes message content to match vLLM's behavior:
@@ -85,14 +85,14 @@ def merge_message_content(msg: Dict) -> Dict:
         return result
 
     @staticmethod
-    async def async_convert_multimodal_data(messages: List[Dict]) -> List[Dict]:
+    async def async_convert_multimodal_data(messages: list[dict]) -> list[dict]:
         """Convert user-input multimodal data into GPT4V message format."""
         from lmdeploy.vl.time_series_utils import load_time_series
         from lmdeploy.vl.utils import load_image
 
-        if isinstance(messages, Dict):
+        if isinstance(messages, dict):
             messages = [messages]
-        assert isinstance(messages, List)
+        assert isinstance(messages, list)
 
         out_messages = [None] * len(messages)
 
@@ -108,7 +108,7 @@ def _inner_call(i, in_messages, out_messages):
                 return
             # the role is a user and the content is a list, in which there
             # might be image_url or image_data
-            assert isinstance(content, List)
+            assert isinstance(content, list)
             message = dict(role=role, content=[])
             for item in content:
                 # image url or base64-encoded image data
@@ -205,14 +205,14 @@ def _inner_call(i, in_messages, out_messages):
         return out_messages
 
     async def get_prompt_input(self,
-                               prompt: str | List[Dict],
+                               prompt: str | list[dict],
                                do_preprocess: bool,
                                sequence_start: bool,
                                adapter_name: str,
-                               tools: List[object] | None = None,
+                               tools: list[object] | None = None,
                                reasoning_effort: Literal['low', 'medium', 'high'] | None = None,
-                               chat_template_kwargs: Dict | None = None,
-                               mm_processor_kwargs: Dict[str, Any] | None = None,
+                               chat_template_kwargs: dict | None = None,
+                               mm_processor_kwargs: dict[str, Any] | None = None,
                                **kwargs):
         """Process prompt and return prompt string and input_ids.
 
@@ -231,7 +231,7 @@ async def get_prompt_input(self,
             **kwargs: Additional keyword arguments.
 
         Returns:
-            Dict with 'prompt' (str) and 'input_ids' (List[int]) keys for text-only,
+            dict with 'prompt' (str) and 'input_ids' (list[int]) keys for text-only,
             or dict with multimodal data for multimodal prompts.
         """
         # Handle string input
@@ -274,7 +274,7 @@ async def get_prompt_input(self,
             raise RuntimeError(f'unsupported prompt type: {type(prompt)}')
 
     @staticmethod
-    def format_prompts(prompts: Any) -> List[Dict]:
+    def format_prompts(prompts: Any) -> list[dict]:
         """Format prompts."""
         if not isinstance(prompts, list):
             prompts = [prompts]
@@ -318,7 +318,7 @@ def _is_image_list(obj) -> bool:
         return isinstance(obj, list) and all(MultimodalProcessor._is_image(img) for img in obj)
 
     @staticmethod
-    def _re_format_prompt_images_pair(prompt: Tuple) -> Dict:
+    def _re_format_prompt_images_pair(prompt: tuple) -> dict:
         """Reformat the prompt to openai message format."""
         from lmdeploy.vl.utils import load_image
 
@@ -350,7 +350,7 @@ def _re_format_prompt_images_pair(prompt: Tuple) -> Dict:
             messages['content'].append({'type': 'text', 'text': prompt})
         return messages
 
-    def _has_multimodal_input(self, messages: List[Dict]) -> bool:
+    def _has_multimodal_input(self, messages: list[dict]) -> bool:
         """Check if messages contain multimodal input (images)."""
         multimodal_types = ['image_url', 'image_data', 'time_series_url']
         return any(
@@ -358,13 +358,13 @@ def _has_multimodal_input(self, messages: List[Dict]) -> bool:
                 item.get('type') in multimodal_types for item in message['content']) for message in messages)
 
     async def _get_text_prompt_input(self,
-                                     prompt: str | List[Dict],
+                                     prompt: str | list[dict],
                                      do_preprocess: bool,
                                      sequence_start: bool,
                                      adapter_name: str,
-                                     tools: List[object] | None = None,
+                                     tools: list[object] | None = None,
                                      reasoning_effort: Literal['low', 'medium', 'high'] | None = None,
-                                     chat_template_kwargs: Dict | None = None,
+                                     chat_template_kwargs: dict | None = None,
                                      **kwargs):
         """Process text-only prompt and return prompt string and input_ids."""
         # Change multimodal data to openai text messages
@@ -391,13 +391,13 @@ async def _get_text_prompt_input(self,
         return {'prompt': prompt, 'input_ids': input_ids}
 
     async def _get_multimodal_prompt_input(self,
-                                           messages: List[Dict],
+                                           messages: list[dict],
                                            do_preprocess: bool,
                                            sequence_start: bool,
                                            adapter_name: str,
-                                           tools: List[object] | None = None,
-                                           chat_template_kwargs: Dict | None = None,
-                                           mm_processor_kwargs: Dict[str, Any] | None = None,
+                                           tools: list[object] | None = None,
+                                           chat_template_kwargs: dict | None = None,
+                                           mm_processor_kwargs: dict[str, Any] | None = None,
                                            **kwargs):
         """Process multimodal prompt and return processed data for inference
         engines."""
diff --git a/lmdeploy/serve/proxy/proxy.py b/lmdeploy/serve/proxy/proxy.py
index b47a0cd6c2..667886273e 100644
--- a/lmdeploy/serve/proxy/proxy.py
+++ b/lmdeploy/serve/proxy/proxy.py
@@ -10,7 +10,7 @@
 import time
 from collections import deque
 from http import HTTPStatus
-from typing import Deque, Literal
+from typing import Literal
 
 import aiohttp
 import numpy as np
@@ -26,8 +26,13 @@
 from lmdeploy.pytorch.disagg.conn.proxy_conn import PDConnectionPool
 from lmdeploy.pytorch.disagg.messages import PDConnectionMessage
 from lmdeploy.serve.openai.api_server import create_error_response
-from lmdeploy.serve.openai.protocol import ModelCard  # noqa: E501
-from lmdeploy.serve.openai.protocol import ChatCompletionRequest, CompletionRequest, ModelList, ModelPermission
+from lmdeploy.serve.openai.protocol import (
+    ChatCompletionRequest,
+    CompletionRequest,
+    ModelCard,  # noqa: E501
+    ModelList,
+    ModelPermission,
+)
 from lmdeploy.serve.proxy.utils import AIOHTTP_TIMEOUT, LATENCY_DEQUE_LEN, ErrorCodes, RoutingStrategy, err_msg
 from lmdeploy.serve.utils.server_utils import validate_json_request
 from lmdeploy.utils import get_logger
@@ -43,7 +48,7 @@ class Status(BaseModel):
     role: EngineRole = EngineRole.Hybrid
     models: list[str] = Field(default=[], examples=[[]])
     unfinished: int = 0
-    latency: Deque = Field(default=deque(maxlen=LATENCY_DEQUE_LEN), examples=[[]])
+    latency: deque = Field(default=deque(maxlen=LATENCY_DEQUE_LEN), examples=[[]])
     speed: int | None = Field(default=None, examples=[None])
 
 
@@ -96,7 +101,7 @@ def __init__(self,
         if config_path is not None:
             self.config_path = config_path
         if osp.exists(self.config_path) and self.cache_status:
-            with open(self.config_path, 'r') as config_file:
+            with open(self.config_path) as config_file:
                 if os.path.getsize(self.config_path) > 0:
                     logger.info(f'loading node configuration: {self.config_path}')
                     config = json.load(config_file)
@@ -150,7 +155,7 @@ def add(self, node_url: str, status: Status | None = None):
         Args:
             node_url (str): A http url. Can be the url generated by
                 `lmdeploy serve api_server`.
-            description (Dict): The description of the node. An example:
+            description (dict): The description of the node. An example:
                 {'http://0.0.0.0:23333': {models: ['internlm-chat-7b]},
                 speed: -1}. The speed here can be RPM or other metric. All the
                 values of nodes should be the same metric.
@@ -345,7 +350,7 @@ async def stream_generate(self, request: dict, node_url: str, endpoint: str):
         """Return a generator to handle the input request.
 
         Args:
-            request (Dict): the input request.
+            request (dict): the input request.
             node_url (str): the node url.
             endpoint (str): the endpoint. Such as `/v1/chat/completions`.
         """
@@ -364,7 +369,7 @@ async def generate(self, request: dict, node_url: str, endpoint: str):
         """Return a the response of the input request.
 
         Args:
-            request (Dict): the input request.
+            request (dict): the input request.
             node_url (str): the node url.
             endpoint (str): the endpoint. Such as `/v1/chat/completions`.
         """
@@ -490,7 +495,7 @@ def add_node(node: Node, raw_request: Request = None):
 
     - **url** (str): A http url. Can be the url generated by
       `lmdeploy serve api_server`.
-    - **status** (Dict): The description of the node. An example:
+    - **status** (dict): The description of the node. An example:
       ``{models: ['internlm-chat-7b],  speed: 1}``. The speed here can be
       RPM or other metric. All the values of nodes should be the same metric.
     """
@@ -589,9 +594,9 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
       Deprecated: Use max_completion_tokens instead.
     - **repetition_penalty** (float): The parameter for repetition penalty.
       1.0 means no penalty
-    - **stop** (str | List[str] | None): To stop generating further
+    - **stop** (str | list[str] | None): To stop generating further
       tokens. Only accept stop words that's encoded to one token idex.
-    - **response_format** (Dict | None): To generate response according to given
+    - **response_format** (dict | None): To generate response according to given
       schema. Examples:
 
       .. code-block:: json
@@ -612,8 +617,8 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
 
       or
       ``{"type": "regex_schema", "regex_schema": "call me [A-Za-z]{1,10}"}``
-    - **logit_bias** (Dict): Bias to logits. Only supported in pytorch engine.
-    - **tools** (List): A list of tools the model may call. Currently, only
+    - **logit_bias** (dict): Bias to logits. Only supported in pytorch engine.
+    - **tools** (list): A list of tools the model may call. Currently, only
       internlm2 functions are supported as a tool. Use this to specify a
       list of functions for which the model can generate JSON inputs.
     - **tool_choice** (str | object): Controls which (if any) tool is called by
@@ -758,7 +763,7 @@ async def completions_v1(request: CompletionRequest, raw_request: Request = None
     - **repetition_penalty** (float): The parameter for repetition penalty.
       1.0 means no penalty
     - **user** (str): A unique identifier representing your end-user.
-    - **stop** (str | List[str] | None): To stop generating further
+    - **stop** (str | list[str] | None): To stop generating further
       tokens. Only accept stop words that's encoded to one token idex.
 
     Additional arguments supported by LMDeploy:
@@ -898,7 +903,7 @@ def proxy(server_name: str = '0.0.0.0',
         route_strategy ('random' | 'min_expected_latency' | 'min_observed_latency'):
             the strategy to dispatch requests to nodes. Default to
             'min_expected_latency'
-        api_keys (List[str] | str | None): Optional list of API keys. Accepts string type as
+        api_keys (list[str] | str | None): Optional list of API keys. Accepts string type as
             a single api_key. Default to None, which means no api key applied.
         ssl (bool): Enable SSL. Requires OS Environment variables 'SSL_KEYFILE' and 'SSL_CERTFILE'.
         log_level (str): Set the log level. Default to INFO.
diff --git a/lmdeploy/tokenizer.py b/lmdeploy/tokenizer.py
index 2a4fc407f8..c184e53111 100644
--- a/lmdeploy/tokenizer.py
+++ b/lmdeploy/tokenizer.py
@@ -2,9 +2,9 @@
 import json
 import os.path as osp
 from collections import deque
+from collections.abc import Sequence
 from dataclasses import dataclass
 from functools import partial
-from typing import List, Optional, Sequence, Tuple, Union
 
 from lmdeploy.utils import get_logger
 
@@ -14,24 +14,24 @@
 
 @dataclass
 class DetokenizeState:
-    """A state collection of incrementally detekenization.
+    """A state collection for incremental detokenization.
 
     Args:
-        ids_offset (int): offset to all input ids. In LMDeploy, the output
+        ids_offset: offset to all input ids. In LMDeploy, the output
             ids length is not one by one. It could be random by random.
-        prev_tokens (List[str] | None): for incrementally decoding.
+        prev_tokens: for incrementally decoding.
             Default to None, which means the first round.
-        prefix_offset (int): the start index of tokens to be converted to
+        prefix_offset: the start index of tokens to be converted to
             string (prev + new tokens). Default to 0 for the first round.
-        read_offset (int): the end index of tokens to be converted to
+        read_offset: the end index of tokens to be converted to
             string (prev token). Default to 0 for the first round.
     """
     ids_offset: int = 0
-    prev_tokens: Optional[List[str]] = None
+    prev_tokens: list[str] | None = None
     prefix_offset: int = 0
     read_offset: int = 0
 
-    def as_tuple(self) -> Tuple:
+    def as_tuple(self) -> tuple:
         """Return a tuple of states."""
         return (self.ids_offset, self.prev_tokens, self.prefix_offset, self.read_offset)
 
@@ -40,7 +40,7 @@ class HuggingFaceTokenizer:
     """A wrapper of transformers' AutoTokenizer.
 
     Args:
-        model_dir (str): the directory of the tokenizer model
+        model_dir: the directory of the tokenizer model.
     """
 
     def __init__(self, model_dir: str):
@@ -53,7 +53,7 @@ def __init__(self, model_dir: str):
         if self.model.eos_token_id is None:
             generation_config_file = osp.join(model_dir, 'generation_config.json')
             if osp.exists(generation_config_file):
-                with open(generation_config_file, 'r') as f:
+                with open(generation_config_file) as f:
                     cfg = json.load(f)
                     self.model.eos_token_id = cfg['eos_token_id']
             elif hasattr(self.model, 'eod_id'):  # Qwen remote
@@ -129,7 +129,7 @@ def prefix_space_tokens(self):
             }
         return self._prefix_space_tokens
 
-    def _maybe_add_prefix_space(self, tokens: List[int], decoded: str):
+    def _maybe_add_prefix_space(self, tokens: list[int], decoded: str):
         """Maybe add prefix space for incremental decoding."""
         if len(tokens) and not decoded.startswith(' ') and\
                 tokens[0] in self.prefix_space_tokens:
@@ -193,13 +193,13 @@ def encode(self, s: str, add_bos: bool = True, add_special_tokens: bool = True,
         """Tokenize a prompt.
 
         Args:
-            s (str): a prompt
-            add_bos (bool): Whether to add `bos` token id when encoding
-                the prompt
-            add_special_tokens (bool): Whether or not to add special tokens
-                when encoding the prompt
+            s: a prompt.
+            add_bos: Whether to add ``bos`` token id when encoding the prompt.
+            add_special_tokens: Whether or not to add special tokens
+                when encoding the prompt.
+
         Returns:
-            list[int]: token ids
+            list[int]: token ids.
         """
         encoded = self.model.encode(s, add_special_tokens=add_special_tokens, **kwargs)
         if not add_bos:
@@ -208,17 +208,18 @@ def encode(self, s: str, add_bos: bool = True, add_special_tokens: bool = True,
                 encoded = encoded[1:]
         return encoded
 
-    def decode(self, t: Sequence[int], offset: Optional[int] = None, skip_special_tokens: bool = True):
+    def decode(self, t: Sequence[int], offset: int | None = None, skip_special_tokens: bool = True):
         """De-tokenize.
 
         Args:
-            t (List[int]): a list of token ids
-            offset (int): for incrementally decoding. Default to None, which
+            t: a list of token ids.
+            offset: for incrementally decoding. Default to None, which
                 means not applied.
-            skip_special_tokens (bool): Whether or not to remove special
+            skip_special_tokens: Whether or not to remove special
                 tokens in the decoding.
+
         Returns:
-            str: text of decoding tokens
+            str: text of decoding tokens.
         """
         t = t[offset:]
         out_string = self.model.decode(t, skip_special_tokens=skip_special_tokens)
@@ -232,7 +233,7 @@ def decode(self, t: Sequence[int], offset: Optional[int] = None, skip_special_to
     @staticmethod
     def _convert_tokens_to_string_with_added_encoders(
         tokenizer,
-        output_tokens: List[str],
+        output_tokens: list[str],
         skip_special_tokens: bool,
         spaces_between_special_tokens: bool,
     ) -> str:
@@ -272,18 +273,18 @@ def detokenize_incrementally(self,
         """Incrementally detokenize the input indexes.
 
         Args:
-            all_input_ids (List[int]): a list of token ids. Expected to be
+            all_input_ids: a list of token ids. Expected to be
                 different sections of a long sequence.
-            state (DetokenizeState): an instance of DetokenizeState. Consists
+            state: an instance of DetokenizeState. Consists
                 of incrementally decoding states.
-            skip_special_tokens (bool): Whether or not to remove special tokens
+            skip_special_tokens: Whether or not to remove special tokens
                 in the decoding. Default to be True.
-            spaces_between_special_tokens (bool): Whether or not to add spaces
+            spaces_between_special_tokens: Whether or not to add spaces
                 between special tokens. Default to be True.
+
         Returns:
-            str: decoding output string of the current round.
-            state (DetokenizeState): an instance of DetokenizeState. Consists
-                of incrementally decoding states.
+            tuple[str, DetokenizeState]: decoding output string of the current
+                round and the updated DetokenizeState.
         """
         tokenizer = self.model
         ids_offset, prev_tokens, prefix_offset, read_offset = state.as_tuple()
@@ -335,13 +336,14 @@ def detokenize_incrementally(self,
 
         return new_text, DetokenizeState(len(all_input_ids), prev_tokens, prefix_offset, read_offset)
 
-    def __call__(self, s: Union[str, Sequence[str]]):
+    def __call__(self, s: str | Sequence[str]):
         """Tokenize prompts.
 
         Args:
-            s (str): prompts
+            s: prompts.
+
         Returns:
-            list[int]: token ids
+            list[int]: token ids.
         """
         add_special_tokens = False
         return self.model(s, add_special_tokens=add_special_tokens)
@@ -351,7 +353,7 @@ class ChatGLM4Tokenizer(HuggingFaceTokenizer):
     """Tokenizer of GLM4."""
 
     def __init__(self, model_path):
-        super(ChatGLM4Tokenizer, self).__init__(model_path)
+        super().__init__(model_path)
         original_pad = self.model._pad
 
         def __pad(*args, **kwargs):
@@ -366,14 +368,14 @@ def encode(self, s: str, add_bos: bool = True, add_special_tokens: bool = True,
         """Tokenize a prompt."""
         # ChtGLM4Tokenizer hardcode `add_speical_tokens=False` when tokenizing
         # a prompt. Refer to https://huggingface.co/THUDM/glm-4-9b-chat/blob/main/tokenization_chatglm.py#L227 # noqa E501
-        return super(ChatGLM4Tokenizer, self).encode(s, add_bos, add_special_tokens=False, **kwargs)
+        return super().encode(s, add_bos, add_special_tokens=False, **kwargs)
 
 
 class ChatGLMTokenizer(HuggingFaceTokenizer):
     """Tokenizer of GLM2."""
 
     def __init__(self, model_path):
-        super(ChatGLMTokenizer, self).__init__(model_path)
+        super().__init__(model_path)
         original_pad = self.model._pad
 
         def __pad(*args, **kwargs):
@@ -389,7 +391,7 @@ class GptOssTokenizer(HuggingFaceTokenizer):
     """Tokenizer of GPT-OSS."""
 
     def __init__(self, model_dir: str):
-        super(GptOssTokenizer, self).__init__(model_dir)
+        super().__init__(model_dir)
         from openai_harmony import HarmonyEncodingName, Role, StreamableParser, load_harmony_encoding
         encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
         self.role = Role.ASSISTANT
@@ -418,7 +420,7 @@ class Tokenizer:
     """Tokenize prompts or de-tokenize tokens into texts.
 
     Args:
-        model_path (str): the path of the tokenizer model
+        model_path: the path of the tokenizer model.
     """
 
     def __init__(self, model_path: str):
@@ -464,13 +466,13 @@ def encode(self, s: str, add_bos: bool = True, add_special_tokens: bool = True,
         """Tokenize a prompt.
 
         Args:
-            s (str): a prompt
-            add_bos (bool): Whether to add `bos` token id when encoding
-                the prompt
-            add_special_tokens (bool): Whether or not to add special tokens
-                when encoding the prompt
+            s: a prompt.
+            add_bos: Whether to add ``bos`` token id when encoding the prompt.
+            add_special_tokens: Whether or not to add special tokens
+                when encoding the prompt.
+
         Returns:
-            list[int]: token ids
+            list[int]: token ids.
         """
         encoded = self.model.encode(s, add_bos, add_special_tokens, **kwargs)
         if encoded[:2] == [self.bos_token_id] * 2:
@@ -483,19 +485,20 @@ def encode(self, s: str, add_bos: bool = True, add_special_tokens: bool = True,
     def decode(
         self,
         t: Sequence[int],
-        offset: Optional[int] = None,
+        offset: int | None = None,
         skip_special_tokens: bool = True,
     ):
         """De-tokenize.
 
         Args:
-            t (List[int]): a list of token ids
-            offset (int): for incrementally decoding. Default to None, which
+            t: a list of token ids.
+            offset: for incremental decoding. Defaults to None, which
                 means not applied.
-            skip_special_tokens (bool): Whether or not to remove special
+            skip_special_tokens: Whether or not to remove special
                 tokens in the decoding.
+
         Returns:
-            str: text of decoding tokens
+            str: text of decoding tokens.
         """
         return self.model.decode(t, offset, skip_special_tokens)
 
@@ -507,31 +510,32 @@ def detokenize_incrementally(self,
         """Incrementally detokenize the input indexes.
 
         Args:
-            all_input_ids (List[int]): a list of token ids. Expected to be
+            all_input_ids: a list of token ids. Expected to be
                 different sections of a long sequence.
-            state (DetokenizeState): an instance of DetokenizeState. Consists
+            state: an instance of DetokenizeState. Consists
                 of incrementally decoding states.
-            skip_special_tokens (bool): Whether or not to remove special tokens
+            skip_special_tokens: Whether or not to remove special tokens
                 in the decoding. Default to be True.
-            spaces_between_special_tokens (bool): Whether or not to add spaces
+            spaces_between_special_tokens: Whether or not to add spaces
                 between special tokens. Default to be True.
+
         Returns:
-            str: decoding output string of the current round.
-            state (DetokenizeState): an instance of DetokenizeState. Consists
-                of incrementally decoding states.
+            tuple[str, DetokenizeState]: decoding output string of the current
+                round and the updated DetokenizeState.
         """
         return self.model.detokenize_incrementally(all_input_ids,
                                                    state=state,
                                                    skip_special_tokens=skip_special_tokens,
                                                    spaces_between_special_tokens=spaces_between_special_tokens)
 
-    def __call__(self, s: Union[str, Sequence[str]]):
+    def __call__(self, s: str | Sequence[str]):
         """Tokenize prompts.
 
         Args:
-            s (str): prompts
+            s: prompts.
+
         Returns:
-            list[int]: token ids
+            list[int]: token ids.
         """
         return self.model(s)
 
diff --git a/lmdeploy/turbomind/__init__.py b/lmdeploy/turbomind/__init__.py
index 318f15dfc2..177274aff9 100644
--- a/lmdeploy/turbomind/__init__.py
+++ b/lmdeploy/turbomind/__init__.py
@@ -3,20 +3,18 @@
 
 def bootstrap():
     import os
-    import sys
 
     has_turbomind = False
     pwd = os.path.dirname(__file__)
     if os.path.exists(os.path.join(pwd, '..', 'lib')):
         has_turbomind = True
     if os.name == 'nt' and has_turbomind:
-        if sys.version_info[:2] >= (3, 8):
-            CUDA_PATH = os.getenv('CUDA_PATH')
-            assert CUDA_PATH is not None, 'Can not find $env:CUDA_PATH'
-            dll_path = os.path.join(CUDA_PATH, 'bin')
-            print(f'Add dll path {dll_path}, please note cuda version '
-                  'should >= 11.3 when compiled with cuda 11')
-            os.add_dll_directory(dll_path)
+        CUDA_PATH = os.getenv('CUDA_PATH')
+        assert CUDA_PATH is not None, 'Can not find $env:CUDA_PATH'
+        dll_path = os.path.join(CUDA_PATH, 'bin')
+        print(f'Add dll path {dll_path}, please note cuda version '
+              'should >= 11.3 when compiled with cuda 11')
+        os.add_dll_directory(dll_path)
 
 
 bootstrap()
diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py
index c243554d7d..8fdb95ac78 100644
--- a/lmdeploy/turbomind/deploy/config.py
+++ b/lmdeploy/turbomind/deploy/config.py
@@ -2,7 +2,6 @@
 import inspect
 import json
 from dataclasses import asdict, field, fields
-from typing import List
 
 # use pydantic.dataclasses.dataclass to check data type
 from pydantic.dataclasses import dataclass
@@ -55,11 +54,11 @@ class ModelConfig:
     # of token_embedding
     embedding_size: int = 0
     num_layer: int = None
-    inter_size: List[int] = None
+    inter_size: list[int] = None
     norm_eps: float = None
     attn_bias: int = 0
     mlp_bias: bool = False
-    window_size: List[int] = field(default_factory=list)
+    window_size: list[int] = field(default_factory=list)
     attn_sink: bool = False
     qk_norm: bool = False
     size_per_head: int = 128
@@ -73,7 +72,7 @@ class ModelConfig:
     attn_cp_size: int = 1
     mlp_tp_size: int = 1
     model_format: str = 'hf'
-    expert_num: List[int] = ()
+    expert_num: list[int] = field(default_factory=list)
     expert_router_bias: bool = False
     expert_inter_size: int = 0
     experts_per_token: int = 0
@@ -92,7 +91,7 @@ class ModelConfig:
     qk_rope_dim: int = 0
     v_head_dim: int = 0
     # Qwen 3.5
-    layer_types: List[str] = field(default_factory=list)
+    layer_types: list[str] = field(default_factory=list)
     linear_key_head_dim: int = 0
     linear_value_head_dim: int = 0
     linear_conv_kernel_dim: int = 0
@@ -102,7 +101,7 @@ class ModelConfig:
     # Per-layer expert weight type override: layer indices whose
     # MoE experts are unquantized (fp16) despite expert_weight_type=int4.
     # Populated from modules_to_not_convert patterns like 'model.layers.0.'.
-    unquantized_expert_layers: List[int] = field(default_factory=list)
+    unquantized_expert_layers: list[int] = field(default_factory=list)
     # tuning
     tune_layer_num: int = 1
 
@@ -127,7 +126,7 @@ class RopeParam:
     low_freq_factor: float = None
     high_freq_factor: float = None
     original_max_position_embeddings: int = None
-    mrope_section: List[int] = None
+    mrope_section: list[int] = None
 
 
 @dataclass
diff --git a/lmdeploy/turbomind/deploy/loader.py b/lmdeploy/turbomind/deploy/loader.py
index 002c938e27..2475a8a928 100644
--- a/lmdeploy/turbomind/deploy/loader.py
+++ b/lmdeploy/turbomind/deploy/loader.py
@@ -4,10 +4,10 @@
 import re
 from abc import ABC, abstractmethod
 from collections import defaultdict
+from collections.abc import Iterator
 from functools import partial
 from glob import glob
 from queue import Queue
-from typing import Iterator, Tuple, Union
 
 import torch
 from safetensors import safe_open
@@ -29,12 +29,12 @@ def __init__(self, model_path: str, pattern, mappings: list):
         self.item_count = defaultdict(int)
         self.mappings = mappings
 
-    def get_index(self, index_name: str, file_pattern: str) -> Tuple[dict, list]:
+    def get_index(self, index_name: str, file_pattern: str) -> tuple[dict, list]:
         """Get shards and weight map (if possible) for the model."""
         get_path = partial(osp.join, self.model_path)
         shards = []
         if index_name:
-            with open(get_path(index_name), 'r') as f:
+            with open(get_path(index_name)) as f:
                 index = json.load(f)
             index = index['weight_map']
             shards = list(map(get_path, set(index.values())))
@@ -55,7 +55,7 @@ def map_key(self, key: str):
             return key
 
     @abstractmethod
-    def items(self) -> Iterator[Tuple[int, dict]]:
+    def items(self) -> Iterator[tuple[int, dict]]:
         pass
 
 
@@ -174,7 +174,7 @@ def items(self):
             self.que.task_done()
 
 
-def create_loader(model_path: Union[str, Queue], pattern: str, mappings: list) -> BaseLoader:
+def create_loader(model_path: str | Queue, pattern: str, mappings: list) -> BaseLoader:
     args = (model_path, pattern, mappings)
 
     if isinstance(model_path, Queue):
diff --git a/lmdeploy/turbomind/deploy/policy.py b/lmdeploy/turbomind/deploy/policy.py
index 4082df36d3..0e4c061c0d 100644
--- a/lmdeploy/turbomind/deploy/policy.py
+++ b/lmdeploy/turbomind/deploy/policy.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import List
 
 import torch.cuda
 
@@ -8,7 +7,7 @@ def to_cuda(x: torch.Tensor, *args):
     return x.cuda()
 
 
-def get_u4_slices(x: torch.Tensor, dtype: torch.dtype) -> List[torch.Tensor]:
+def get_u4_slices(x: torch.Tensor, dtype: torch.dtype) -> list[torch.Tensor]:
     MAP = {torch.int32: 8, torch.uint8: 2}
     xs = []
     for _ in range(MAP[x.dtype]):
diff --git a/lmdeploy/turbomind/deploy/source_model/base.py b/lmdeploy/turbomind/deploy/source_model/base.py
index 1de0e54b76..9bc6ca3bbc 100644
--- a/lmdeploy/turbomind/deploy/source_model/base.py
+++ b/lmdeploy/turbomind/deploy/source_model/base.py
@@ -1,6 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from abc import ABC, abstractmethod
-from typing import Dict, Iterator, Union
+from collections.abc import Iterator
 
 import torch
 from mmengine import Registry
@@ -14,7 +14,7 @@ class BaseReader(ABC):
     def __init__(self):
         pass
 
-    def transform(self, x: Union[torch.Tensor, None], kind: str) -> Union[torch.Tensor, None]:
+    def transform(self, x: torch.Tensor | None, kind: str) -> torch.Tensor | None:
         return None if x is None else self._transform(x, kind)
 
     @abstractmethod
@@ -37,7 +37,7 @@ def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
         self.tokenizer_path = tokenizer_path
 
     @abstractmethod
-    def model_info(self) -> Dict:
+    def model_info(self) -> dict:
         """Read model info."""
         pass
 
diff --git a/lmdeploy/turbomind/tokenizer_info.py b/lmdeploy/turbomind/tokenizer_info.py
index 7b107f3904..d03eca023b 100644
--- a/lmdeploy/turbomind/tokenizer_info.py
+++ b/lmdeploy/turbomind/tokenizer_info.py
@@ -6,7 +6,6 @@
 import json
 import logging
 from enum import Enum
-from typing import List, Optional, Union
 
 import _xgrammar as _xgr  # noqa: E402
 
@@ -71,27 +70,27 @@ class TokenizerInfo(_xgr.TokenizerInfo):
 
     def __init__(
         self,
-        encoded_vocab: Union[List[bytes], List[str]],
+        encoded_vocab: list[bytes] | list[str],
         vocab_type: VocabType = VocabType.RAW,
         *,
-        vocab_size: Optional[int] = None,
-        stop_token_ids: Optional[Union[List[int], int]] = None,
+        vocab_size: int | None = None,
+        stop_token_ids: list[int] | int | None = None,
         add_prefix_space: bool = False,
     ) -> None:
         """Construct the tokenizer info.
 
         Parameters
         ----------
-        encoded_vocab : Union[List[bytes], List[str]]
+        encoded_vocab : list[bytes] | list[str]
             The encoded vocabulary of the tokenizer.
 
         vocab_type : VocabType, default: VocabType.RAW
             The type of the vocabulary. See also VocabType.
 
-        vocab_size : Optional[int], default: None
+        vocab_size : int | None, default: None
             The size of the vocabulary. If not provided, the vocabulary size will be len(encoded_vocab).
 
-        stop_token_ids : Optional[List[int]], default: None
+        stop_token_ids : list[int] | int | None, default: None
             The stop token ids. If not provided, the stop token ids will be auto detected (but may not
             be correct).
 
@@ -134,8 +133,8 @@ def _is_sentencepiece_tokenizer(tokenizer: PreTrainedTokenizerBase) -> bool:
     def from_huggingface(
         tokenizer: PreTrainedTokenizerBase,
         *,
-        vocab_size: Optional[int] = None,
-        stop_token_ids: Optional[Union[List[int], int]] = None,
+        vocab_size: int | None = None,
+        stop_token_ids: list[int] | int | None = None,
     ) -> 'TokenizerInfo':
         """Construct the tokenizer info from the huggingface tokenizer. This
         constructor supports various tokenizer backends, including the
@@ -154,7 +153,7 @@ def from_huggingface(
         tokenizer : PreTrainedTokenizerBase
             The huggingface tokenizer.
 
-        vocab_size : Optional[int], default: None
+        vocab_size : int | None, default: None
             The vocabulary size **defined by the model** (**not the tokenizer**). This equals to the
             vocab dimension of the model's lm_head. This is the size of the token mask.
 
@@ -172,7 +171,7 @@ def from_huggingface(
             model_vocab_size need to be provided for case 2 and 3. If not provided, it will be
             set to the tokenizer's vocabulary size.
 
-        stop_token_ids : Optional[List[int]], default: None
+        stop_token_ids : list[int] | int | None, default: None
             The stop token ids. If not provided, the eos_token_id of the tokenizer will be used.
 
         Returns
diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
index a4a37dc529..f95b2b93ca 100644
--- a/lmdeploy/turbomind/turbomind.py
+++ b/lmdeploy/turbomind/turbomind.py
@@ -13,7 +13,7 @@
 from functools import partial
 from multiprocessing.reduction import ForkingPickler
 from queue import Queue
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 import pybase64
 import torch
@@ -41,7 +41,7 @@
 MAX_LOGPROBS = 1024
 
 
-def _construct_stop_or_bad_words(words: List[int] = None):
+def _construct_stop_or_bad_words(words: list[int] = None):
     if words is None or len(words) == 0:
         return None
     offsets = list(range(1, len(words) + 1))
@@ -291,7 +291,7 @@ def sleep(self, level: int = 1):
             for _ in e.map(self.model_comm.sleep, range(self.gpu_count), [level] * self.gpu_count):
                 pass
 
-    def wakeup(self, tags: Optional[list[str]] = None):
+    def wakeup(self, tags: list[str] | None = None):
         """Wakeup the model."""
         if tags is None:
             tags = ['weights', 'kv_cache']
@@ -311,7 +311,7 @@ def update_params(self, request: UpdateParamsRequest):
         def _construct(item):
             """ Deserialize torch.Tensor
             Args:
-                item (Tuple[Callable, Tuple]): the return of reduce_tensor
+                item (tuple[Callable, tuple]): the return of reduce_tensor
             """
             func, args = item
             args = list(args)
@@ -424,7 +424,7 @@ def _func(out: EngineOutput, step: int, **kwargs):
 
 
 def _get_logprobs_impl(logprob_vals: torch.Tensor, logprob_idxs: torch.Tensor, logprob_nums: torch.Tensor,
-                       output_ids: List[int], logprobs: int, offset: int):
+                       output_ids: list[int], logprobs: int, offset: int):
     """Get logprob of each generated token.
 
     Args:
@@ -432,7 +432,7 @@ def _get_logprobs_impl(logprob_vals: torch.Tensor, logprob_idxs: torch.Tensor, l
             1024 is the max_logprobs that turbomind engine can output
         logprob_idxs (torch.Tensor): shape (max_new_tokens, 1024)
         logprob_nums (torch.Tensor): shape (max_new_tokens,)
-        output_ids (List[int]): new generated token ids
+        output_ids (list[int]): new generated token ids
         logprobs (int): top n logprobs to return
         offset (int): offset to index logprob_vals, logprob_idxs and logprob_nums.
             It indicates where to start getting logprobs for the current generated tokens `output_ids`
@@ -562,7 +562,7 @@ def _create_model_instance(self):
         model_inst = self.tm_model.model_comm.create_request()
         return model_inst
 
-    def _get_extra_output_processors(self, outputs: Dict[str, torch.Tensor], gen_config: GenerationConfig,
+    def _get_extra_output_processors(self, outputs: dict[str, torch.Tensor], gen_config: GenerationConfig,
                                      input_len: int, metrics: '_tm.RequestMetrics'):
 
         def _get_offset(type):
@@ -586,8 +586,8 @@ def prepare_embeddings(self, input_embeddings=None, input_embedding_ranges=None)
         if not input_embeddings:
             return None, None
 
-        assert isinstance(input_embeddings, List)
-        assert isinstance(input_embedding_ranges, List)
+        assert isinstance(input_embeddings, list)
+        assert isinstance(input_embedding_ranges, list)
         assert len(input_embeddings) == len(input_embedding_ranges)
 
         length = sum([x.shape[0] for x in input_embeddings])
@@ -605,7 +605,7 @@ def prepare_embeddings(self, input_embeddings=None, input_embedding_ranges=None)
 
         return values, ranges
 
-    def prepare_mrope(self, input_meta: Dict[str, Any], input_len: int):
+    def prepare_mrope(self, input_meta: dict[str, Any], input_len: int):
         mrope_position_ids = input_meta['mrope_position_ids']
         mrope_position_delta = input_meta['mrope_position_delta']
         assert mrope_position_ids.size(-1) == input_len
@@ -617,7 +617,7 @@ def prepare_inputs(self,
                        gen_config: GenerationConfig,
                        input_embeddings=None,
                        input_embedding_ranges=None,
-                       input_meta: Dict[str, Any] = None):
+                       input_meta: dict[str, Any] = None):
         """Convert inputs format."""
         assert isinstance(input_ids, Sequence)
 
@@ -661,7 +661,7 @@ async def async_stream_infer(self,
                                  input_ids,
                                  input_embeddings=None,
                                  input_embedding_ranges=None,
-                                 input_meta: Dict[str, Any] = None,
+                                 input_meta: dict[str, Any] = None,
                                  sequence_start: bool = True,
                                  sequence_end: bool = False,
                                  step=0,
@@ -673,8 +673,8 @@ async def async_stream_infer(self,
         Args:
             session_id (int): the id of a session
             input_ids (numpy.ndarray): the token ids of a prompt
-            input_embeddings (List[numpy.ndarray]): embeddings features
-            input_embedding_ranges (List[Tuple[int,int]]): the begin/end
+            input_embeddings (list[numpy.ndarray]): embeddings features
+            input_embedding_ranges (list[tuple[int,int]]): the begin/end
               offsets of input_embeddings to input_ids
             sequence_start (bool): indicator for starting a sequence
             sequence_end (bool): indicator for ending a sequence
diff --git a/lmdeploy/version.py b/lmdeploy/version.py
index 54a2e46ee5..a5f42d62cc 100644
--- a/lmdeploy/version.py
+++ b/lmdeploy/version.py
@@ -1,11 +1,10 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Tuple
 
 __version__ = '0.12.1'
 short_version = __version__
 
 
-def parse_version_info(version_str: str) -> Tuple:
+def parse_version_info(version_str: str) -> tuple:
     """Parse version from a string.
 
     Args:
diff --git a/lmdeploy/vl/engine.py b/lmdeploy/vl/engine.py
index a4c926b1d7..8cd179df8a 100644
--- a/lmdeploy/vl/engine.py
+++ b/lmdeploy/vl/engine.py
@@ -3,7 +3,7 @@
 import asyncio
 import inspect
 from concurrent.futures import ThreadPoolExecutor
-from typing import Any, Dict, List, Optional, Union
+from typing import Any
 
 import torch
 
@@ -37,7 +37,7 @@ def __init__(
         model_path: str,
         backend: str,
         vision_config: VisionConfig = None,
-        backend_config: Optional[Union[TurbomindEngineConfig, PytorchEngineConfig]] = None,
+        backend_config: TurbomindEngineConfig | PytorchEngineConfig | None = None,
     ):
         self.model = load_vl_model(model_path, backend, backend_config=backend_config)
         if vision_config is None:
@@ -48,8 +48,8 @@ def __init__(
         torch.cuda.empty_cache()
 
     async def preprocess(self,
-                         messages: List[Dict],
-                         mm_processor_kwargs: Optional[Dict[str, Any]] = None) -> List[Dict]:
+                         messages: list[dict],
+                         mm_processor_kwargs: dict[str, Any] | None = None) -> list[dict]:
         """Preprocess multimodal data in the messages."""
         if _accepts_arg(self.model.preprocess, 'mm_processor_kwargs'):
             future = asyncio.get_event_loop().run_in_executor(self.executor, self.model.preprocess, messages,
@@ -60,11 +60,11 @@ async def preprocess(self,
         outputs = await future
         return outputs
 
-    async def async_infer(self, messages: List[Dict]) -> List[Dict]:
+    async def async_infer(self, messages: list[dict]) -> list[dict]:
         """Get multimodal embedding.
 
         Args:
-            messages (List[Dict]): a list of message, which is the output
+            messages (list[dict]): a list of messages, which is the output
             of `preprocess()`
         """
         future = asyncio.get_event_loop().run_in_executor(self.executor, self.model.forward, messages,
@@ -75,28 +75,30 @@ async def async_infer(self, messages: List[Dict]) -> List[Dict]:
 
     async def wrap_for_pytorch(
         self,
-        messages: List[Dict],
+        messages: list[dict],
         chat_template,
         tokenizer,
         sequence_start,
-        tools: Optional[List[object]] = None,
-        chat_template_kwargs: Optional[Dict] = None,
-    ) -> List[Dict]:
+        tools: list[object] | None = None,
+        chat_template_kwargs: dict | None = None,
+    ) -> list[dict]:
         """
         Args:
-            messages (List[Dict]): a list of message, which is supposed to be
+            messages (list[dict]): a list of messages, which is supposed to be
                 the output of `preprocess`
+
         Returns:
-            a dict which will be passed to pytorch engine_instance's forward.
-            The dict is like the following:
-            Dict(
-                'prompt': 'the prompt after applying chat template'
-                'input_ids': [],
-                'multimodal': {
-                    'pixel_values': torch.Tensor,
-                    ...
-                ]
-            )
+            list[dict]: a list of dicts passed to pytorch engine_instance's forward.
+                Each dict has the following structure::
+
+                    {
+                        'prompt': 'the prompt after applying chat template',
+                        'input_ids': [],
+                        'multimodal': {
+                            'pixel_values': torch.Tensor,
+                            ...
+                        },
+                    }
         """
         has_input_ids = self.model.has_input_ids(messages)
         if not has_input_ids:
@@ -110,32 +112,35 @@ async def wrap_for_pytorch(
             result = self.model.to_pytorch_with_input_ids(messages)
         # clear data
         for i, message in enumerate(messages):
-            if isinstance(message['content'], List):
+            if isinstance(message['content'], list):
                 messages[i]['preprocess'] = None
         return result
 
     async def wrap_for_turbomind(
         self,
-        messages: List[Dict],
+        messages: list[dict],
         chat_template,
         tokenizer,
         sequence_start,
-        tools: Optional[List[object]] = None,
-        chat_template_kwargs: Optional[Dict] = None,
-    ) -> Dict:
+        tools: list[object] | None = None,
+        chat_template_kwargs: dict | None = None,
+    ) -> dict:
         """
         Args:
-            messages (List[Dict]): a list of message, which is supposed to be
+            messages (list[dict]): a list of messages, which is supposed to be
                 the output of `async_infer`
+
         Returns:
-            a dict which will be passed to pytorch engine_instance's forward.
-            The dict is like the following:
-            Dict(
-                'prompt': 'the prompt after applying chat template'
-                'input_ids': [],
-                'input_embeddings': list[torch.Tensor],
-                'input_embedding_ranges': list[torch.Tensor],
-                ...
+            dict: a dict passed to turbomind engine_instance's forward.
+                The dict has the following structure::
+
+                    {
+                        'prompt': 'the prompt after applying chat template',
+                        'input_ids': [],
+                        'input_embeddings': list[torch.Tensor],
+                        'input_embedding_ranges': list[torch.Tensor],
+                        ...
+                    }
         """
         result = self.model.to_turbomind(messages,
                                          chat_template,
@@ -145,7 +150,7 @@ async def wrap_for_turbomind(
                                          chat_template_kwargs=chat_template_kwargs)
         # clear data
         for i, message in enumerate(messages):
-            if isinstance(message['content'], List):
+            if isinstance(message['content'], list):
                 messages[i]['preprocess'] = None
                 messages[i]['forward'] = None
         return result
diff --git a/lmdeploy/vl/model/base.py b/lmdeploy/vl/model/base.py
index 521cbc7985..b86c34010e 100644
--- a/lmdeploy/vl/model/base.py
+++ b/lmdeploy/vl/model/base.py
@@ -1,7 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from abc import ABC, abstractmethod
 from itertools import groupby
-from typing import Dict, List, Union
 
 import numpy as np
 from mmengine import Registry
@@ -14,12 +13,12 @@
 
 class VisionModel(ABC):
     """Visual model which extract image feature."""
-    _arch: Union[str, List[str]] = None
+    _arch: str | list[str] = None
 
     def __init__(self,
                  model_path: str,
                  with_llm: bool = False,
-                 max_memory: Dict[int, int] = None,
+                 max_memory: dict[int, int] = None,
                  hf_config: AutoConfig = None,
                  backend: str = ''):
         """init."""
@@ -62,7 +61,7 @@ def build_model(self, ):
             raise NotImplementedError()
 
     @abstractmethod
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Preprocess multimodal data in the messages.
 
         The derived class,
@@ -71,7 +70,7 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         It can integrate the result into the messages list, or insert it to
         the individual image item.
         Args:
-            message(Dict): multimodal data in a dict, which is as follows:
+            messages (list[dict]): multimodal data in a list of dicts, as follows:
             [
                 {'role': 'user', 'content': 'user prompt'},
                 {'role': 'assisant', 'content': 'AI reponse'},
@@ -105,24 +104,24 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         """  # noqa
         raise NotImplementedError()
 
-    def has_input_ids(self, messages: List[Dict]) -> bool:
+    def has_input_ids(self, messages: list[dict]) -> bool:
         """Check whether the messages contain input_ids directly.
 
         Args:
-            messages (List[Dict]): a list of message, which is supposed to be
+            messages (list[dict]): a list of messages, which is supposed to be
                 the output of `preprocess`
         Returns:
             bool: whether the messages contain input_ids directly
         """
         users = [x['content'] for x in messages if x['role'] == 'user']
-        return len(users) == 1 and isinstance(users[0], List) and isinstance(users[0][0].get('text', ''), List)
+        return len(users) == 1 and isinstance(users[0], list) and isinstance(users[0][0].get('text', ''), list)
 
-    def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
+    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         """Extract image feature. ONLY implement it when the backend is
         turbomind engine.
 
         Args:
-            messages(List[Dict]): the outputs of `preprocess`
+            messages(list[dict]): the outputs of `preprocess`
             max_batch_size(int): the max batch size when forwarding vision
                 model
         Return:
@@ -138,7 +137,7 @@ def to_pytorch(self, messages, chat_template, tokenizer, sequence_start, chat_te
         pytorch engine.
 
         Args:
-            messages(List[Dict]): the output of `preprocess`
+            messages(list[dict]): the output of `preprocess`
             chat_template: the chat template defined in `lmdeploy/model.py`
             tokenzer: the tokenizer model
             sequence_start: starting flag of a sequence
@@ -154,7 +153,7 @@ def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, chat_
         turbomind engine.
 
         Args:
-            messages(List[Dict]): the output of `preprocess`
+            messages(list[dict]): the output of `preprocess`
             chat_template: the chat template defined in `lmdeploy/model.py`
             tokenzer: the tokenizer model
             sequence_start: starting flag of a sequence
@@ -171,13 +170,13 @@ def collect_images(messages):
         to RGB color space.
 
         Args:
-            messages (List[Tuple[Image, Dict]]): a list of images with their
+            messages (list[tuple[Image, dict]]): a list of images with their
                 corresponding parameters
         """  # noqa
         images = []
         for message in messages:
             content = message['content']
-            if not isinstance(content, List):
+            if not isinstance(content, list):
                 continue
             images.extend([(x['image'], {
                 k: v
@@ -191,13 +190,13 @@ def collect_time_series(messages):
         from the messages and compile them into a single list.
 
         Args:
-            messages (List[Tuple[np.ndarray, Dict]]): a list of time
+            messages (list[tuple[np.ndarray, dict]]): a list of time
                 series data with their corresponding parameters
         """  # noqa
         time_series = []
         for message in messages:
             content = message['content']
-            if not isinstance(content, List):
+            if not isinstance(content, list):
                 continue
             time_series.extend([(x['time_series'], {
                 k: v
@@ -210,7 +209,7 @@ def IMAGE_TOKEN_included(messages):
         """Check whether the IMAGE_TOKEN is included in the messages.
 
         Args:
-            messages (List[Dict]): a list of message
+            messages (list[dict]): a list of messages
         Returns:
             bool: whether the IMAGE_TOKEN is included in the messages
         """
@@ -220,7 +219,7 @@ def IMAGE_TOKEN_included(messages):
                 continue
             if isinstance(content, str) and '<IMAGE_TOKEN>' in content:
                 return True
-            elif isinstance(content, List):
+            elif isinstance(content, list):
                 content = [x['text'] for x in content if x['type'] == 'text']
                 if any('<IMAGE_TOKEN>' in x for x in content):
                     return True
@@ -231,7 +230,7 @@ def to_pytorch_with_input_ids(self, messages):
         required by pytorch engine when input_ids are provided directly.
 
         Args:
-            messages(List[Dict]): the output of `preprocess`
+            messages(list[dict]): the output of `preprocess`
         """
         # collect all preprocessing result from messages
         preps = [x['content'] for x in messages if x['role'] == 'preprocess']
@@ -268,7 +267,7 @@ def to_pytorch_aux(self, messages, prompt, IMAGE_TOKEN, tokenizer, sequence_star
         compatible with what is required by pytorch engine.
 
         Args:
-            messages(List[Dict]): the output of `preprocess`
+            messages(list[dict]): the output of `preprocess`
             prompt(str): the prompt after applying chat template
             IMAGE_TOKEN(str): a placeholder where image tokens will be
                 inserted
@@ -303,7 +302,7 @@ def to_turbomind_aux(self, messages, prompt, IMAGE_TOKEN, tokenizer, sequence_st
         compatible with what is required by turbomind engine.
 
         Args:
-            messages(List[Dict]): the output of `preprocess`
+            messages(list[dict]): the output of `preprocess`
             prompt(str): the prompt after applying chat template
             IMAGE_TOKEN(str): a placeholder where image tokens will be
                 inserted
diff --git a/lmdeploy/vl/model/builder.py b/lmdeploy/vl/model/builder.py
index 11262483e5..04ac5ab759 100644
--- a/lmdeploy/vl/model/builder.py
+++ b/lmdeploy/vl/model/builder.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import os
-from typing import Optional, Union
 
 import torch
 
@@ -40,7 +39,7 @@
 def load_vl_model(model_path: str,
                   backend: str,
                   with_llm: bool = False,
-                  backend_config: Optional[Union[TurbomindEngineConfig, PytorchEngineConfig]] = None):
+                  backend_config: TurbomindEngineConfig | PytorchEngineConfig | None = None):
     """Load visual model.
 
     Args:
diff --git a/lmdeploy/vl/model/cogvlm.py b/lmdeploy/vl/model/cogvlm.py
index ca5e41a96f..03f2ef224f 100644
--- a/lmdeploy/vl/model/cogvlm.py
+++ b/lmdeploy/vl/model/cogvlm.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict, List
 
 from lmdeploy.utils import get_logger
 from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
@@ -39,7 +38,7 @@ def build_model(self):
         else:
             raise NotImplementedError('turbomind has not supported cogvlm yet')
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refer to the spec of `super().preprocess`"""
         images = self.collect_images(messages)
         outputs = []
diff --git a/lmdeploy/vl/model/deepseek.py b/lmdeploy/vl/model/deepseek.py
index 0a1f6c12e9..154eb95c4f 100644
--- a/lmdeploy/vl/model/deepseek.py
+++ b/lmdeploy/vl/model/deepseek.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import warnings
-from typing import Dict, List
 
 import torch
 from transformers import AutoModelForCausalLM
@@ -86,7 +85,7 @@ def build_model(self):
         self.vision_model = model.vision_model.eval()
         self.aligner = model.aligner.eval()
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refers to the spec of `super.preprocess()"""
         images = self.collect_images(messages)
         outputs = []
@@ -105,12 +104,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         return messages
 
     @torch.no_grad()
-    def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
+    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         """Extract image feature. ONLY implement it when the backend is
         turbomind engine.
 
         Args:
-            messages(List[Dict]): the outputs of `preprocess`
+            messages(list[dict]): the outputs of `preprocess`
             max_batch_size(int): the max batch size when forwarding vision
                 model
         Return:
diff --git a/lmdeploy/vl/model/deepseek_vl2.py b/lmdeploy/vl/model/deepseek_vl2.py
index a2e4b034ca..13fb71c950 100644
--- a/lmdeploy/vl/model/deepseek_vl2.py
+++ b/lmdeploy/vl/model/deepseek_vl2.py
@@ -1,7 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import os
 from contextlib import redirect_stdout
-from typing import Dict, List
 
 import torch
 from transformers import AutoConfig
@@ -67,7 +66,7 @@ def build_model(self):
         # TODO, implement for tubomind engine
         raise NotImplementedError()
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refers to the spec of `super.preprocess()"""
         images = self.collect_images(messages)
 
@@ -80,8 +79,8 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
             formatted_messages.append(dict(role=message['role'], content=text_content, images=image_content))
 
         # NOTE: DeepseekVLV2Processor inputs
-        # conversations (List[Dict]): conversations with a list of messages;
-        # images (List[ImageType]): the list of images;
+        # conversations (list[dict]): conversations with a list of messages;
+        # images (list[ImageType]): the list of images;
         # force_batchify (bool): force batchify the inputs;
         # inference_mode (bool): if True, then remove the last eos token;
         prepare = self.image_processor(conversations=formatted_messages,
@@ -103,12 +102,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         return messages
 
     @torch.no_grad()
-    def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
+    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         """Extract image feature. ONLY implement it when the backend is
         turbomind engine.
 
         Args:
-            messages(List[Dict]): the outputs of `preprocess`
+            messages(list[dict]): the outputs of `preprocess`
             max_batch_size(int): the max batch size when forwarding vision
                 model
         Return:
diff --git a/lmdeploy/vl/model/gemma3_vl.py b/lmdeploy/vl/model/gemma3_vl.py
index c2879a6b83..4a0aa7a45c 100644
--- a/lmdeploy/vl/model/gemma3_vl.py
+++ b/lmdeploy/vl/model/gemma3_vl.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict, List, Optional
 
 import torch
 from transformers import AutoConfig, AutoProcessor
@@ -12,11 +11,11 @@
 
 
 class Gemma3ImagesKwargs(ImagesKwargs):
-    do_pan_and_scan: Optional[bool]
-    pan_and_scan_min_crop_size: Optional[int]
-    pan_and_scan_max_num_crops: Optional[int]
-    pan_and_scan_min_ratio_to_activate: Optional[float]
-    do_convert_rgb: Optional[bool]
+    do_pan_and_scan: bool | None
+    pan_and_scan_min_crop_size: int | None
+    pan_and_scan_max_num_crops: int | None
+    pan_and_scan_min_ratio_to_activate: float | None
+    do_convert_rgb: bool | None
 
 
 class Gemma3ProcessorKwargs(ProcessingKwargs, total=False):
@@ -43,7 +42,7 @@ class Gemma3VisionModel(VisionModel):
     def __init__(self,
                  model_path: str,
                  with_llm: bool = False,
-                 max_memory: Dict[int, int] = None,
+                 max_memory: dict[int, int] = None,
                  hf_config: AutoConfig = None,
                  backend: str = ''):
         super().__init__(model_path, with_llm, max_memory, hf_config, backend)
@@ -61,7 +60,7 @@ def build_model(self):
         # TODO, implement for tubomind engine
         raise NotImplementedError()
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refers to `super.preprocess() for spec."""
         from transformers.image_utils import make_nested_list_of_images
         output_kwargs = self.processor._merge_kwargs(
@@ -91,12 +90,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         return messages
 
     @torch.no_grad()
-    def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
+    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         """Extract image feature. ONLY implement it when the backend is
         turbomind engine.
 
         Args:
-            messages(List[Dict]): the outputs of `preprocess`
+            messages(list[dict]): the outputs of `preprocess`
             max_batch_size(int): the max batch size when forwarding vision
                 model
         Return:
diff --git a/lmdeploy/vl/model/glm4_1v.py b/lmdeploy/vl/model/glm4_1v.py
index 3b4b2ab937..27f5e30eeb 100644
--- a/lmdeploy/vl/model/glm4_1v.py
+++ b/lmdeploy/vl/model/glm4_1v.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict, List
 
 from transformers import AutoConfig
 
@@ -33,7 +32,7 @@ def build_preprocessor(self):
     def build_model(self):
         raise NotImplementedError('turbomind has not supported glm4v yet')
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refer to `super().preprocess()` for spec."""
         images = self.collect_images(messages)
         optional_keys = {'resized_height', 'resized_width', 'min_pixels', 'max_pixels'}
diff --git a/lmdeploy/vl/model/glm4_v.py b/lmdeploy/vl/model/glm4_v.py
index 81dffbf1ca..1c6af44c6e 100644
--- a/lmdeploy/vl/model/glm4_v.py
+++ b/lmdeploy/vl/model/glm4_v.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict, List
 
 from transformers import AutoConfig
 
@@ -44,11 +43,11 @@ def build_model(self):
         else:
             raise NotImplementedError('turbomind has not supported glm4v yet')
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refers to the spec of `super.preprocess()"""
         outputs = []
         for message in messages:
-            if not isinstance(message['content'], List):
+            if not isinstance(message['content'], list):
                 continue
             images = [x['image'] for x in message['content'] if x['type'] == 'image']
             if len(images) > 1:
diff --git a/lmdeploy/vl/model/interns1_pro.py b/lmdeploy/vl/model/interns1_pro.py
index e11efbb32a..b0daf4fc2e 100644
--- a/lmdeploy/vl/model/interns1_pro.py
+++ b/lmdeploy/vl/model/interns1_pro.py
@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 import numpy as np
 import torch
@@ -40,7 +40,7 @@ def build_preprocessor(self):
         self.ts_token = getattr(self.processor, 'ts_token', None)
         self.ts_token_id = getattr(self.processor, 'ts_token_id', None)
 
-    def get_processor_args(self, mm_processor_kwargs: Optional[Dict[str, Any]] = None):
+    def get_processor_args(self, mm_processor_kwargs: dict[str, Any] | None = None):
         min_pixels = self.processor.image_processor.size['shortest_edge']
         max_pixels = self.processor.image_processor.size['longest_edge']
 
@@ -112,7 +112,7 @@ def time_series_processor(self, ts_input, sr):
 
         return dict(ts_values=[ts_input], ts_sr=[sr], ts_lens=[ts_len], num_ts_tokens=[num_ts_tokens])
 
-    def preprocess(self, messages: List[Dict], mm_processor_kwargs: Optional[Dict[str, Any]] = None) -> List[Dict]:
+    def preprocess(self, messages: list[dict], mm_processor_kwargs: dict[str, Any] | None = None) -> list[dict]:
         """Refer to `super().preprocess()` for spec."""
 
         self.check_time_series_input(messages)
@@ -153,7 +153,7 @@ def proc_messages(self,
                       messages,
                       chat_template,
                       sequence_start,
-                      tools: Optional[List[object]] = None,
+                      tools: list[object] | None = None,
                       chat_template_kwargs=None):
         """Apply chat template to get the prompt."""
         chat_template_kwargs = chat_template_kwargs or {}
@@ -187,7 +187,7 @@ def ts_to_pytorch_aux(self, messages, prompt, TS_TOKEN, tokenizer, sequence_star
         compatible with what is required by pytorch engine.
 
         Args:
-            messages(List[Dict]): the output of `preprocess`
+            messages(list[dict]): the output of `preprocess`
             prompt(str): the prompt after applying chat template
             TS_TOKEN(str): a placeholder where time series tokens will be
                 inserted
@@ -231,8 +231,8 @@ def to_pytorch(self,
                    chat_template,
                    tokenizer,
                    sequence_start,
-                   tools: Optional[List[object]] = None,
-                   chat_template_kwargs: Optional[Dict] = None,
+                   tools: list[object] | None = None,
+                   chat_template_kwargs: dict | None = None,
                    **kwargs):
         """Return to the information needed by pytorch engine."""
         if self.has_time_series_input:
@@ -257,7 +257,7 @@ def build_model(self):
         pass
 
     @torch.no_grad()
-    def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
+    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         # TODO: implement for turbomind
         pass
 
@@ -266,7 +266,7 @@ def to_turbomind(self,
                      chat_template,
                      tokenizer,
                      sequence_start,
-                     chat_template_kwargs: Optional[Dict] = None,
+                     chat_template_kwargs: dict | None = None,
                      **kwargs):
         # TODO: implement for turbomind
         pass
diff --git a/lmdeploy/vl/model/internvl.py b/lmdeploy/vl/model/internvl.py
index 9036866818..1c170daecb 100644
--- a/lmdeploy/vl/model/internvl.py
+++ b/lmdeploy/vl/model/internvl.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict, List, Optional
 
 import torch
 from transformers import AutoConfig, AutoModel, AutoTokenizer, CLIPImageProcessor
@@ -72,7 +71,7 @@ class InternVLVisionModel(VisionModel):
     def __init__(self,
                  model_path: str,
                  with_llm: bool = False,
-                 max_memory: Dict[int, int] = None,
+                 max_memory: dict[int, int] = None,
                  hf_config: AutoConfig = None,
                  backend: str = ''):
         super().__init__(model_path, with_llm, max_memory, hf_config, backend)
@@ -190,7 +189,7 @@ def _forward(self, inputs, max_batch_size):
             outputs.extend([x.squeeze() for x in feats])
         return outputs
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refers to `super.preprocess() for spec."""
         images = self.collect_images(messages)
         outputs = []
@@ -207,12 +206,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         return messages
 
     @torch.no_grad()
-    def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
+    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         """Extract image feature. ONLY implement it when the backend is
         turbomind engine.
 
         Args:
-            messages(List[Dict]): the outputs of `preprocess`
+            messages(list[dict]): the outputs of `preprocess`
             max_batch_size(int): the max batch size when forwarding vision
                 model
         Return:
@@ -229,8 +228,8 @@ def proc_messages(
         messages,
         chat_template,
         sequence_start,
-        tools: Optional[List[object]] = None,
-        chat_template_kwargs: Optional[Dict] = None,
+        tools: list[object] | None = None,
+        chat_template_kwargs: dict | None = None,
     ):
         chat_template_kwargs = chat_template_kwargs or {}
         """Apply chat template to get the prompt."""
@@ -272,8 +271,8 @@ def to_pytorch(self,
                    chat_template,
                    tokenizer,
                    sequence_start,
-                   tools: Optional[List[object]] = None,
-                   chat_template_kwargs: Optional[Dict] = None,
+                   tools: list[object] | None = None,
+                   chat_template_kwargs: dict | None = None,
                    **kwargs):
         prompt, IMAGE_TOKEN = self.proc_messages(messages,
                                                  chat_template,
@@ -287,8 +286,8 @@ def to_turbomind(self,
                      chat_template,
                      tokenizer,
                      sequence_start,
-                     tools: Optional[List[object]] = None,
-                     chat_template_kwargs: Optional[Dict] = None,
+                     tools: list[object] | None = None,
+                     chat_template_kwargs: dict | None = None,
                      **kwargs):
         prompt, IMAGE_TOKEN = self.proc_messages(messages,
                                                  chat_template,
diff --git a/lmdeploy/vl/model/internvl3_hf.py b/lmdeploy/vl/model/internvl3_hf.py
index 85eb40bbdc..c28d278de6 100644
--- a/lmdeploy/vl/model/internvl3_hf.py
+++ b/lmdeploy/vl/model/internvl3_hf.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict, List, Optional
 
 import torch
 from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoProcessor
@@ -13,9 +12,9 @@
 
 
 class InternVLImagesKwargs(ImagesKwargs, total=False):
-    crop_to_patches: Optional[bool]
-    min_patches: Optional[int]
-    max_patches: Optional[int]
+    crop_to_patches: bool | None
+    min_patches: int | None
+    max_patches: int | None
 
 
 class InternVLProcessorKwargs(ProcessingKwargs, total=False):
@@ -40,7 +39,7 @@ class InternVL3VisionModel(InternVLVisionModel):
     def __init__(self,
                  model_path: str,
                  with_llm: bool = False,
-                 max_memory: Dict[int, int] = None,
+                 max_memory: dict[int, int] = None,
                  hf_config: AutoConfig = None,
                  backend: str = ''):
         super().__init__(model_path, with_llm, max_memory, hf_config, backend)
@@ -83,7 +82,7 @@ def build_model(self):
         # avoid randomness in inference.
         self.model = model.eval()
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refers to `super.preprocess() for spec."""
         from transformers.image_utils import make_flat_list_of_images
         output_kwargs = self.processor._merge_kwargs(
@@ -116,12 +115,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         return messages
 
     @torch.no_grad()
-    def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
+    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         """Extract image feature. ONLY implement it when the backend is
         turbomind engine.
 
         Args:
-            messages(List[Dict]): the outputs of `preprocess`
+            messages(list[dict]): the outputs of `preprocess`
             max_batch_size(int): the max batch size when forwarding vision
                 model
         Return:
diff --git a/lmdeploy/vl/model/internvl_llava.py b/lmdeploy/vl/model/internvl_llava.py
index 67cabfa087..d521bab9fb 100644
--- a/lmdeploy/vl/model/internvl_llava.py
+++ b/lmdeploy/vl/model/internvl_llava.py
@@ -2,7 +2,6 @@
 
 import warnings
 from contextlib import contextmanager
-from typing import Dict, List
 
 import torch
 from transformers import AutoConfig, AutoModelForCausalLM
@@ -125,17 +124,17 @@ def build_model(self):
         self.vision_tower = model.model.vision_tower.eval()
         self.mm_projector = model.model.mm_projector.eval()
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refer to `super().preprocess() for spec."""
         return super().preprocess(messages)
 
     @torch.no_grad()
-    def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
+    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         """Extract image feature. ONLY implement it when the backend is
         turbomind engine.
 
         Args:
-            messages(List[Dict]): the outputs of `preprocess`
+            messages(list[dict]): the outputs of `preprocess`
             max_batch_size(int): the max batch size when forwarding vision
                 model
         Return:
diff --git a/lmdeploy/vl/model/llama4.py b/lmdeploy/vl/model/llama4.py
index e0752d7b99..d0e03fd16e 100644
--- a/lmdeploy/vl/model/llama4.py
+++ b/lmdeploy/vl/model/llama4.py
@@ -1,5 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict, List
 
 import torch
 from transformers import AutoConfig
@@ -58,7 +57,7 @@ def build_model(self):
         # TODO, implement for tubomind engine
         raise NotImplementedError()
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refers to `super.preprocess() for spec."""
         images = self.collect_images(messages)
         outputs = []
@@ -84,12 +83,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         return messages
 
     @torch.no_grad()
-    def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
+    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         """Extract image feature. ONLY implement it when the backend is
         turbomind engine.
 
         Args:
-            messages(List[Dict]): the outputs of `preprocess`
+            messages(list[dict]): the outputs of `preprocess`
             max_batch_size(int): the max batch size when forwarding vision
                 model
         Return:
@@ -123,7 +122,7 @@ def to_pytorch_aux(self, messages, prompt, IMAGE_TOKEN, tokenizer, sequence_star
         compatible with what is required by pytorch engine.
 
         Args:
-            messages(List[Dict]): the output of `preprocess`
+            messages(list[dict]): the output of `preprocess`
             prompt(str): the prompt after applying chat template
             IMAGE_TOKEN(str): a placeholder where image tokens will be
                 inserted
diff --git a/lmdeploy/vl/model/llava.py b/lmdeploy/vl/model/llava.py
index 91da486643..f10bfadf32 100644
--- a/lmdeploy/vl/model/llava.py
+++ b/lmdeploy/vl/model/llava.py
@@ -5,7 +5,6 @@
 import math
 import warnings
 from contextlib import contextmanager
-from typing import Dict, List
 
 import torch
 from PIL import Image
@@ -295,7 +294,7 @@ def encode_images(self, images: torch.Tensor) -> torch.Tensor:
         image_features = self.mm_projector(image_features)
         return image_features
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refer to `super().preprocess() for spec."""
         images = self.collect_images(messages)
         outputs = []
@@ -311,12 +310,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         return messages
 
     @torch.no_grad()
-    def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
+    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         """Extract image feature. ONLY implement it when the backend is
         turbomind engine.
 
         Args:
-            messages(List[Dict]): the outputs of `preprocess`
+            messages(list[dict]): the outputs of `preprocess`
             max_batch_size(int): the max batch size when forwarding vision
                 model
         Return:
diff --git a/lmdeploy/vl/model/llava_hf.py b/lmdeploy/vl/model/llava_hf.py
index c66f68be68..80476b55fb 100644
--- a/lmdeploy/vl/model/llava_hf.py
+++ b/lmdeploy/vl/model/llava_hf.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import warnings
-from typing import Dict, List
 
 import torch
 from transformers import AutoProcessor
@@ -55,7 +54,7 @@ def build_model(self):
         model.eval()
         self.model = model
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refers to `super.preprocess() for spec."""
         images = self.collect_images(messages)
         outputs = []
@@ -71,12 +70,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         return messages
 
     @torch.no_grad()
-    def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
+    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         """Extract image feature. ONLY implement it when the backend is
         turbomind engine.
 
         Args:
-            messages(List[Dict]): the outputs of `preprocess`
+            messages(list[dict]): the outputs of `preprocess`
             max_batch_size(int): the max batch size when forwarding vision
                 model
         Return:
diff --git a/lmdeploy/vl/model/llava_next.py b/lmdeploy/vl/model/llava_next.py
index b705f237b8..dd23572d05 100644
--- a/lmdeploy/vl/model/llava_next.py
+++ b/lmdeploy/vl/model/llava_next.py
@@ -1,7 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import itertools
 import warnings
-from typing import Dict, List
 
 import torch
 
@@ -63,7 +62,7 @@ def build_model(self):
                                          dtype=torch.half)
         self.model.eval()
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refers to the spec of `super.preprocess()"""
         from transformers.models.llava_next.modeling_llava_next import image_size_to_num_patches
         images = self.collect_images(messages)
@@ -99,12 +98,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         return messages
 
     @torch.no_grad()
-    def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
+    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         """Extract image feature. ONLY implement it when the backend is
         turbomind engine.
 
         Args:
-            messages(List[Dict]): the outputs of `preprocess`
+            messages(list[dict]): the outputs of `preprocess`
             max_batch_size(int): the max batch size when forwarding vision
                 model
         Return:
diff --git a/lmdeploy/vl/model/minicpmv.py b/lmdeploy/vl/model/minicpmv.py
index b746f345ba..09d00fc298 100644
--- a/lmdeploy/vl/model/minicpmv.py
+++ b/lmdeploy/vl/model/minicpmv.py
@@ -1,7 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import itertools
 import warnings
-from typing import Dict, List
 
 import torch
 from PIL.Image import Image
@@ -23,7 +22,7 @@ class MiniCPMVModel(VisionModel):
     def __init__(self,
                  model_path: str,
                  with_llm: bool = False,
-                 max_memory: Dict[int, int] = None,
+                 max_memory: dict[int, int] = None,
                  hf_config: AutoConfig = None,
                  backend: str = ''):
         super().__init__(model_path, with_llm, max_memory, hf_config, backend)
@@ -94,7 +93,7 @@ def _reshape_by_patch(self, slice_images):
             tgt_sizes.append(torch.Tensor([H, W]).type(torch.int32))
         return patches, tgt_sizes
 
-    def _preprocess_v2_5(self, image: Image, params: Dict = None) -> Dict:
+    def _preprocess_v2_5(self, image: Image, params: dict = None) -> dict:
         """Image preprocessing for MiniCPM-Llama3-V-2_5."""
         slice_images, best_grid = self._get_slice_image(image)
         # pixel_values, tgt_sizes are list of torch tensors
@@ -108,7 +107,7 @@ def _preprocess_v2_5(self, image: Image, params: Dict = None) -> Dict:
             image_tokens=1,
             image_token_id=self.image_token_id)
 
-    def _preprocess_v2_6(self, image: Image, params: Dict = None) -> Dict:
+    def _preprocess_v2_6(self, image: Image, params: dict = None) -> dict:
         """Image preprocessing for MiniCPM-V-2_6."""
         max_slice_nums = self.image_processor.max_slice_nums
         use_image_id = self.image_processor.use_image_id
@@ -130,11 +129,11 @@ def _preprocess_v2_6(self, image: Image, params: Dict = None) -> Dict:
             image_token_id=self.image_token_id,
             use_image_id=use_image_id)
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refer to `super().preprocess() for spec."""
         outputs = []
         for i, message in enumerate(messages):
-            if message['role'] != 'user' or not isinstance(message['content'], List):
+            if message['role'] != 'user' or not isinstance(message['content'], list):
                 continue
             for item in message['content']:
                 if item['type'] == 'image':
@@ -146,12 +145,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         return messages
 
     @torch.no_grad()
-    def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
+    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         """Extract image feature. ONLY implement it when the backend is
         turbomind engine.
 
         Args:
-            messages(List[Dict]): the outputs of `preprocess`
+            messages(list[dict]): the outputs of `preprocess`
             max_batch_size(int): the max batch size when forwarding vision
                 model
         Return:
diff --git a/lmdeploy/vl/model/mllama.py b/lmdeploy/vl/model/mllama.py
index ab0949fe03..bcf0070ec9 100644
--- a/lmdeploy/vl/model/mllama.py
+++ b/lmdeploy/vl/model/mllama.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Dict, List
 
 from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
 
@@ -24,7 +23,7 @@ def build_preprocessor(self):
         self.processor = AutoProcessor.from_pretrained(self.model_path)
         self.image_token_id = 128256
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refer to the spec of `super().preprocess`"""
         images = self.collect_images(messages)
         outputs = []
diff --git a/lmdeploy/vl/model/molmo.py b/lmdeploy/vl/model/molmo.py
index c2a12e8412..90b8cb932a 100644
--- a/lmdeploy/vl/model/molmo.py
+++ b/lmdeploy/vl/model/molmo.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Dict, List
 
 import torch
 from transformers import AutoModelForCausalLM, AutoProcessor
@@ -50,10 +49,10 @@ def build_model(self):
         # avoid randomness in inference.
         self.model = model.eval()
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refer to the `super.preprocess() for spec."""
         for i, message in enumerate(messages):
-            if not isinstance(message['content'], List):
+            if not isinstance(message['content'], list):
                 continue
             images = [x['image'] for x in message['content'] if x['type'] == 'image']
             content = [x.get('text', '') for x in message['content'] if x['type'] == 'text']
@@ -75,12 +74,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         return messages
 
     @torch.no_grad()
-    def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
+    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         """Extract image feature. ONLY implement it when the backend is
         turbomind engine.
 
         Args:
-            messages(List[Dict]): the outputs of `preprocess`
+            messages(list[dict]): the outputs of `preprocess`
             max_batch_size(int): the max batch size when forwarding vision
                 model
         Return:
@@ -131,7 +130,7 @@ def proc_messages(messages):
         IMAGE_TOKEN = '<IMAGE_TOKEN>'
         for message in messages:
             role, content = message['role'], message['content']
-            if isinstance(content, List):
+            if isinstance(content, list):
                 n_images = len([1 for x in content if x['type'] == 'image'])
                 content = [x['text'] for x in content if x['type'] == 'text']
                 prompt.append(' User: ' + (IMAGE_TOKEN + '\n') * n_images + content[0])
@@ -160,7 +159,7 @@ def to_turbomind(self, messages, chat_template, tokenizer, sequence_start, **kwa
         for i, message in enumerate(messages):
             prompt = ''
             role, content = message['role'], message['content']
-            if isinstance(content, List):
+            if isinstance(content, list):
                 forward_result = message.pop('forward')
                 input_ids = forward_result['input_ids']
                 embeddings = forward_result['embeddings']
diff --git a/lmdeploy/vl/model/phi3_vision.py b/lmdeploy/vl/model/phi3_vision.py
index 683220c29c..77736ffbd4 100644
--- a/lmdeploy/vl/model/phi3_vision.py
+++ b/lmdeploy/vl/model/phi3_vision.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Dict, List
 
 from transformers import AutoProcessor
 
@@ -29,7 +28,7 @@ def build_model(self):
         else:
             raise NotImplementedError('turbomind has not supported phi3v yet')
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refers to `super.preprocess() for spec."""
         images = self.collect_images(messages)
         outputs = []
diff --git a/lmdeploy/vl/model/qwen.py b/lmdeploy/vl/model/qwen.py
index db54fb5f57..8dfcd85021 100644
--- a/lmdeploy/vl/model/qwen.py
+++ b/lmdeploy/vl/model/qwen.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
-from typing import Dict, List
 
 import torch
 from transformers import AutoModelForCausalLM
@@ -69,7 +68,7 @@ def build_model(self):
 
         self.model = model.transformer.visual.eval()
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refers to `super.preprocess() for spec."""
         images = self.collect_images(messages)
         outputs = []
@@ -85,12 +84,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         return messages
 
     @torch.no_grad()
-    def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
+    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         """Extract image feature. ONLY implement it when the backend is
         turbomind engine.
 
         Args:
-            messages(List[Dict]): the outputs of `preprocess`
+            messages(list[dict]): the outputs of `preprocess`
             max_batch_size(int): the max batch size when forwarding vision
                 model
         Return:
diff --git a/lmdeploy/vl/model/qwen3.py b/lmdeploy/vl/model/qwen3.py
index 265bf9f729..729cf0ebbd 100644
--- a/lmdeploy/vl/model/qwen3.py
+++ b/lmdeploy/vl/model/qwen3.py
@@ -1,5 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Any, Dict, List, Optional
+from typing import Any
 
 import torch
 from transformers import AutoProcessor
@@ -32,7 +32,7 @@ def build_preprocessor(self):
         self.image_token_id = tokenizer.encode(self.image_token)[-1]
         self.mm_processor_kwargs = None
 
-    def get_processor_args(self, mm_processor_kwargs: Optional[Dict[str, Any]] = None):
+    def get_processor_args(self, mm_processor_kwargs: dict[str, Any] | None = None):
         min_pixels = self.processor.image_processor.size['shortest_edge']
         max_pixels = self.processor.image_processor.size['longest_edge']
 
@@ -68,7 +68,7 @@ def get_processor_args(self, mm_processor_kwargs: Optional[Dict[str, Any]] = Non
 
         return min_pixels, max_pixels
 
-    def preprocess(self, messages: List[Dict], mm_processor_kwargs: Optional[Dict[str, Any]] = None) -> List[Dict]:
+    def preprocess(self, messages: list[dict], mm_processor_kwargs: dict[str, Any] | None = None) -> list[dict]:
         """Refer to `super().preprocess()` for spec."""
 
         min_pixels, max_pixels = self.get_processor_args(mm_processor_kwargs)
@@ -118,7 +118,7 @@ def to_pytorch(self,
                    chat_template,
                    tokenizer,
                    sequence_start,
-                   chat_template_kwargs: Optional[Dict] = None,
+                   chat_template_kwargs: dict | None = None,
                    **kwargs):
         """Return to the information needed by pytorch engine."""
         prompt, IMAGE_TOKEN = self.proc_messages(messages, chat_template, sequence_start, chat_template_kwargs)
@@ -129,7 +129,7 @@ def build_model(self):
         pass
 
     @torch.no_grad()
-    def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
+    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         # TODO: implement for turbomind
         pass
 
@@ -138,7 +138,7 @@ def to_turbomind(self,
                      chat_template,
                      tokenizer,
                      sequence_start,
-                     chat_template_kwargs: Optional[Dict] = None,
+                     chat_template_kwargs: dict | None = None,
                      **kwargs):
         # TODO: implement for turbomind
         pass
diff --git a/lmdeploy/vl/model/utils.py b/lmdeploy/vl/model/utils.py
index e584c54c71..9791d8a9ba 100644
--- a/lmdeploy/vl/model/utils.py
+++ b/lmdeploy/vl/model/utils.py
@@ -1,8 +1,8 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
 import inspect
+from collections.abc import Callable, MutableSequence
 from contextlib import contextmanager
-from typing import Callable, MutableSequence
 
 import torch
 
diff --git a/lmdeploy/vl/model/xcomposer2.py b/lmdeploy/vl/model/xcomposer2.py
index cff3b808f5..2117b151dc 100644
--- a/lmdeploy/vl/model/xcomposer2.py
+++ b/lmdeploy/vl/model/xcomposer2.py
@@ -5,7 +5,7 @@
 import sys
 import warnings
 from contextlib import contextmanager
-from typing import Any, Dict, List, Tuple
+from typing import Any
 
 import torch
 from PIL.Image import Image
@@ -35,7 +35,7 @@ class ModelType(enum.Enum):
     XCOMPOSER2D5 = enum.auto()
 
 
-def get_xcomposer_type(model_path: str) -> Tuple[ModelType, Any]:
+def get_xcomposer_type(model_path: str) -> tuple[ModelType, Any]:
     """Get xcomposer type."""
     from transformers.dynamic_module_utils import get_class_from_dynamic_module
     match_modules = {
@@ -90,7 +90,7 @@ class Xcomposer2VisionModel(VisionModel):
     def __init__(self,
                  model_path: str,
                  with_llm: bool = False,
-                 max_memory: Dict[int, int] = None,
+                 max_memory: dict[int, int] | None = None,
                  hf_config: AutoConfig = None,
                  backend: str = ''):
         model_path = model_path.rstrip(os.sep)
@@ -180,7 +180,7 @@ def build_model(self):
 
         self.model = model.eval()
 
-    def _preprocess_2d5(self, image: Image, params: Dict) -> Dict:
+    def _preprocess_2d5(self, image: Image, params: dict) -> tuple[Any, int]:
         """Image preprocessing for internlm-xcomposer2d5-7b."""
         hd_num = params.get('hd_num', 24)
         image = self.HD_transform(image, hd_num=hd_num)
@@ -190,12 +190,12 @@ def _preprocess_2d5(self, image: Image, params: Dict) -> Dict:
         n_token_per_image = int((h * w + 1) * 400 + 1 + (h + 1) * 20)
         return pixel_values, n_token_per_image
 
-    def _preprocess_7b(self, image: Image, params: Dict) -> Dict:
+    def _preprocess_7b(self, image: Image, params: dict) -> tuple[Any, int]:
         """Image preprocessing for internlm-xcomposer2-7b."""
         pixel_values = self.vis_processor(image).unsqueeze(0).half()
         return pixel_values, 256
 
-    def _preprocess_4khd_7b(self, image: Image, params: Dict) -> Dict:
+    def _preprocess_4khd_7b(self, image: Image, params: dict) -> tuple[Any, int]:
         """Image preprocessing for internlm-xcomposer2-4khd-7b."""
         image = self.HD_transform(image, hd_num=25)
         pixel_values = self.vis_processor(image).unsqueeze(0).half()
@@ -204,7 +204,7 @@ def _preprocess_4khd_7b(self, image: Image, params: Dict) -> Dict:
         n_token_per_image = int((h * w + 1) * 144 + 1 + (h + 1) * 12)
         return pixel_values, n_token_per_image
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refer to `super().preprocess() for spec."""
         images = self.collect_images(messages)
         outputs = []
@@ -220,12 +220,12 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         return messages
 
     @torch.no_grad()
-    def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
+    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         """Extract image feature. ONLY implement it when the backend is
         turbomind engine.
 
         Args:
-            messages(List[Dict]): the outputs of `preprocess`
+            messages(list[dict]): the outputs of `preprocess`
             max_batch_size(int): the max batch size when forwarding vision
                 model
         Return:
diff --git a/lmdeploy/vl/model/yi.py b/lmdeploy/vl/model/yi.py
index 02dd1c83e5..26e52036fb 100644
--- a/lmdeploy/vl/model/yi.py
+++ b/lmdeploy/vl/model/yi.py
@@ -3,7 +3,6 @@
 import os
 from contextlib import contextmanager
 from os import path as osp
-from typing import Dict, List
 
 import torch.nn as nn
 from transformers import AutoConfig
@@ -117,7 +116,7 @@ def build_model(self):
         with init_yi_model(), disable_transformers_logging():
             super().build_model()
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refer to `super().preprocess() for spec."""
         images = self.collect_images(messages)
         outputs = []
diff --git a/lmdeploy/vl/time_series_utils.py b/lmdeploy/vl/time_series_utils.py
index 5651f65697..bc0b2c0136 100644
--- a/lmdeploy/vl/time_series_utils.py
+++ b/lmdeploy/vl/time_series_utils.py
@@ -136,7 +136,7 @@ def _load_csv(source: bytes | str) -> np.ndarray:
     if isinstance(source, bytes):
         text = source.decode('utf-8')
     else:
-        with open(source, 'r', newline='') as f:
+        with open(source, newline='') as f:
             text = f.read()
 
     # Parse CSV
diff --git a/lmdeploy/vl/utils.py b/lmdeploy/vl/utils.py
index a089d06ad7..6a68ae3c90 100644
--- a/lmdeploy/vl/utils.py
+++ b/lmdeploy/vl/utils.py
@@ -1,7 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import os
 from io import BytesIO
-from typing import Union
 
 import pybase64
 import requests
@@ -12,7 +11,7 @@
 logger = get_logger('lmdeploy')
 
 
-def encode_image_base64(image: Union[str, Image.Image]) -> str:
+def encode_image_base64(image: str | Image.Image) -> str:
     """Encode raw data to base64 format."""
     buffered = BytesIO()
     FETCH_TIMEOUT = int(os.environ.get('LMDEPLOY_FETCH_TIMEOUT', 10))
@@ -44,12 +43,12 @@ def encode_image_base64(image: Union[str, Image.Image]) -> str:
     return res
 
 
-def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
+def load_image_from_base64(image: bytes | str) -> Image.Image:
     """Load image from base64 format."""
     return Image.open(BytesIO(pybase64.b64decode(image)))
 
 
-def load_image(image_url: Union[str, Image.Image]) -> Image.Image:
+def load_image(image_url: str | Image.Image) -> Image.Image:
     """Load image from url, local path or openai GPT4V."""
     FETCH_TIMEOUT = int(os.environ.get('LMDEPLOY_FETCH_TIMEOUT', 10))
     headers = {
diff --git a/pyproject.toml b/pyproject.toml
index d16c624fbb..43b200dd4c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,3 +3,20 @@ requires = [
     "cmake_build_extension",
 ]
 build-backend = "setuptools.build_meta"
+
+[tool.ruff]
+line-length = 120
+target-version = "py310"
+extend-exclude = [
+  "third_party",
+  "src/turbomind",
+]
+
+[tool.ruff.lint]
+select = [
+  "E", "F", "I", "W",
+  "UP",
+]
+ignore = [
+  "E231", "E741"
+]
diff --git a/setup.py b/setup.py
index e23be7630b..42012da815 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@ def readme():
 def get_version():
     file_path = os.path.join(pwd, version_file)
     pattern = re.compile(r"\s*__version__\s*=\s*'([0-9A-Za-z.-]+)'")
-    with open(file_path, 'r') as f:
+    with open(file_path) as f:
         for line in f:
             m = pattern.match(line)
             if m:
@@ -64,7 +64,7 @@ def parse_requirements(fname='requirements.txt', with_version=True):
         with_version (bool, default=False): if True include version specs
 
     Returns:
-        List[str]: list of requirements items
+        list[str]: list of requirements items
 
     CommandLine:
         python -c "import setup; print(setup.parse_requirements())"
@@ -104,12 +104,11 @@ def parse_line(line):
             yield info
 
     def parse_require_file(fpath):
-        with open(fpath, 'r') as f:
+        with open(fpath) as f:
             for line in f.readlines():
                 line = line.strip()
                 if line and not line.startswith('#'):
-                    for info in parse_line(line):
-                        yield info
+                    yield from parse_line(line)
 
     def gen_packages_items():
         if os.path.exists(require_fpath):
@@ -130,7 +129,7 @@ def gen_packages_items():
     return packages
 
 
-if get_target_device() == 'cuda' and not os.getenv('DISABLE_TURBOMIND', '').lower() in ('yes', 'true', 'on', 't', '1'):
+if get_target_device() == 'cuda' and os.getenv('DISABLE_TURBOMIND', '').lower() not in ('yes', 'true', 'on', 't', '1'):
     import cmake_build_extension
 
     ext_modules = [
diff --git a/tests/pytorch/engine/test_logits_process.py b/tests/pytorch/engine/test_logits_process.py
index d635ddcf10..01c6f9777c 100644
--- a/tests/pytorch/engine/test_logits_process.py
+++ b/tests/pytorch/engine/test_logits_process.py
@@ -1,7 +1,12 @@
 # yapf: disable
 import torch
-from transformers.generation.logits_process import (MinPLogitsWarper, RepetitionPenaltyLogitsProcessor,
-                                                    TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper)
+from transformers.generation.logits_process import (
+    MinPLogitsWarper,
+    RepetitionPenaltyLogitsProcessor,
+    TemperatureLogitsWarper,
+    TopKLogitsWarper,
+    TopPLogitsWarper,
+)
 
 # yapf: enable
 
diff --git a/tests/test_lmdeploy/test_harmony_gpt_oss_parser.py b/tests/test_lmdeploy/test_harmony_gpt_oss_parser.py
index e1e5efeb71..7624ff4d17 100644
--- a/tests/test_lmdeploy/test_harmony_gpt_oss_parser.py
+++ b/tests/test_lmdeploy/test_harmony_gpt_oss_parser.py
@@ -4,7 +4,7 @@
 import sys
 import time
 import types
-from typing import Generator, List
+from collections.abc import Generator
 
 import pytest
 import shortuuid
@@ -114,11 +114,15 @@ def process(self, token):
         self.last_content_delta = chr(token)
 
 
-def _chat_completion_v1(request, token_chunks: List[List[int]]):
+def _chat_completion_v1(request, token_chunks: list[list[int]]):
     from lmdeploy.serve.openai.harmony_utils import GptOssChatParser
-    from lmdeploy.serve.openai.protocol import (ChatCompletionResponse, ChatCompletionResponseChoice,
-                                                ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse,
-                                                UsageInfo)
+    from lmdeploy.serve.openai.protocol import (
+        ChatCompletionResponse,
+        ChatCompletionResponseChoice,
+        ChatCompletionResponseStreamChoice,
+        ChatCompletionStreamResponse,
+        UsageInfo,
+    )
 
     request_id = f'chat-{shortuuid.random()}'
     created_time = int(time.time())
@@ -147,7 +151,7 @@ def completion_stream_generator() -> Generator['ChatCompletionStreamResponse', N
         return completion_stream_generator()
 
     # Non-stream path: parse all tokens at once using parse_full
-    tokens: List[int] = []
+    tokens: list[int] = []
     for c in token_chunks:
         tokens.extend(c)
     message = parser.parse_full(tokens)
@@ -160,7 +164,7 @@ def completion_stream_generator() -> Generator['ChatCompletionStreamResponse', N
                                   usage=UsageInfo())
 
 
-def _stream_parse(request, token_chunks: List[List[int]]):
+def _stream_parse(request, token_chunks: list[list[int]]):
     from lmdeploy.serve.openai.protocol import DeltaMessage
 
     content = ''
@@ -190,7 +194,7 @@ def _stream_parse(request, token_chunks: List[List[int]]):
     return content, reasoning_content, tool_calls
 
 
-def _t(s: str) -> List[int]:
+def _t(s: str) -> list[int]:
     return [ord(c) for c in s]
 
 
@@ -223,7 +227,7 @@ def _t(s: str) -> List[int]:
 @pytest.mark.parametrize(('token_chunks', 'expects'), [
     (TOKENS_SINGLE_CALL_TWO_CHUNKS, [TestExpects('get_weather', 'Paris, France')]),
 ])
-def test_parser_stream_basic(token_chunks: List[List[int]], expects: List[TestExpects]):
+def test_parser_stream_basic(token_chunks: list[list[int]], expects: list[TestExpects]):
     from lmdeploy.serve.openai.protocol import ChatCompletionRequest
 
     _install_openai_harmony_stub()
@@ -274,7 +278,7 @@ def test_parser_stream_interleaved_channels():
     (TOKENS_TWO_CALLS_SAME_FUNC, [TestExpects('get_weather', 'Tokyo'),
                                   TestExpects('get_weather', 'Kyoto')]),
 ])
-def test_parser_stream_two_calls_same_func(token_chunks: List[List[int]], expects: List[TestExpects]):
+def test_parser_stream_two_calls_same_func(token_chunks: list[list[int]], expects: list[TestExpects]):
     from lmdeploy.serve.openai.protocol import ChatCompletionRequest
 
     _install_openai_harmony_stub()
@@ -307,7 +311,7 @@ def test_open_tool_call_no_args():
     (TOKENS_TWO_CALLS_SAME_FUNC, [TestExpects('get_weather', 'Tokyo'),
                                   TestExpects('get_weather', 'Kyoto')]),
 ])
-def test_parser_nonstream(token_chunks: List[List[int]], expects: List[TestExpects]):
+def test_parser_nonstream(token_chunks: list[list[int]], expects: list[TestExpects]):
     from lmdeploy.serve.openai.protocol import ChatCompletionRequest
 
     _install_openai_harmony_stub()
diff --git a/tests/test_lmdeploy/test_lite/test_quantization/test_utils/test_cal_qparams.py b/tests/test_lmdeploy/test_lite/test_quantization/test_utils/test_cal_qparams.py
index dadc5478e0..51f912b057 100644
--- a/tests/test_lmdeploy/test_lite/test_quantization/test_utils/test_cal_qparams.py
+++ b/tests/test_lmdeploy/test_lite/test_quantization/test_utils/test_cal_qparams.py
@@ -1,9 +1,14 @@
 # yapf: disable
 import torch
 
-from lmdeploy.lite.utils import (cal_qparams_per_channel_absmax, cal_qparams_per_channel_minmax,
-                                 cal_qparams_per_group_absmax, cal_qparams_per_group_minmax,
-                                 cal_qparams_per_tensor_absmax, cal_qparams_per_tensor_minmax)
+from lmdeploy.lite.utils import (
+    cal_qparams_per_channel_absmax,
+    cal_qparams_per_channel_minmax,
+    cal_qparams_per_group_absmax,
+    cal_qparams_per_group_minmax,
+    cal_qparams_per_tensor_absmax,
+    cal_qparams_per_tensor_minmax,
+)
 
 
 # yapf: enable
diff --git a/tests/test_lmdeploy/test_messages.py b/tests/test_lmdeploy/test_messages.py
index 1948e80b9f..1fdf73402f 100644
--- a/tests/test_lmdeploy/test_messages.py
+++ b/tests/test_lmdeploy/test_messages.py
@@ -1,4 +1,3 @@
-from typing import List
 
 import pytest
 
@@ -12,7 +11,7 @@ def test_engine_generation_config():
     stop_token_ids = tokenizer.encode('<eoa>', add_bos=False)
     config.convert_stop_bad_words_to_ids(tokenizer)
     assert stop_token_ids == config.stop_token_ids
-    assert isinstance(config.stop_token_ids, List) and \
+    assert isinstance(config.stop_token_ids, list) and \
         isinstance(config.stop_token_ids[0], int)
 
 
diff --git a/tests/test_lmdeploy/test_qwen3_parser.py b/tests/test_lmdeploy/test_qwen3_parser.py
index 3a837d73a3..b3d52b47b6 100644
--- a/tests/test_lmdeploy/test_qwen3_parser.py
+++ b/tests/test_lmdeploy/test_qwen3_parser.py
@@ -1,15 +1,23 @@
 import collections
 import json
 import time
-from typing import Generator, List, Tuple, Union
+from collections.abc import Generator
 
 import pytest
 import shortuuid
 
 from lmdeploy.serve.openai.api_server import VariableInterface
-from lmdeploy.serve.openai.protocol import (ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice,
-                                            ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse,
-                                            ChatMessage, DeltaMessage, DeltaToolCall, UsageInfo)
+from lmdeploy.serve.openai.protocol import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ChatCompletionResponseChoice,
+    ChatCompletionResponseStreamChoice,
+    ChatCompletionStreamResponse,
+    ChatMessage,
+    DeltaMessage,
+    DeltaToolCall,
+    UsageInfo,
+)
 from lmdeploy.serve.openai.reasoning_parser.qwen_qwq_reasoning_parser import QwenQwQReasoningParser
 from lmdeploy.serve.openai.tool_parser.qwen3_parser import Qwen3ToolParser
 
@@ -18,10 +26,10 @@
 
 class DummyTokenizer:
 
-    def decode(self, token_ids: List[int]) -> str:
+    def decode(self, token_ids: list[int]) -> str:
         return ' '.join(map(str, token_ids))
 
-    def encode(self, text: str) -> List[int]:
+    def encode(self, text: str) -> list[int]:
         return [ord(c) for c in text]
 
 
@@ -174,7 +182,7 @@ def encode(self, text: str) -> List[int]:
 
 def _chat_completion_v1(
         request: ChatCompletionRequest,
-        text_sequence: List[str]) -> Union[ChatCompletionResponse, Generator[ChatCompletionStreamResponse, None, None]]:
+        text_sequence: list[str]) -> ChatCompletionResponse | Generator[ChatCompletionStreamResponse, None, None]:
     request_id = f'chat-{shortuuid.random()}'
     created_time = int(time.time())
     model_name = request.model
@@ -239,7 +247,7 @@ def completion_stream_generator() -> Generator[ChatCompletionStreamResponse, Non
     if request.tool_choice != 'none' and VariableInterface.tool_parser is not None:
         tool_call_info = VariableInterface.tool_parser.extract_tool_calls(text, request=request)
         text, tool_calls = tool_call_info.content, tool_call_info.tool_calls
-        if isinstance(tool_calls, List) and len(tool_calls):
+        if isinstance(tool_calls, list) and len(tool_calls):
             if finish_reason == 'stop':
                 finish_reason = 'tool_calls'
 
@@ -263,7 +271,7 @@ def completion_stream_generator() -> Generator[ChatCompletionStreamResponse, Non
     )
 
 
-def _stream_parse(request: ChatCompletionRequest, text_sequence: List[str]) -> Tuple[str, str, List[DeltaToolCall]]:
+def _stream_parse(request: ChatCompletionRequest, text_sequence: list[str]) -> tuple[str, str, list[DeltaToolCall]]:
     # Call parser.extract_tool_calls_streaming with delta_text specified in `DELTA_TEXT_SEQUENCE`.
     # `current_text` and `previous_text` init values and update logic
     # can be found in lmdeploy/serve/openai/api_server.py:455-523.
@@ -297,7 +305,7 @@ def _stream_parse(request: ChatCompletionRequest, text_sequence: List[str]) -> T
     (DELTA_TEXT_SEQUENCE_MULTIPLE_CALLS, [TestExpects('get_weather', '北京'),
                                           TestExpects('get_weather', '上海')]),
 ])
-def test_parser_stream(text_sequence: List[str], expects: List[TestExpects]):
+def test_parser_stream(text_sequence: list[str], expects: list[TestExpects]):
     tokenizer = DummyTokenizer()
     VariableInterface.tool_parser = Qwen3ToolParser(tokenizer=tokenizer)
     VariableInterface.reasoning_parser = QwenQwQReasoningParser(tokenizer=tokenizer)
@@ -317,7 +325,7 @@ def test_parser_stream(text_sequence: List[str], expects: List[TestExpects]):
     (DELTA_TEXT_SEQUENCE_MULTIPLE_CALLS, [TestExpects('get_weather', '北京'),
                                           TestExpects('get_weather', '上海')]),
 ])
-def test_parser_nonstream(text_sequence: List[str], expects: List[TestExpects]):
+def test_parser_nonstream(text_sequence: list[str], expects: list[TestExpects]):
     tokenizer = DummyTokenizer()
     VariableInterface.tool_parser = Qwen3ToolParser(tokenizer=tokenizer)
     VariableInterface.reasoning_parser = QwenQwQReasoningParser(tokenizer=tokenizer)
diff --git a/tests/test_lmdeploy/test_qwen3coder_parser.py b/tests/test_lmdeploy/test_qwen3coder_parser.py
index b84735a40c..b2b08d4cbb 100644
--- a/tests/test_lmdeploy/test_qwen3coder_parser.py
+++ b/tests/test_lmdeploy/test_qwen3coder_parser.py
@@ -1,15 +1,23 @@
 import collections
 import json
 import time
-from typing import Generator, List, Tuple, Union
+from collections.abc import Generator
 
 import pytest
 import shortuuid
 
 from lmdeploy.serve.openai.api_server import VariableInterface
-from lmdeploy.serve.openai.protocol import (ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice,
-                                            ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse,
-                                            ChatMessage, DeltaMessage, DeltaToolCall, UsageInfo)
+from lmdeploy.serve.openai.protocol import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ChatCompletionResponseChoice,
+    ChatCompletionResponseStreamChoice,
+    ChatCompletionStreamResponse,
+    ChatMessage,
+    DeltaMessage,
+    DeltaToolCall,
+    UsageInfo,
+)
 from lmdeploy.serve.openai.tool_parser.qwen3coder_parser import Qwen3CoderToolParser
 
 TestExpects = collections.namedtuple('TestExpects', 'func_name kwargs')
@@ -17,10 +25,10 @@
 
 class DummyTokenizer:
 
-    def decode(self, token_ids: List[int]) -> str:
+    def decode(self, token_ids: list[int]) -> str:
         return ' '.join(map(str, token_ids))
 
-    def encode(self, text: str) -> List[int]:
+    def encode(self, text: str) -> list[int]:
         return [ord(c) for c in text]
 
 
@@ -55,7 +63,7 @@ def encode(self, text: str) -> List[int]:
 
 def _chat_completion_v1(
         request: ChatCompletionRequest,
-        text_sequence: List[str]) -> Union[ChatCompletionResponse, Generator[ChatCompletionStreamResponse, None, None]]:
+        text_sequence: list[str]) -> ChatCompletionResponse | Generator[ChatCompletionStreamResponse, None, None]:
     request_id = f'chat-{shortuuid.random()}'
     created_time = int(time.time())
     model_name = request.model
@@ -121,7 +129,7 @@ def completion_stream_generator() -> Generator[ChatCompletionStreamResponse, Non
     if request.tool_choice != 'none' and has_tool:
         tool_call_info = VariableInterface.tool_parser.extract_tool_calls(text, request=request)
         text, tool_calls = tool_call_info.content, tool_call_info.tool_calls
-        if isinstance(tool_calls, List) and len(tool_calls):
+        if isinstance(tool_calls, list) and len(tool_calls):
             if finish_reason == 'stop':
                 finish_reason = 'tool_calls'
 
@@ -146,7 +154,7 @@ def completion_stream_generator() -> Generator[ChatCompletionStreamResponse, Non
     )
 
 
-def _stream_parse(request: ChatCompletionRequest, text_sequence: List[str]) -> Tuple[str, str, List[DeltaToolCall]]:
+def _stream_parse(request: ChatCompletionRequest, text_sequence: list[str]) -> tuple[str, str, list[DeltaToolCall]]:
     content = ''
     reasoning_content = ''
     tool_calls = {}
@@ -185,7 +193,7 @@ def _stream_parse(request: ChatCompletionRequest, text_sequence: List[str]) -> T
         TestExpects('get_weather', {'location': '上海'})
     ]),
 ])
-def test_parser_stream(text_sequence: List[str], expects: List[TestExpects]):
+def test_parser_stream(text_sequence: list[str], expects: list[TestExpects]):
     tokenizer = DummyTokenizer()
     VariableInterface.tool_parser = Qwen3CoderToolParser(tokenizer=tokenizer)
     VariableInterface.reasoning_parser = None
@@ -212,7 +220,7 @@ def test_parser_stream(text_sequence: List[str], expects: List[TestExpects]):
         TestExpects('get_weather', {'location': '上海'})
     ]),
 ])
-def test_parser_nonstream(text_sequence: List[str], expects: List[TestExpects]):
+def test_parser_nonstream(text_sequence: list[str], expects: list[TestExpects]):
     tokenizer = DummyTokenizer()
     VariableInterface.tool_parser = Qwen3CoderToolParser(tokenizer=tokenizer)
     VariableInterface.reasoning_parser = None
diff --git a/tests/test_lmdeploy/test_turbomind/test_converter.py b/tests/test_lmdeploy/test_turbomind/test_converter.py
index 4cbb5dc1e1..e91e0fefaa 100644
--- a/tests/test_lmdeploy/test_turbomind/test_converter.py
+++ b/tests/test_lmdeploy/test_turbomind/test_converter.py
@@ -1,8 +1,10 @@
 # yapf: disable
 from lmdeploy import TurbomindEngineConfig
 from lmdeploy.turbomind import update_parallel_config
-from lmdeploy.turbomind.deploy.converter import (get_input_model_registered_name,
-                                                 get_output_model_registered_name_and_config)
+from lmdeploy.turbomind.deploy.converter import (
+    get_input_model_registered_name,
+    get_output_model_registered_name_and_config,
+)
 from lmdeploy.turbomind.deploy.source_model.base import INPUT_MODELS
 
 # yapf: enable
diff --git a/tests/test_lmdeploy/test_vl/test_nonhf_chat_template.py b/tests/test_lmdeploy/test_vl/test_nonhf_chat_template.py
index 7dad2f4e09..1b575e574d 100644
--- a/tests/test_lmdeploy/test_vl/test_nonhf_chat_template.py
+++ b/tests/test_lmdeploy/test_vl/test_nonhf_chat_template.py
@@ -93,7 +93,8 @@ def mock_messages(self):
     @pytest.fixture(scope='module')
     def mock_IMAGE_TOKEN_messages(self):
         return [
-            dict(role='system', content='你是书生·万象，英文名是InternVL，是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。'),
+            dict(role='system', content='你是书生·万象，英文名是InternVL，是由上海人工智能实验室、'
+                 '清华大学及多家合作单位联合开发的多模态大语言模型。'),
             dict(role='user',
                  content=[
                      dict(type='text', text='<IMAGE_TOKEN>\nDescribe the following images in detail'),