# TryFinetuningModel.py
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from peft import PeftModel, PeftConfig
# import torch
from transformers import BitsAndBytesConfig
#
# # Load model directly
# from transformers import AutoTokenizer, AutoModelForCausalLM
# #
# # tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
# # model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
# # Set paths
# base_model_path = "./Meta_Llama_3_8B2"  # e.g. "./llama-3-12b"
# finetuned_model_path = "./llama3_lora_finetuned"
#
# # Load the base model
# print("Loading base model...")
# bnb_config = BitsAndBytesConfig(load_in_8bit=True)
# model = AutoModelForCausalLM.from_pretrained(
# base_model_path,
# # device_map={"": "cpu"},
# device_map="auto",
# offload_folder="offload",
# quantization_config=bnb_config,
# # no_split_module_classes=["LlamaDecoderLayer"]
# )
# # Load the fine-tuned LoRA weights
# print("Loading fine-tuned LoRA weights...")
# # peft_config = PeftConfig.from_pretrained(finetuned_model_path)
# # model = PeftModel.from_pretrained(model, finetuned_model_path)
#
# # Load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained(base_model_path)
#
# # Generation function for testing
# def generate_response(prompt, model, tokenizer, max_new_tokens=50):
# inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# with torch.no_grad():
# outputs = model.generate(
# **inputs,
# max_new_tokens=max_new_tokens,
# do_sample=True,
# temperature=0.7,
# top_k=50,
# top_p=0.9,
# pad_token_id=tokenizer.pad_token_id
# )
# return tokenizer.decode(outputs[0], skip_special_tokens=True)
#
# # Example test
# # prompt = "who are you?"
# # # response = generate_response(prompt, model, tokenizer)
# # print("Prompt:", prompt)
# # print("Response:", response)
# input_text='who are you?'
# inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
# outputs = model.generate(**inputs, max_new_tokens=50)
#
#
# print("Generated text:")
# print(tokenizer.decode(outputs[0], skip_special_tokens=True))
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from peft import PeftModel
# Set the base model path
base_model_path = "./Meta_Llama_3_8B2"  # change to your actual model path
# finetuned_model_path = "./llama3_lora_finetuned"
# finetuned_model_path = "./lora_output3"
finetuned_model_path = "./lora_function_calling"
# Load the base model
print("Loading base model...")
# model = AutoModelForCausalLM.from_pretrained(
#     base_model_path,
#     device_map="auto"  # assign devices automatically
# )
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    # device_map={"": "cpu"},
    device_map="auto",
    offload_folder="offload",
    quantization_config=bnb_config,
    # no_split_module_classes=["LlamaDecoderLayer"]
)
print("Loading fine-tuned LoRA weights...")
model = PeftModel.from_pretrained(model, finetuned_model_path, is_trainable=False)
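# Optional step (a sketch, not part of the original script): the adapter can be merged into
# the base weights with PEFT's merge_and_unload() for slightly faster inference; merging into
# an 8-bit quantized model may not be supported by every peft version.
# model = model.merge_and_unload()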
# Load the tokenizer saved with the LoRA adapter
tokenizer = AutoTokenizer.from_pretrained(finetuned_model_path)
# Alternative: load the tokenizer from the base model
# tokenizer = AutoTokenizer.from_pretrained(base_model_path)
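# Defensive fallback (added sketch, not in the original script): Llama 3 tokenizers ship
# without a pad token, so fall back to EOS if the adapter directory did not save one.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token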
print(model.peft_config["default"])  # show the loaded LoRA adapter configuration
# Define a test-generation function
def generate_response(prompt, model, tokenizer, max_new_tokens=100):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)  # move inputs to the model's device
    with torch.no_grad():  # disable gradient tracking to save memory
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,        # maximum number of new tokens to generate
            do_sample=True,                       # enable random sampling
            temperature=0.5,                      # controls generation randomness
            top_k=50,                             # restrict sampling to the k most likely tokens
            top_p=0.9,                            # nucleus (cumulative-probability) sampling
            pad_token_id=tokenizer.pad_token_id   # padding token ID
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)  # decode the generated text
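# A possible refinement (a sketch, not part of the original script): for instruct-tuned
# Llama 3 checkpoints, wrapping the prompt with the tokenizer's chat template usually yields
# better-formatted responses; whether it helps here depends on how the LoRA adapter was trained.
def generate_chat_response(prompt, model, tokenizer, max_new_tokens=100):
    messages = [{"role": "user", "content": prompt}]
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.5,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
        )
    # Decode only the newly generated tokens, not the templated prompt
    return tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)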
# Test input
prompt = "What is the phosphorus content in Tokyo, Japan?"
response = generate_response(prompt, model, tokenizer)
print("Prompt:", prompt)
print("Response:", response)