LoRA Fine-Tuning Qwen2-0.5B
We train on the Fudan news classification dataset; download the dataset and the base model from ModelScope:
git clone https://www.modelscope.cn/datasets/swift/zh_cls_fudan-news.git
git clone https://www.modelscope.cn/qwen/Qwen2-0.5B.git
Install the required libraries (transformers and torch are assumed to be preinstalled, e.g. in a Colab environment; the optional QLoRA section at the end additionally needs bitsandbytes):
pip install datasets
pip install peft
import json
import os

import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
# Root directory for weights and data
BASE_DIR = '/content'
# Device name
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Dataset conversion function, used for both the training and test sets
def dataset_jsonl_transfer(origin_path, new_path):
    """
    Convert the raw dataset into the JSONL format required for fine-tuning.
    """
    messages = []
    # Read the original JSONL file
    with open(origin_path, "r", encoding="utf-8") as file:
        # (Subsampling) keep one out of every 20 lines
        i = 0
        for line in file:
            i += 1
            if i % 20 != 0:
                continue
            # Parse each raw line (every line is a JSON object)
            data = json.loads(line)
            text = data["text"]
            category = data["category"]
            output = data["output"]
            message = {
                "input": f"文本:{text},分类选项列表:{category}",
                "output": output,
            }
            messages.append(message)
    # Write the converted JSONL file, one JSON object per line
    with open(new_path, "w", encoding="utf-8") as file:
        for message in messages:
            file.write(json.dumps(message, ensure_ascii=False) + "\n")
# Pre-process each row before it is used for training
def process_func(example):
    """
    Tokenize one example and build input_ids, attention_mask and labels.
    """
    MAX_LENGTH = 384
    input_ids, attention_mask, labels = [], [], []
    instruction = tokenizer(f"<|im_start|>system\n你是一个文本分类领域的专家,你会接收到一段文本和几个潜在的分类选项列表,请输出文本内容的正确分类<|im_end|>\n<|im_start|>user\n{example['input']}<|im_end|>\n<|im_start|>assistant\n", add_special_tokens=False)  # add_special_tokens=False: do not prepend special tokens
    response = tokenizer(f"{example['output']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]  # the trailing end-of-text token should also be attended to, so append a 1
    # Prompt tokens are masked with -100 so that the loss is computed only on the response
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
    if len(input_ids) > MAX_LENGTH:  # truncate
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }
# Load the pretrained model and tokenizer
model_dir = os.path.join(BASE_DIR, 'Qwen2-0.5B')
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map=device, torch_dtype=torch.bfloat16)
model.enable_input_require_grads()  # required when gradient checkpointing is enabled
# Load and convert the train and test sets
train_dataset_path = os.path.join(BASE_DIR, 'zh_cls_fudan-news', 'train.jsonl')
test_dataset_path = os.path.join(BASE_DIR, 'zh_cls_fudan-news', 'test.jsonl')
train_jsonl_new_path = os.path.join(BASE_DIR, 'train.jsonl')
test_jsonl_new_path = os.path.join(BASE_DIR, 'test.jsonl')
if not os.path.exists(train_jsonl_new_path):
dataset_jsonl_transfer(train_dataset_path, train_jsonl_new_path)
if not os.path.exists(test_jsonl_new_path):
dataset_jsonl_transfer(test_dataset_path, test_jsonl_new_path)
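Before loading the converted file with pandas, it can help to peek at the first converted line; a minimal sanity check (the expected shape, with content abbreviated, is shown in the comment):
with open(train_jsonl_new_path, "r", encoding="utf-8") as f:
    print(f.readline())
# Expected shape (abbreviated):
# {"input": "文本:...,分类选项列表:[...]", "output": "..."}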
# Build the fine-tuning dataset
train_df = pd.read_json(train_jsonl_new_path, lines=True)
train_ds = Dataset.from_pandas(train_df)
train_dataset = train_ds.map(process_func, remove_columns=train_ds.column_names)
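To verify the preprocessing, one processed example can be decoded back to text; a minimal check, assuming the mapped dataset from above:
print(tokenizer.decode(train_dataset[0]["input_ids"]))
# Only the non-masked label tokens should correspond to the expected output
print(tokenizer.decode([t for t in train_dataset[0]["labels"] if t != -100]))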
# Create the LoRA configuration
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False,  # training mode
    r=8,  # LoRA rank
    lora_alpha=32,  # LoRA alpha, see the LoRA paper for details
    lora_dropout=0.1,  # dropout rate
)
# Apply LoRA to the model
model = get_peft_model(model, config)
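At this point you can confirm that only the adapter weights are trainable; print_trainable_parameters is part of the PEFT API:
model.print_trainable_parameters()
# Prints something like: trainable params: ... || all params: ... || trainable%: ...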
# Create the fine-tuning arguments
args = TrainingArguments(
output_dir=os.path.join(BASE_DIR, 'output', 'Qwen2-0.5B'),
per_device_train_batch_size=4,
gradient_accumulation_steps=4,
logging_steps=10,
num_train_epochs=2,
save_steps=100,
learning_rate=1e-4,
save_on_each_node=True,
gradient_checkpointing=True,
report_to="none",
)
trainer = Trainer(
model=model,
args=args,
train_dataset=train_dataset,
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)
)
# Start fine-tuning
trainer.train()
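The evaluation loop below calls a predict helper that is not defined in this excerpt; a minimal sketch of what it presumably looks like, using the tokenizer's chat template and model.generate (the generation parameters here are assumptions):
def predict(messages, model, tokenizer):
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512)
    # Strip the prompt tokens, keeping only the newly generated ones
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]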
# Model evaluation: take the first 3 rows of the test set
test_df = pd.read_json(test_jsonl_new_path, lines=True)[:3]
test_text_list = []
for index, row in test_df.iterrows():
instruction = '你是一个文本分类领域的专家,你会接收到一段文本和几个潜在的分类选项列表,请输出文本内容的正确分类'
input_value = row['input']
messages = [
{"role": "system", "content": f"{instruction}"},
{"role": "user", "content": f"{input_value}"}
]
response = predict(messages, model, tokenizer)
messages.append({"role": "assistant", "content": f"{response}"})
result_text = f"{messages[0]}\n\n{messages[1]}\n\n{messages[2]}"
test_text_list.append(result_text)
Model inference (via the transformers pipeline API; note the extra import):
from transformers import pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
instruction = '你是一个文本分类领域的专家,你会接收到一段文本和几个潜在的分类选项列表,请输出文本内容的正确分类'
messages = [
    {"role": "system", "content": f"{instruction}"},
    {"role": "user", "content": f"自动化学报ACTA AUTOMATICA SINICA1997年 第23卷 第6期 Vol.23 No.6 1997两步H∞辨识算法的一个近似最优的误差上界王书宁摘要 "}
]
# The chat pipeline returns the full conversation; the last message is the assistant's reply
response_message = pipe(messages, max_new_tokens=512)[0]["generated_text"][-1]
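The predicted category can then be read off the returned message dict:
print(response_message["content"])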
Saving the model:
from transformers import AutoModelForCausalLM
from peft import PeftModel
# Load the pretrained base model
_model = AutoModelForCausalLM.from_pretrained(model_dir)
peft_model = PeftModel.from_pretrained(model=_model, model_id=os.path.join(BASE_DIR, 'output', 'Qwen2-0.5B', 'checkpoint-100'))
# Merge the LoRA weights into the base model
peft_model = peft_model.merge_and_unload()
# Save
peft_model.save_pretrained("trained_model")
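If you plan to load the merged model directly later, it is worth saving the tokenizer alongside it as well:
tokenizer.save_pretrained("trained_model")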
Optional QLoRA training:
from transformers import BitsAndBytesConfig
# Quantization parameters
bnb_config = BitsAndBytesConfig(load_in_4bit=True,
                                bnb_4bit_use_double_quant=True,  # double quantization: also quantize the quantization constants
                                bnb_4bit_quant_type='nf4',
                                bnb_4bit_compute_dtype=torch.float32)
model = AutoModelForCausalLM.from_pretrained(model_dir, low_cpu_mem_usage=True, quantization_config=bnb_config)
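Before applying get_peft_model to the 4-bit quantized model, PEFT recommends preparing it for k-bit training; prepare_model_for_kbit_training is part of the PEFT API:
from peft import prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, config)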