BERT Sentiment Analysis Demo

Dataset

The IMDB movie-review dataset (25,000 training and 25,000 test reviews, each labeled pos or neg).

Code with Commentary

import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Use the GPU if CUDA is available; otherwise fall back to the CPU.

class CustomClassifier(nn.Module):
    def __init__(self, bert):
        super(CustomClassifier, self).__init__()
        self.bert = bert
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 2)
        self.dropout = nn.Dropout(0.1)  # dropout to reduce overfitting
        self.relu = nn.ReLU()  # activation function; helps mitigate vanishing gradients
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_ids, attention_mask, labels=None):
        # pooled [CLS] hidden state from BERT; pass it through the classifier head
        _, cls_hidden_state = self.bert(input_ids, attention_mask=attention_mask, return_dict=False)
        x = self.fc1(cls_hidden_state)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        if labels is not None:
            loss_fn = nn.NLLLoss()  # NLLLoss is meant to be paired with LogSoftmax
            loss = loss_fn(x, labels)
            return loss, x
        return x


    def save(self, save_directory):
        os.makedirs(save_directory, exist_ok=True)
        torch.save(self.state_dict(), os.path.join(save_directory, "pytorch_model.bin"))
        self.bert.config.to_json_file(os.path.join(save_directory, "config.json"))

    @classmethod  # a classmethod receives the class as cls, so we can construct an instance with it
    def from_pretrained(cls, save_directory):
        bert_model = AutoModel.from_pretrained(save_directory)
        model = cls(bert_model)
        model.load_state_dict(torch.load(os.path.join(save_directory, 'pytorch_model.bin')))
        return model

This is the core class; its hyperparameters (layer sizes, dropout rate, etc.) can be tuned.
The demo uses bert-base; the pretrained model and tokenizer are downloaded from Hugging Face.
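
As a quick sanity check on the loss setup (a minimal, standalone sketch): NLLLoss applied to LogSoftmax output is numerically identical to CrossEntropyLoss applied to the raw logits, which is why the two are used together above.

import torch
import torch.nn as nn

logits = torch.randn(4, 2)           # fake batch: 4 examples, 2 classes
labels = torch.tensor([0, 1, 1, 0])

# LogSoftmax + NLLLoss, as in CustomClassifier
loss_a = nn.NLLLoss()(nn.LogSoftmax(dim=1)(logits), labels)

# the equivalent one-step loss on raw logits
loss_b = nn.CrossEntropyLoss()(logits, labels)

print(torch.allclose(loss_a, loss_b))  # True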

You mainly need to download the tokenizer_config.json, vocab.txt, config.json, and pytorch_model.bin files.

BERT Base (the sketch after this list shows how to confirm these values from the downloaded config):

  • Transformer (hidden) layers: 12
  • Hidden state dimension: 768
  • Self-attention heads: 12
  • Total parameters: 110M
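
To verify these numbers against the files you downloaded, you can read them straight from the config (a minimal sketch; it assumes the model files sit in ./bert-base-uncased, as in the code below):

from transformers import AutoConfig

config = AutoConfig.from_pretrained("./bert-base-uncased")
print(config.num_hidden_layers)    # 12 Transformer layers
print(config.hidden_size)          # hidden state dimension 768
print(config.num_attention_heads)  # 12 self-attention heads
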
def load_local_dataset(data_dir):
    data = {'train': [], 'test': []}
    labels = {'neg': 0, 'pos': 1}
    
    for split in ['train', 'test']:
        for label in ['neg', 'pos']:
            folder = os.path.join(data_dir, split, label)
            for filename in tqdm(os.listdir(folder), desc=f"Loading {split} {label} data"):
                with open(os.path.join(folder, filename), 'r', encoding='utf-8') as f:
                    data[split].append({'text': f.read(), 'label': labels[label]})
    
    train_dataset = Dataset.from_pandas(pd.DataFrame(data['train']))
    test_dataset = Dataset.from_pandas(pd.DataFrame(data['test']))
    return DatasetDict({'train': train_dataset, 'test': test_dataset})

Load the raw training and test data from disk and convert them into a DatasetDict so they can be fed to the tokenizer and Trainer.
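
load_local_dataset expects the standard IMDB directory layout, <data_dir>/{train,test}/{neg,pos}/*.txt. Here is a minimal sketch that builds a tiny fake copy of that layout and runs the loader over it (the ./tiny_imdb path, file names, and texts are made up for illustration):

import os

tiny_dir = "./tiny_imdb"  # hypothetical scratch directory
samples = {"pos": "A wonderful film.", "neg": "A terrible film."}
for split in ["train", "test"]:
    for label, text in samples.items():
        folder = os.path.join(tiny_dir, split, label)
        os.makedirs(folder, exist_ok=True)
        with open(os.path.join(folder, "0.txt"), "w", encoding="utf-8") as f:
            f.write(text)

tiny = load_local_dataset(tiny_dir)
print(tiny)  # DatasetDict with 'train' and 'test' splits, columns 'text' and 'label'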

data_dir = './imdb/imdb'  # replace with your data path
dataset = load_local_dataset(data_dir)
model_name = "./bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)
# load the tokenizer and pretrained model
# build the custom classification model
model = CustomClassifier(bert_model).to(device)
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=512)

encoded_dataset = dataset.map(preprocess_function, batched=True, desc="Tokenizing")

Preprocess the data: tokenize each review, truncating and padding to at most 512 tokens.
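
To see what the tokenizer actually produces, you can inspect a single example (a minimal sketch reusing the tokenizer loaded above; the sentence is made up):

enc = tokenizer("This movie was great!", truncation=True, max_length=512)
print(enc["input_ids"])       # token ids, wrapped in [CLS] ... [SEP]
print(enc["attention_mask"])  # 1 for real tokens, 0 for padding
print(tokenizer.convert_ids_to_tokens(enc["input_ids"]))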

trainingargs = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch"
)

Define the training arguments. On a single GPU, 25,000 training examples at batch size 8 give about 3,125 steps per epoch, so warmup_steps=500 covers roughly the first sixth of the first epoch.

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='binary')
    acc = accuracy_score(p.label_ids, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

Define the evaluation function, which returns accuracy, precision, recall, and the F1 score.
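
A quick worked example of compute_metrics on fabricated predictions (a minimal sketch; EvalPrediction is the container the Trainer passes in):

import numpy as np
from transformers import EvalPrediction

# 4 fake examples: rows are log-probabilities over 2 classes; argmax picks the class
fake_preds = np.array([[-0.1, -2.3],   # -> 0
                       [-2.3, -0.1],   # -> 1
                       [-0.1, -2.3],   # -> 0
                       [-2.3, -0.1]])  # -> 1
fake_labels = np.array([0, 1, 1, 1])

print(compute_metrics(EvalPrediction(predictions=fake_preds, label_ids=fake_labels)))
# {'accuracy': 0.75, 'precision': 1.0, 'recall': 0.666..., 'f1': 0.8}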

trainer = Trainer(
    model=model,
    args=trainingargs,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()
model.save('./saved_model')
tokenizer.save_pretrained('./saved_model')

Train and evaluate the model, then save it together with the tokenizer.

model = CustomClassifier.from_pretrained('./saved_model')
model.to(device)
model.eval()  # disable dropout for inference

# load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('./saved_model')

# define the prediction function
def predict(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.exp(outputs)  # forward returns log-probabilities; exp recovers probabilities
    label = torch.argmax(probs, dim=1).item()
    return 'pos' if label == 1 else 'neg'

# run a prediction
sample_text = "I failed the math exam this time."
prediction = predict(sample_text)
print(f"Prediction: {prediction}")

Prediction results

Training takes roughly 30-50 minutes.
