In text classification and text-similarity matching, the pretrained language model BERT is often used to obtain sentence representation vectors. The following shows how to do this in a PyTorch environment:
- We use the BERT implementation from Hugging Face's transformers, so that dependency must be installed first (pip install transformers).
- The complete implementation is as follows:

```python
import torch
import joblib
import numpy as np
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from sklearn.datasets import fetch_20newsgroups
from transformers import BertTokenizer, BertModel


# Dataset wrapping the token ids, attention masks and segment ids
class NewDataset(Dataset):
    def __init__(self, bert_train, mask_train=None, seg_ids_train=None):
        self.bert_train = bert_train
        self.mask_train = mask_train
        self.seg_ids_train = seg_ids_train

    def __getitem__(self, i):
        return torch.LongTensor(self.bert_train[i]), \
               torch.LongTensor(self.mask_train[i]), \
               torch.LongTensor(self.seg_ids_train[i])

    def __len__(self):
        return len(self.bert_train)


newsgroups_train = fetch_20newsgroups(subset='train').data
newsgroups_test = fetch_20newsgroups(subset='test').data
train_label = fetch_20newsgroups(subset='train').target
test_label = fetch_20newsgroups(subset='test').target

L = 512                      # maximum sequence length for BERT
N = len(newsgroups_train)
bert_train, mask_train, seg_ids_train = [], [], []
all_sents = newsgroups_train + newsgroups_test
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

for sent in tqdm(all_sents):
    tokens = tokenizer.tokenize(sent)
    tokens = ['[CLS]'] + tokens + ['[SEP]']
    # truncate to L tokens and pad the rest with [PAD]
    padded_tokens = tokens[:L] + ['[PAD]' for _ in range(L - len(tokens))]
    attn_mask = [1 if token != '[PAD]' else 0 for token in padded_tokens]
    sent_ids = tokenizer.convert_tokens_to_ids(padded_tokens)
    seg_ids = [0 for _ in range(len(padded_tokens))]
    bert_train.append(sent_ids)
    mask_train.append(attn_mask)
    seg_ids_train.append(seg_ids)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
data = NewDataset(bert_train, mask_train=mask_train, seg_ids_train=seg_ids_train)
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

reps = []
batchsize = 5
for batch in tqdm(DataLoader(data, shuffle=False, batch_size=batchsize)):
    input_ids, attn_masks, seg_ids = [t.to(device) for t in batch]
    # older transformers versions returned a tuple: hidden_reps, cls_head = bert_model(...)
    output = bert_model(input_ids, attention_mask=attn_masks, token_type_ids=seg_ids)
    # pooler_output is the [CLS] vector after the pooling layer, shape (batchsize, hidden_size)
    reps += list(output.pooler_output.detach().cpu().numpy())

reps_train = reps[:N]
reps_test = reps[N:]
newsgroups_data = {'train_vecs': reps_train, 'train_label': train_label,
                   'test_vecs': reps_test, 'test_label': test_label}
joblib.dump(newsgroups_data, "newsgroups_data.pkl")
```
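The dumped vectors can then be fed to any downstream classifier for the text-classification use case mentioned above. Below is a minimal sketch of that step; the choice of scikit-learn's LogisticRegression is my own and not part of the original post, and it assumes `newsgroups_data.pkl` was produced by the script above:

```python
import joblib
import numpy as np
from sklearn.linear_model import LogisticRegression

# load the vectors and labels dumped by the script above
data = joblib.load("newsgroups_data.pkl")
X_train = np.array(data['train_vecs'])
y_train = data['train_label']
X_test = np.array(data['test_vecs'])
y_test = data['test_label']

# simple linear classifier on top of the frozen BERT sentence vectors
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
print("test accuracy:", clf.score(X_test, y_test))
```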
For reference, the following variant uses a prompt to obtain the sentence representation: a prompt containing [MASK] is appended to each sentence, and the feature vector at the [MASK] position is taken as the sentence feature:

```python
import torch
import joblib
import numpy as np
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from sklearn.datasets import fetch_20newsgroups
from transformers import BertTokenizer, BertModel


class NewDataset(Dataset):
    def __init__(self, bert_train, mask_train=None, seg_ids_train=None):
        self.bert_train = bert_train
        self.mask_train = mask_train
        self.seg_ids_train = seg_ids_train

    def __getitem__(self, i):
        return torch.LongTensor(self.bert_train[i]), \
               torch.LongTensor(self.mask_train[i]), \
               torch.LongTensor(self.seg_ids_train[i])

    def __len__(self):
        return len(self.bert_train)


newsgroups_train = fetch_20newsgroups(subset='train').data
newsgroups_test = fetch_20newsgroups(subset='test').data
train_label = fetch_20newsgroups(subset='train').target
test_label = fetch_20newsgroups(subset='test').target

L = 512
N = len(newsgroups_train)
bert_train, mask_train, seg_ids_train = [], [], []
all_sents = newsgroups_train + newsgroups_test
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# prompt appended to every sentence; the [MASK] position will serve as the sentence feature
prompt = "The sentence's topic is [MASK]."
prompt_tokens = tokenizer.tokenize(prompt)
LP = len(prompt_tokens)

for sent in tqdm(all_sents):
    tokens = tokenizer.tokenize(sent)
    tokens = ['[CLS]'] + tokens + ['[SEP]']
    # truncate the sentence to leave room for the prompt, then pad to length L
    padded_tokens = tokens[:L - LP] + prompt_tokens + ['[PAD]' for _ in range(L - LP - len(tokens))]
    attn_mask = [1 if token != '[PAD]' else 0 for token in padded_tokens]
    sent_ids = tokenizer.convert_tokens_to_ids(padded_tokens)
    seg_ids = [0 for _ in range(len(padded_tokens))]
    bert_train.append(sent_ids)
    mask_train.append(attn_mask)
    seg_ids_train.append(seg_ids)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
data = NewDataset(bert_train, mask_train=mask_train, seg_ids_train=seg_ids_train)
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

reps = []
batchsize = 5
for batch in tqdm(DataLoader(data, shuffle=False, batch_size=batchsize)):
    input_ids, attn_masks, seg_ids = [t.to(device) for t in batch]
    output = bert_model(input_ids, attention_mask=attn_masks, token_type_ids=seg_ids)
    # instead of pooler_output, take the hidden state at the [MASK] position
    # (103 is the id of [MASK] in bert-base-uncased); resulting shape is (batchsize, hidden_size)
    mask_reps = output.last_hidden_state[input_ids == 103]
    reps += list(mask_reps.detach().cpu().numpy())

reps_train = reps[:N]
reps_test = reps[N:]
newsgroups_data = {'train_vecs': reps_train, 'train_label': train_label,
                   'test_vecs': reps_test, 'test_label': test_label}
joblib.dump(newsgroups_data, "newsgroups_data.pkl")
```
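For the text-similarity use case mentioned at the start, the extracted vectors can be compared directly. A minimal sketch, assuming `reps_train` from the script above is still in memory (the `cosine_sim` helper is mine, not from the post):

```python
import numpy as np

def cosine_sim(u, v):
    # cosine similarity between two 1-D sentence vectors
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

# e.g. compare the first two training documents using the vectors extracted above
sim = cosine_sim(np.array(reps_train[0]), np.array(reps_train[1]))
print("cosine similarity:", sim)
```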