# 2022-03-25 08:00:01

import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split


# Load the SST-2 training split. The file is tab-separated and has no header
# row, so pandas names the columns 0 and 1; the rest of this script tokenizes
# column 0 and uses column 1 as the classification label.
df = pd.read_csv(
    'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv',
    delimiter='\t',
    header=None,
)

# Model class, tokenizer class and checkpoint name are kept together so the
# backbone can be swapped by changing a single assignment.
model_class, tokenizer_class, pretrained_weights = (
    ppb.DistilBertModel,
    ppb.DistilBertTokenizer,
    'distilbert-base-uncased',
)

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

# Turn every sentence in column 0 into a list of token ids.
# add_special_tokens=True makes the tokenizer wrap each sequence with its
# special tokens, so the classification token sits at position 0.
tokenized = df[0].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

# Right-pad every sequence to the length of the longest one so the batch can
# be stacked into a single rectangular array.
pad_id = tokenizer.pad_token_id
max_len = max(len(ids) for ids in tokenized.values)
padded = np.array(
    [ids + [pad_id] * (max_len - len(ids)) for ids in tokenized.values]
)

# 1 for real tokens, 0 for padding. Built from the sequence lengths rather
# than from `padded != pad_id` so it stays correct regardless of which ids
# occur inside the sentences themselves.
attention_mask = np.array(
    [[1] * len(ids) + [0] * (max_len - len(ids)) for ids in tokenized.values]
)

# Force int64: the embedding lookup expects long indices, and NumPy's default
# integer width is platform dependent.
input_ids = torch.tensor(padded, dtype=torch.long)
attention_mask = torch.tensor(attention_mask, dtype=torch.long)

# NOTE(review): this runs the whole dataset through the model in one forward
# pass, which may need a lot of memory -- consider slicing `df` or batching.
with torch.no_grad():  # inference only, no gradients needed
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

# Slice the output for the first position for all the sequences, take all
# hidden unit outputs.
features = last_hidden_states[0][:, 0, :].numpy()
# Column 1 of the dataset holds the label for each sentence.
labels = df[1]

# Hold out part of the data for evaluation. No random_state is passed, so the
# split (and therefore the reported score) varies from run to run.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels
)

# Fit a logistic-regression classifier on the sentence features.
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

# Mean accuracy on the held-out split. Printed explicitly because a bare
# expression is only displayed in a notebook, not when run as a script.
test_accuracy = lr_clf.score(test_features, test_labels)
print(test_accuracy)

  

# Posted @ 2022-03-25 08:00 by 青竹之下 | Views (31) | Comments (0) | Edit | Bookmark | Report