2022-03-25 08:00:01
# Sentence classification with DistilBERT features + logistic regression.
#
# Pipeline: load a tab-separated dataset (column 0 is fed to the tokenizer,
# column 1 is used as the label), tokenize every row, pad to a common length,
# run the pretrained transformer once without gradients, keep the hidden
# state at the first token position of each sequence as a fixed feature
# vector, and fit/evaluate a scikit-learn LogisticRegression on those features.
import numpy as np
import pandas as pd
import torch
import transformers as ppb  # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# No header row in the file, so columns are addressed by position (0 and 1).
df = pd.read_csv(
    'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv',
    delimiter='\t',
    header=None,
)

model_class, tokenizer_class, pretrained_weights = (
    ppb.DistilBertModel,
    ppb.DistilBertTokenizer,
    'distilbert-base-uncased',
)

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

# One list of token ids per row; add_special_tokens=True asks the tokenizer
# to include its special tokens in each encoded sequence.
tokenized = df[0].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

# NOTE(review): the padding step was missing from the pasted source (only a
# stray "T" and an undefined `padded` remained). It is reconstructed here:
# right-pad every sequence with the tokenizer's pad id up to the longest
# sequence so the batch forms a rectangular array.
pad_id = tokenizer.pad_token_id
max_len = max(len(ids) for ids in tokenized)
padded = np.array([ids + [pad_id] * (max_len - len(ids)) for ids in tokenized])

# 1 for real tokens, 0 for padding, so the model ignores the padded positions.
attention_mask = torch.tensor(np.where(padded != pad_id, 1, 0))

input_ids = torch.tensor(np.array(padded))

# NOTE(review): the whole dataset goes through the model as a single batch;
# consider batching if memory becomes a problem.
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

# Slice the output for the first position for all the sequences, take all hidden unit outputs
features = last_hidden_states[0][:, 0, :].numpy()
labels = df[1]

train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

# Keep and report the held-out score; a bare expression would be discarded
# when this runs as a script rather than in a notebook cell.
test_accuracy = lr_clf.score(test_features, test_labels)
print(test_accuracy)