Getting Started with Text Classification in NLTK: Rumour Detection
Preface
Today I'd like to share a hands-on NLP task: given a piece of text, decide whether it is a rumour. We only use NLTK and a few other simple, practical tools; no deep learning is involved, so this is a beginner-friendly example. The walkthrough covers three parts: data preprocessing, model building, and prediction/evaluation. The main algorithms are Naive Bayes and Logistic Regression; I won't go over their theory here (there are plenty of resources online), so if you are not familiar with them, take a quick look first and come back. The whole example runs locally, needs no GPU, and finishes quickly, so feel free to follow along. Let's work through it step by step in code.
Preprocessing
- Download the data locally
import requests
import os
from pathlib import Path

fname = 'rumour-data.tgz'
data_dir = os.path.splitext(fname)[0]  # 'rumour-data'

my_file = Path(fname)
if not my_file.is_file():
    url = "https://github.com/jhlau/jhlau.github.io/blob/master/files/rumour-data.tgz?raw=true"
    r = requests.get(url)

    # Save to the current directory
    with open(fname, 'wb') as f:
        f.write(r.content)

print("Done. File downloaded:", my_file)
- Extract the archive
import tarfile

# decompress rumour-data.tgz
tar = tarfile.open(fname, "r:gz")
tar.extractall()
tar.close()

# remove superfluous files (e.g. .DS_Store)
extra_files = []
for r, d, f in os.walk(data_dir):
    for file in f:
        if file.startswith("."):
            extra_files.append(os.path.join(r, file))
for f in extra_files:
    os.remove(f)

print("Extraction done.")
You should now see the data files locally; the directory structure looks roughly like this:
rumour-data
  - rumours
    - 498254340310966273
      - reactions
        - 498254340310966273.json
        - 498260814487642112.json
      - source-tweet
        - 498254340310966273.json
  - non-rumours
- Parse the data with the json package
import json

def get_tweet_text_from_json(file_path):
    with open(file_path) as json_file:
        data = json.load(json_file)
        return data["text"]

def get_events(event_dir):
    event_list = []
    for event in sorted(os.listdir(event_dir)):
        tweet_list = []
        event_path = os.path.join(event_dir, event)
        for root, dirs, files in os.walk(event_path):
            for f in files:
                file_path = os.path.join(root, f)
                tweet = get_tweet_text_from_json(file_path)
                tweet_list.append(tweet)
        event_list.append(tweet_list)
    return event_list

# a list of events, and each event is a list of tweets (source tweet + reactions)
rumour_events = get_events(os.path.join(data_dir, "rumours"))
nonrumour_events = get_events(os.path.join(data_dir, "non-rumours"))

print("Number of rumour events =", len(rumour_events))
print("Number of non-rumour events =", len(nonrumour_events))
- Build a bag of words (BOW) for each event, removing stopwords along the way
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from collections import defaultdict

# nltk.download('stopwords')  # uncomment on first run if the stopwords corpus is missing

tt = TweetTokenizer()
stop_words = set(stopwords.words('english'))

def preprocess_events(events):
    preprocessed_event_list = []
    for event in events:
        word_freq = defaultdict(int)  # token -> count for this event
        for tweet in event:
            word_tokens = tt.tokenize(tweet)
            for word in word_tokens:
                if word.lower() in stop_words:
                    continue
                word_freq[word] += 1
        preprocessed_event_list.append(word_freq)
    return preprocessed_event_list

preprocessed_rumour_events = preprocess_events(rumour_events)
preprocessed_nonrumour_events = preprocess_events(nonrumour_events)

print("Number of preprocessed rumour events =", len(preprocessed_rumour_events))
print("Number of preprocessed non-rumour events =", len(preprocessed_nonrumour_events))
- Collect all hashtags
def get_all_hashtags(events):
    hashtags = set()
    for event in events:
        for word in event:
            if word.startswith("#"):
                hashtags.add(word)
    return hashtags

hashtags = get_all_hashtags(preprocessed_rumour_events + preprocessed_nonrumour_events)
print("Number of hashtags =", len(hashtags))
- Split hashtags with the reversed MaxMatch algorithm. MaxMatch greedily takes the longest dictionary word it can find; the reversed variant matches from the end of the hashtag towards the beginning (a toy illustration follows the code below).
from nltk.corpus import wordnet

# nltk.download('wordnet'); nltk.download('words'); nltk.download('averaged_perceptron_tagger')
# (uncomment on first run if these corpora/models are missing)

lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
words = set(nltk.corpus.words.words())  # a set of English words provided by NLTK

def tokenize_hashtags(hashtags):
    # map a Penn Treebank POS tag to the WordNet POS expected by the lemmatizer
    def get_tag(tag):
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    res = {}
    for hashtag in hashtags:
        key = hashtag
        tokens = []
        # reversed MaxMatch: repeatedly strip the longest suffix whose lemma is a dictionary word
        while len(hashtag) > 0:
            for i in range(len(hashtag)):
                word = hashtag[i:]
                word_pos_tag = nltk.tag.pos_tag([word.lower()])[0][1]
                lemmatized_word = lemmatizer.lemmatize(word.lower(), get_tag(word_pos_tag))
                if lemmatized_word in words:
                    hashtag = hashtag[:i]
                    tokens.append(word)
                    break
            else:
                # no suffix matched: keep the single trailing character and move on
                tokens.append(word)
                hashtag = hashtag[:i]
        res[key] = tokens[::-1]  # tokens were collected right-to-left, so reverse them
    return res

tokenized_hashtags = tokenize_hashtags(hashtags)
print(list(tokenized_hashtags.items())[:20])
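If the nested loops above are hard to follow, here is a toy version of reversed MaxMatch on a plain string with a tiny hand-picked dictionary (both the dictionary and the example string are made up for illustration; the real code above uses NLTK's word list plus POS tagging and lemmatization):
# Toy reversed MaxMatch: repeatedly peel off the longest suffix found in a tiny dictionary
toy_dict = {"we", "want", "answers"}  # hypothetical dictionary for this example

def toy_reversed_maxmatch(s, dictionary):
    tokens = []
    while s:
        for i in range(len(s)):
            if s[i:].lower() in dictionary:  # longest remaining suffix first
                tokens.append(s[i:])
                s = s[:i]
                break
        else:  # no suffix matched: strip a single character
            tokens.append(s[-1])
            s = s[:-1]
    return tokens[::-1]  # tokens were collected right-to-left

print(toy_reversed_maxmatch("wewantanswers", toy_dict))  # ['we', 'want', 'answers']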
- Update our BOW with the tokenized hashtags
def update_event_bow(events):
    for event in events:
        for word in list(event.keys()):
            if word.startswith('#') and word in tokenized_hashtags:
                subtokens = tokenized_hashtags[word]
                for token in subtokens:
                    if token in event:
                        event[token] += event[word]
                    else:
                        event[token] = 1

update_event_bow(preprocessed_rumour_events)
update_event_bow(preprocessed_nonrumour_events)

print("Number of preprocessed rumour events =", len(preprocessed_rumour_events))
print("Number of preprocessed non-rumour events =", len(preprocessed_nonrumour_events))
Model Training and Evaluation
- Build the training, development, and test sets
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

vectorizer = DictVectorizer()

rumour_size = len(preprocessed_rumour_events)
nonrumour_size = len(preprocessed_nonrumour_events)
all_events = preprocessed_rumour_events + preprocessed_nonrumour_events
all_labels = [1] * rumour_size + [0] * nonrumour_size

# 60% train, 20% dev, 20% test, stratified by label
train_data, dev_test_data, train_label, dev_test_label \
    = train_test_split(all_events, all_labels, stratify=all_labels, test_size=0.4, random_state=12345)
dev_data, test_data, dev_label, test_label \
    = train_test_split(dev_test_data, dev_test_label, stratify=dev_test_label, test_size=0.5, random_state=12345)

train_set = vectorizer.fit_transform(train_data)
dev_set = vectorizer.transform(dev_data)
test_set = vectorizer.transform(test_data)

#print(len(train_data), len(dev_data), len(test_data))
#print(len(train_label), len(dev_label), len(test_label))
print("Vocabulary size =", len(vectorizer.vocabulary_))
- Train Naive Bayes and Logistic Regression classifiers, tuning their hyperparameters by hand on the dev set (a small grid-search sketch follows the code below)
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# apply Naive Bayes
print("For Naive Bayes model:")
alpha = 0.78
NB_classifier = MultinomialNB(alpha=alpha)
NB_classifier.fit(train_set, train_label)
dev_NB_pred = NB_classifier.predict(dev_set)
acc_NB_dev = accuracy_score(dev_label, dev_NB_pred)
print("When choosing alpha (additive smoothing parameter) =", alpha, ", the accuracy =", acc_NB_dev)

# apply Logistic Regression
print("\nFor Logistic Regression model:")
C = 0.07
solver = 'liblinear'
LR_classifier = LogisticRegression(C=C, solver=solver)
LR_classifier.fit(train_set, train_label)
dev_LR_pred = LR_classifier.predict(dev_set)
acc_LR_dev = accuracy_score(dev_label, dev_LR_pred)
print("When choosing", solver, "as solver and C (inverse of regularization strength) =", C, ", the accuracy =", acc_LR_dev)
- Evaluate the models on the test set (accuracy & macro-averaged F-score)
# for Naive Bayes
print("For Naive Bayes model:")
test_NB_pred = NB_classifier.predict(test_set)
acc_NB_test = accuracy_score(test_label, test_NB_pred)
f1_NB_test = f1_score(test_label, test_NB_pred, average='macro')
print("the accuracy =", acc_NB_test)
print("the macro-averaged F-score =", f1_NB_test)

# for Logistic Regression
print("\nFor Logistic Regression model:")
test_LR_pred = LR_classifier.predict(test_set)
acc_LR_test = accuracy_score(test_label, test_LR_pred)
f1_LR_test = f1_score(test_label, test_LR_pred, average='macro')
print("the accuracy =", acc_LR_test)
print("the macro-averaged F-score =", f1_LR_test)
Judging from the final results, both methods do a decent job on this relatively simple text classification task:
For Naive Bayes model:
the accuracy = 0.8
the macro-averaged F-score = 0.7738579828132066
For Logistic Regression model:
the accuracy = 0.7866666666666666
the macro-averaged F-score = 0.7491638795986622
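If you want a per-class breakdown rather than a single accuracy or macro F-score, scikit-learn's classification_report prints precision, recall, and F1 for each class. A minimal sketch for the Naive Bayes test predictions (class 0 = non-rumour, class 1 = rumour, following the labels defined earlier):
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the Naive Bayes test predictions
print(classification_report(test_label, test_NB_pred, target_names=["non-rumour", "rumour"]))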