Kaggle NLP Getting Started: Disaster Tweet Classification (Natural Language Processing with Disaster Tweets)

The approach is much the same as classifying news articles by their headlines: a bag-of-words representation fed into a Naive Bayes classifier.
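
For orientation: in the standard Kaggle competition layout (an assumption here; the files sit one directory up), train.csv carries the columns id, keyword, location, text, target. A minimal sketch to peek at the data:

# A minimal look at the data, assuming the standard Kaggle
# "Natural Language Processing with Disaster Tweets" files.
import pandas as pd

df = pd.read_csv('../train.csv')
print(df.columns.tolist())            # expected: ['id', 'keyword', 'location', 'text', 'target']
print(df[['text', 'target']].head())  # tweet text and its 0/1 disaster label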

# Import the required packages
import re

import joblib
import pandas as pd
from sklearn import model_selection
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

def text_to_words(file_path):  # Split the text into token lists and labels
    myTrain = pd.read_csv(file_path)
    sentences_arr = []
    lab_arr = list(myTrain.values[:, 4])  # column 4 is the 'target' label
    for i in range(len(myTrain.values)):
        sentence = str(myTrain.values[i, 3]).strip()  # column 3 is the tweet text
        # re.sub substitutes every match; here punctuation is replaced with spaces
        sentence = re.sub(r"[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()《》:]+", " ", sentence)
        sentence = sentence.split(' ')
        sentences_arr.append(sentence)

    return sentences_arr, lab_arr
def load_stopwords(file_path):  # Build the stop-word list
    stopwords = [line.strip() for line in open(file_path, encoding='UTF-8').readlines()]  # strip() removes surrounding whitespace
    return stopwords
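
If no stopwords.txt is at hand, scikit-learn's built-in English stop-word list is a reasonable stand-in (an assumption; it is not the file this post loads):

# Fallback sketch: use sklearn's built-in English stop words instead of
# a local stopwords.txt file (assumption, not the original file).
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

stopwords = list(ENGLISH_STOP_WORDS)
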
def get_dict(sentences_arr, stopwords):  # Build the vocabulary
    word_dic = {}
    for sentence in sentences_arr:
        for word in sentence:
            if word != ' ' and word.isalpha():  # isalpha() keeps only purely alphabetic tokens
                if word not in stopwords:
                    word_dic[word] = word_dic.get(word, 0) + 1
    word_dic = sorted(word_dic.items(), key=lambda x: x[1], reverse=True)  # sort by descending frequency

    return word_dic
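
Despite its name, get_dict returns a sorted list of (word, count) tuples rather than a dict. The standard library expresses the same idea more directly; a minimal equivalent sketch with collections.Counter:

# Equivalent vocabulary builder using collections.Counter: same output
# shape, a list of (word, count) tuples sorted by descending frequency.
from collections import Counter

def get_dict_counter(sentences_arr, stopwords):
    counts = Counter(
        word
        for sentence in sentences_arr
        for word in sentence
        if word.isalpha() and word not in stopwords
    )
    return counts.most_common()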

def get_feature_words(word_dic, word_num):  # Take the word_num most frequent words as feature words
    feature_words = []
    for word, _count in word_dic[:word_num]:
        feature_words.append(word)
    return feature_words

# Text features
def get_text_features(train_data_list, test_data_list, feature_words):  # Encode train and test sentences as feature vectors over the feature words
    def text_features(text, feature_words):
        text_words = set(text)
        features = [1 if word in text_words else 0 for word in feature_words]  # binary feature vector
        return features
    train_feature_list = [text_features(text, feature_words) for text in train_data_list]
    test_feature_list = [text_features(text, feature_words) for text in test_data_list]
    return train_feature_list, test_feature_list
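
The same binary bag-of-words encoding is available from scikit-learn directly; a minimal sketch (a substitution of mine, not the post's code) using CountVectorizer with binary=True and the vocabulary pinned to the feature words:

# sklearn equivalent of get_text_features: a CountVectorizer with a fixed
# vocabulary and binary=True. It returns sparse matrices, which BernoulliNB
# accepts; note its tokenizer may differ slightly from the regex cleaning above.
from sklearn.feature_extraction.text import CountVectorizer

def get_text_features_sklearn(train_texts, test_texts, feature_words):
    vec = CountVectorizer(vocabulary=feature_words, binary=True)
    # join token lists back into strings, since CountVectorizer tokenizes itself
    train_X = vec.transform(' '.join(t) for t in train_texts)
    test_X = vec.transform(' '.join(t) for t in test_texts)
    return train_X, test_X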


sentences_arr, lab_arr = text_to_words('../train.csv')  # tokenized sentences and their labels
print(sentences_arr[0])

stopwords = load_stopwords('../stopwords.txt')  # load the stop words
word_dic = get_dict(sentences_arr, stopwords)  # build the vocabulary
train_data_list, test_data_list, train_class_list, test_class_list = model_selection.train_test_split(sentences_arr, lab_arr, test_size=0.1)  # split off 10% as a validation set
feature_words = get_feature_words(word_dic, 1000)  # build the feature-word list



train_feature_list, test_feature_list = get_text_features(train_data_list, test_data_list, feature_words)  # build the feature vectors
from sklearn.metrics import accuracy_score, classification_report

# sklearn offers five Naive Bayes variants; BernoulliNB is used here because every feature is a binary indicator
classifier = BernoulliNB(alpha=1.0,  # Laplace smoothing
                         fit_prior=True,  # whether to learn class prior probabilities
                         class_prior=None)

classifier.fit(train_feature_list, train_class_list)  # train the model

predict = classifier.predict(test_feature_list)  # evaluate on the held-out validation split
test_accuracy = accuracy_score(test_class_list, predict)
print("Accuracy (accuracy_score): %.4lf" % test_accuracy)
print("Classification report for classifier:\n", classification_report(test_class_list, predict))
joblib.dump(classifier, "NewsClassification.model")  # save the trained model (the filename is a carry-over from the news-classification version)

myModel = joblib.load("NewsClassification.model")  # reload it from disk

def load_sentence(sentence):
    # same cleaning as in text_to_words: punctuation replaced with spaces
    sentence = re.sub(r"[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()《》:]+", " ", sentence)
    sentence = sentence.split(' ')
    return sentence




p_data = 'We had a big earthquake here and many houses collapsed'
sentence = load_sentence(p_data)
sentence = [sentence]
print('Tokens:', sentence)
p_words = get_text_features(sentence, sentence, feature_words)  # build the feature vector
res = myModel.predict(p_words[0])
print("Predicted class:", int(res[0]))


cnt = 0
id = []
target = []
myTest = pd.read_csv('../test.csv')
for i in range(len(myTest.values)):
    sentence = str(myTest.values[i, 3]).strip()  # column 3 is the tweet text
    # same cleaning as the training data: punctuation replaced with spaces
    sentence = re.sub(r"[\s+\.\!\/_,$%^*(+\"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()《》:]+", " ", sentence)
    sentence = sentence.split(' ')
    sentence = [sentence]
    p_words = get_text_features(sentence, sentence, feature_words)  # build the feature vector
    res = myModel.predict(p_words[0])
    id.append(myTest.values[i, 0])
    target.append(int(res[0]))
    cnt = cnt + 1
    if cnt % 1000 == 0:
        print(cnt)  # progress indicator
myAns = pd.DataFrame({'id': id, 'target': target})
myAns.to_csv("myAns.csv", index=False, sep=',')
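
The loop above scores one tweet at a time; the whole test set can also be encoded once and predicted in a single call. A minimal batched sketch (a rewrite of mine; same cleaning, same outputs):

# Batched alternative to the per-row loop: clean every tweet, build all
# feature vectors at once, and predict in one call.
test_sentences = [load_sentence(str(t).strip()) for t in myTest['text']]
test_X, _ = get_text_features(test_sentences, [], feature_words)
batch_pred = myModel.predict(test_X)
pd.DataFrame({'id': myTest['id'], 'target': batch_pred}).to_csv("myAns_batch.csv", index=False)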

