【Kaggle】Spam/Ham Email Classification
Basic Idea
The task is to classify emails as spam or ham.
Approach 1: use a recurrent sequence model such as LSTM or GRU for classification.
Approach 2: use the spaCy NLP library, whose textcat component can be used directly for text classification.
In practice, Approach 2 works better than Approach 1, but since this is an introductory problem, only Approach 1 is used here.
Reference code for Approach 2: https://blog.csdn.net/qq_21201267/article/details/109109237
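For completeness, here is a minimal sketch of what Approach 2 could look like with spaCy 3.x's textcat pipe (this is not the code from the linked post; the toy texts, labels, and the SPAM/HAM label names are illustrative assumptions):

import spacy
from spacy.training import Example

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")  # add the text-classification pipe
textcat.add_label("SPAM")
textcat.add_label("HAM")

# Toy training data (assumed): 1 = spam, 0 = ham
texts = ["win a free prize now", "meeting at 10am tomorrow"]
labels = [1, 0]
examples = [
    Example.from_dict(nlp.make_doc(t), {"cats": {"SPAM": float(l), "HAM": 1.0 - float(l)}})
    for t, l in zip(texts, labels)
]

optimizer = nlp.initialize(lambda: examples)
for epoch in range(5):
    losses = {}
    nlp.update(examples, sgd=optimizer, losses=losses)

print(nlp("free prize").cats)  # per-label scores live in doc.cats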
Implementation
Reading the data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Load the data
train_df = pd.read_csv('train.csv', encoding='utf-8')  # training set
test_df = pd.read_csv('test.csv', encoding='utf-8')    # test set

print('Training set:', train_df.shape, 'Test set:', test_df.shape)
train_df.head()
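Before cleaning, it can help to see how imbalanced the target is (a small optional check, assuming the spam column is the 0/1 label used below):

# Optional: inspect the spam/ham balance of the training labels
print(train_df['spam'].value_counts(normalize=True))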
Data cleaning
# Data cleaning: fill missing values with a space, then verify nothing is null
train_df = train_df.fillna(" ")
test_df = test_df.fillna(" ")
print(np.sum(np.array(train_df.isnull()), axis=0))
print(np.sum(np.array(test_df.isnull()), axis=0))

from sklearn.model_selection import train_test_split

# X holds the features, y the target variable
X = train_df["subject"] + " " + train_df["email"]  # feature columns of your choice
y = train_df['spam']                               # target to predict

# Split the data into a training set and a held-out test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text to tokens
from keras.preprocessing.text import Tokenizer
max_words = 300
# Only the 300 most frequent words get ids; the rest are ignored
tokenizer = Tokenizer(num_words=max_words, lower=True, split=' ')
tokenizer.fit_on_texts(list(X_train) + list(X_test))  # fit the tokenizer
X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)

# Samples have different token lengths, so pad them
maxlen = 100
from keras.preprocessing import sequence
X_train_tokens_pad = sequence.pad_sequences(X_train_tokens, maxlen=maxlen, padding='post')
X_test_tokens_pad = sequence.pad_sequences(X_test_tokens, maxlen=maxlen, padding='post')
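To make the tokenization step concrete, here is a tiny illustration on made-up sentences: words outside the num_words most frequent are silently dropped, and pad_sequences zero-pads every sequence to the same length.

# Toy example: how Tokenizer and pad_sequences behave
demo = Tokenizer(num_words=10, lower=True, split=' ')
demo.fit_on_texts(["free money now", "meeting tomorrow morning", "free meeting"])
print(demo.word_index)  # word -> integer id, most frequent words first
seqs = demo.texts_to_sequences(["free money meeting tonight"])
print(seqs)  # "tonight" was never seen, so it is dropped
print(sequence.pad_sequences(seqs, maxlen=5, padding='post'))  # zero-padded at the end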
Model training
# Model training
embeddings_dim = 30  # dimension of the word-embedding vectors

from keras.models import Model, Sequential
from keras.layers import Embedding, LSTM, GRU, SimpleRNN, Dense

model = Sequential()
model.add(Embedding(input_dim=max_words,        # size of the vocabulary
                    output_dim=embeddings_dim,  # embedding dimension
                    input_length=maxlen))
model.add(GRU(units=64))  # can be swapped for SimpleRNN or LSTM
model.add(Dense(units=1, activation='sigmoid'))
model.summary()

# Configure and train the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(X_train_tokens_pad, y_train, batch_size=128, epochs=10, validation_split=0.2)
model.save("email_cat_lstm.h5")  # save the trained model
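The 20% held-out split created earlier is never scored in the flow above; an optional check of its performance could look like this:

# Optional: evaluate the trained model on the held-out split
loss, acc = model.evaluate(X_test_tokens_pad, y_test)
print(f"held-out loss: {loss:.4f}, accuracy: {acc:.4f}")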
Visualizing the training process
# Visualize the training process: plot the loss/accuracy curves per epoch
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.grid(True)
plt.show()
Generating the submission
# Generate the submission
ansX = test_df["subject"] + " " + test_df["email"]  # same feature columns as training
# Reuse the tokenizer fitted earlier; refitting it here would be redundant
ans_tokens = tokenizer.texts_to_sequences(ansX)
ans_tokens_pad = sequence.pad_sequences(ans_tokens, maxlen=maxlen, padding='post')

pred_prob = model.predict(ans_tokens_pad).squeeze()        # sigmoid probabilities
pred_class = np.asarray(pred_prob > 0.5).astype(np.int32)  # threshold at 0.5

output = pd.DataFrame({'id': test_df['id'], 'Class': pred_class})
output.to_csv("submission_gru.csv", index=False)
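A quick sanity check before uploading (optional; it just inspects the prediction balance and the file layout):

# Optional: check the predicted spam/ham balance and preview the submission
print(output['Class'].value_counts())
print(output.head())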