# Assignment 12 (作业12)

# Sample song lyrics used as demo input for the preprocessing step.
# The final line deliberately keeps its three trailing spaces.
text = "\n".join((
    "The world is changing",
    "and time is spinning fast",
    "it's so amazing how you came into my life",
    "I know it seems all hope is gone",
    "I know you feel you can't be strong",
    "and once again the story ends with you and I",
    "And anytime you feel like you just can't go on",
    "just hold on to my love",
    "and you'll never be alone",
    "Hold on",
    "we can make it through the fire",
    "and my love   ",
))

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Preprocessing
def preprocessing(text: str) -> str:
    """Normalize raw English text into a space-joined string of lemmas.

    Pipeline: split into sentences and words with NLTK, lower-case every
    token, drop English stop words and tokens shorter than three
    characters, then lemmatize what is left with WordNet.

    Args:
        text: Raw input text.

    Returns:
        The surviving lemmatized tokens joined by single spaces; an empty
        string when no token survives the filters.
    """
    # Lower-case BEFORE the stop-word test: the stop list is lower-case, so
    # capitalized forms such as "The" or "And" would otherwise slip through
    # the filter and end up in the output as "the" / "and".
    tokens = [
        word.lower()
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sent)
    ]
    # A set gives O(1) membership tests instead of scanning a list per token.
    stops = set(stopwords.words('english'))
    lmtzr = WordNetLemmatizer()
    return ' '.join(
        lmtzr.lemmatize(token)
        for token in tokens
        if len(token) >= 3 and token not in stops
    )
# Run the preprocessing step once on the sample lyrics; the returned string
# is not stored or printed here.
preprocessing(text)

# Load the dataset: one tab-separated "<label>\t<message>" record per line.
import csv
file_path=r'D:\SMSSpamCollectionjs.txt'
sms_data=[]
sms_label=[]
# `with` closes the file even if a row fails to parse; newline='' is what the
# csv module expects for file objects it reads from.
# NOTE(review): csv.reader treats '"' as a quote character by default; if
# messages in this file can start with a double quote, consider
# quoting=csv.QUOTE_NONE — confirm against the data.
with open(file_path,'r',encoding='utf-8',newline='') as sms:
    csv_reader=csv.reader(sms,delimiter='\t')
    for line in csv_reader:
        # Skip blank or malformed rows that lack a label or a message.
        if len(line)<2:
            continue
        sms_label.append(line[0])
        sms_data.append(preprocessing(line[1]))

# Split into training and test sets at a 0.7 / 0.3 ratio.
import numpy as np
sms_data=np.array(sms_data)
sms_label=np.array(sms_label)

# The function is train_test_split (the original "train_text_split" does not
# exist and fails at import time). stratify keeps the label proportions the
# same in both splits; random_state makes the split reproducible.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(sms_data, sms_label, test_size=0.3, random_state=0, stratify=sms_label)

# Vectorize the messages as TF-IDF features (unigrams and bigrams).
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer=TfidfVectorizer(min_df=2,ngram_range=(1,2),stop_words='english',strip_accents='unicode',norm='l2')
# Fit the vocabulary on the training split only, then reuse it for the test
# split so both matrices share the same columns.
X_train=vectorizer.fit_transform(x_train)
X_test=vectorizer.transform(x_test)

a=X_train.toarray()
print(a)

# Print (row, column, weight) for every non-zero weight in the first 1000
# training rows. The bounds come from the matrix itself instead of the
# hard-coded 1000 x 5984, which raised IndexError (or skipped columns)
# whenever the data produced a different shape. nonzero() yields indices in
# row-major order, matching the original nested-loop output order.
head=a[:1000]
for i,j in zip(*head.nonzero()):
    print(i,j,a[i,j])

# Naive Bayes classifier: fit on the TF-IDF training matrix, then predict
# labels for the test matrix.
# The module is sklearn.naive_bayes and the class is MultinomialNB; the
# original misspellings ("navie_bayes", "MultionmialNB") fail at import time.
from sklearn.naive_bayes import MultinomialNB
clf= MultinomialNB().fit(X_train,y_train)
y_nb_pred=clf.predict(X_test)

# Display the classification results.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print(y_nb_pred.shape, y_nb_pred) # predictions for x_test

# classification_report expects (y_true, y_pred). The original call passed the
# predictions first, which swaps precision and recall in the printed report.
cr = classification_report(y_test,y_nb_pred)
print(cr)

 

# posted @ 2018-12-05 23:10  傻猪一号  阅读(121)  评论(0)  编辑  收藏  举报