python_sklearn预测真假新闻(pandas读入两份csv文件)

文章目录

python code:
result:

python code:

使用 TF-IDF 提取特征向量
（用语料库/文件集合中各个文件（即每条新闻）的特征向量来做分类，比如判断新闻的真假）

import logging as l

import numpy as np
import pandas as pd
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
l.basicConfig(level=l.DEBUG)

# Input locations: every CSV file is looked up under `prefix`.
prefix = "./exp8/"
news_train = 'news_train.csv'
news_train_subset = "news_train_subset.csv"
news_test = "news_test.csv"

# To try the pipeline on the smaller training file, uncomment:
# news_train = news_train_subset

# Load both CSV files, decoding them as UTF-8.
news_train_df = pd.read_csv(f"{prefix}{news_train}", encoding='utf-8')
news_test_df = pd.read_csv(f"{prefix}{news_test}", encoding="utf-8")

x_train = news_train_df['text']
x_test = news_test_df['text']
len_train = len(x_train)

# TF-IDF weights are computed relative to the corpus they are fitted on, so
# the training and test texts are stacked (training rows first) and
# vectorised together.
x_whole = pd.concat([x_train, x_test])

# Numeric label vector: 1 for 'REAL', 0 for anything else.
y = [int(validity == 'REAL') for validity in news_train_df['label']]

corpus = x_whole
vectorizer = TfidfVectorizer()
v = vectorizer.fit_transform(corpus)

# Dense feature matrix with one row per document; the first `len_train` rows
# belong to the training file, the remaining rows to the test file.
x_whole_vectors = v.toarray()
x_train_vectors, x_test_vectors = (
    x_whole_vectors[:len_train],
    x_whole_vectors[len_train:],
)

clf_GNB = GaussianNB()
# Filled in by estimate_accuracy() with the true labels of the hold-out rows.
y_test_estimate_real = []
def estimate_accuracy(train_fraction=0.9):
    """Fit on the leading part of the labelled data and predict the rest.

    The labelled rows are split in file order (no shuffling): the first
    ``int(train_fraction * len_train)`` rows train ``clf_GNB`` and the
    remaining labelled rows form the hold-out set. The results are published
    through two module-level names:

    * ``y_test_estimate_real`` -- true labels of the hold-out rows
    * ``y_predict`` -- labels predicted for the hold-out rows

    Args:
        train_fraction: Share of the labelled rows used for fitting; must lie
            strictly between 0 and 1. Defaults to 0.9.

    Raises:
        ValueError: If ``train_fraction`` is out of range, or the split would
            leave the fitting part or the hold-out part empty.
    """
    global y_test_estimate_real, y_predict

    if not 0 < train_fraction < 1:
        raise ValueError(
            f"train_fraction must be between 0 and 1 (exclusive), got {train_fraction!r}"
        )

    estimate_number = int(train_fraction * len_train)
    # Fail early with a readable message instead of an error from deep inside
    # the classifier when one side of the split has no rows.
    if not 0 < estimate_number < len_train:
        raise ValueError(
            f"cannot split {len_train} labelled rows with train_fraction={train_fraction!r}"
        )

    # Training rows come first in x_whole_vectors, so slicing below len_train
    # never touches the unlabelled test rows.
    x_estimate = x_whole_vectors[:estimate_number]
    y_estimate = y[:estimate_number]
    x_test_estimate = x_whole_vectors[estimate_number:len_train]
    y_test_estimate_real = y[estimate_number:len_train]

    clf_GNB.fit(x_estimate, y_estimate)
    y_predict = clf_GNB.predict(x_test_estimate)
 
''' # change the references to estimate '''
# x = x_estimate
# y = y_estimate
# Module-level slot for the most recent predictions: estimate_accuracy() and
# predict() rebind it through `global y_predict`; output_prediction() and
# print_for_estimate_accuracy() read it.
y_predict=[]
def predict():
    """Train on every labelled row and classify the rows of the test file.

    Fits ``clf_GNB`` on ``x_train_vectors`` / ``y`` and stores the labels
    predicted for ``x_test_vectors`` in the module-level ``y_predict``.
    """
    global y_predict
    # fit() hands back the fitted estimator, so it can be bound and reused.
    fitted_model = clf_GNB.fit(x_train_vectors, y)
    y_predict = fitted_model.predict(x_test_vectors)
 
 
# Header for the labels echoed by output_prediction(); it is printed here,
# while the module runs, rather than inside that function.
print("prediction:")
def output_prediction():
    """Write the current predictions to ``pred.txt`` and echo them to stdout.

    Every entry of the module-level ``y_predict`` becomes one line in
    ``<prefix>pred.txt``: ``REAL`` when the entry equals 1, ``FAKE``
    otherwise. The same words are printed on a single space-separated line.
    """
    target_path = prefix + "pred.txt"
    with open(target_path, "w") as sink:
        for flag in y_predict:
            word = "REAL" if flag == 1 else "FAKE"
            sink.write(word + "\n")
            print(word, end=" ")
        # Terminate the space-separated console line.
        print()
 
# Evaluation mode: fit on part of the labelled data and predict the held-out
# remainder, so pred.txt receives the hold-out predictions. To label the rows
# of the test file instead, call predict() here in place of
# estimate_accuracy().
estimate_accuracy()
# predict()
output_prediction()
 
def print_for_estimate_accuracy(y_true=None, y_pred=None):
    """Print and return the share of predictions that match the true labels.

    Args:
        y_true: Sequence of true labels. Defaults to the module-level
            ``y_test_estimate_real`` filled in by ``estimate_accuracy()``.
        y_pred: Sequence of predicted labels, aligned with ``y_true``.
            Defaults to the module-level ``y_predict``.

    Returns:
        The accuracy as a float in ``[0, 1]``, or ``None`` when it cannot be
        computed (no predictions, or the two sequences differ in length).
    """
    # Compare against None explicitly: the predictions may be a NumPy array,
    # whose truth value is ambiguous.
    if y_true is None:
        y_true = y_test_estimate_real
    if y_pred is None:
        y_pred = y_predict

    predicts_numbers = len(y_pred)
    if len(y_true) != predicts_numbers:
        # zip() would silently truncate to the shorter sequence and report a
        # misleading score (e.g. 100% when there are no true labels at all).
        print(
            f"cannot estimate accuracy: {len(y_true)} true labels "
            f"for {predicts_numbers} predictions"
        )
        return None

    print(predicts_numbers)
    if predicts_numbers == 0:
        # Nothing to score; avoid dividing by zero.
        return None

    mismatches = sum(1 for truth, guess in zip(y_true, y_pred) if truth != guess)
    accuracy = (predicts_numbers - mismatches) / predicts_numbers
    print(accuracy)
    return accuracy
# Compare the stored predictions with the stored true labels and print the
# outcome.
print_for_estimate_accuracy()

result:

在这里插入图片描述

posted @ 2024-07-26 17:03 xuchaoxin1375 阅读(5) 评论(0) 编辑收藏举报来源

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

相关博文：

· sklearn_分类预测(KNN,GNB,...)示例(疾病简单二分类)

· python_简单蛋白质功能二分类预测(sklearn:GNB)

· 【机器学习实战入门】识别假新闻

· 自然语言处理秘籍-全-

· K-近邻（KNN）的python实现

阅读排行：
· 分享4款.NET开源、免费、实用的商城系统
· 全程不用写代码，我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了，比商业数据库还牛
· 记一次.NET内存居高不下排查解决与启示
· 白话解读 Dapr 1.15：你的「微服务管家」又秀新绝活了

历史上的今天：
2023-07-26 相邻组合问题和分组组合问题
2022-07-26 有一串打乱的成绩，现要求你把他们从低到高排列。
2022-07-26 现有一串已按分数高低记录好的成绩，现在要求你在不打乱原来顺序的基础上插入一部分数据、删除一部分数据，并输出。

公告

昵称： xuchaoxin1375
园龄： 4年10个月
粉丝： 1
关注： 0

+加关注

2025年3月

日

一

二

三

四

五

六

xuchaoxin1375

python_sklearn预测真假新闻(pandas读入两份csv文件)

文章目录

python code:

result:

公告

搜索

常用链接

随笔档案

阅读排行榜

推荐排行榜

	from openpyxl import Workbook
	from openpyxl.utils.dataframe import dataframe_to_rows
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.naive_bayes import GaussianNB
	import numpy as np
	import pandas as pd
	import logging as l
	l.basicConfig(level=l.DEBUG)

	# Input locations: every CSV file is looked up under `prefix`.
	prefix = "./exp8/"
	news_train = 'news_train.csv'
	news_train_subset = "news_train_subset.csv"
	news_test = "news_test.csv"

	# To try the pipeline on the smaller training file, uncomment:
	# news_train = news_train_subset

	# Load both CSV files, decoding them as UTF-8.
	news_train_df = pd.read_csv(f"{prefix}{news_train}", encoding='utf-8')
	news_test_df = pd.read_csv(f"{prefix}{news_test}", encoding="utf-8")

	x_train = news_train_df['text']
	x_test = news_test_df['text']
	len_train = len(x_train)

	# TF-IDF weights are computed relative to the corpus they are fitted on, so
	# the training and test texts are stacked (training rows first) and
	# vectorised together.
	x_whole = pd.concat([x_train, x_test])

	# Numeric label vector: 1 for 'REAL', 0 for anything else.
	y = [int(validity == 'REAL') for validity in news_train_df['label']]

	corpus = x_whole
	vectorizer = TfidfVectorizer()
	v = vectorizer.fit_transform(corpus)

	# Dense feature matrix with one row per document; the first `len_train` rows
	# belong to the training file, the remaining rows to the test file.
	x_whole_vectors = v.toarray()
	x_train_vectors, x_test_vectors = (
	    x_whole_vectors[:len_train],
	    x_whole_vectors[len_train:],
	)

	clf_GNB = GaussianNB()
	# Filled in by estimate_accuracy() with the true labels of the hold-out rows.
	y_test_estimate_real = []
	def estimate_accuracy(train_fraction=0.9):
	    """Fit on the leading part of the labelled data and predict the rest.

	    The labelled rows are split in file order (no shuffling): the first
	    ``int(train_fraction * len_train)`` rows train ``clf_GNB`` and the
	    remaining labelled rows form the hold-out set. The results are published
	    through two module-level names:

	    * ``y_test_estimate_real`` -- true labels of the hold-out rows
	    * ``y_predict`` -- labels predicted for the hold-out rows

	    Args:
	        train_fraction: Share of the labelled rows used for fitting; must lie
	            strictly between 0 and 1. Defaults to 0.9.

	    Raises:
	        ValueError: If ``train_fraction`` is out of range, or the split would
	            leave the fitting part or the hold-out part empty.
	    """
	    global y_test_estimate_real, y_predict

	    if not 0 < train_fraction < 1:
	        raise ValueError(
	            f"train_fraction must be between 0 and 1 (exclusive), got {train_fraction!r}"
	        )

	    estimate_number = int(train_fraction * len_train)
	    # Fail early with a readable message instead of an error from deep inside
	    # the classifier when one side of the split has no rows.
	    if not 0 < estimate_number < len_train:
	        raise ValueError(
	            f"cannot split {len_train} labelled rows with train_fraction={train_fraction!r}"
	        )

	    # Training rows come first in x_whole_vectors, so slicing below len_train
	    # never touches the unlabelled test rows.
	    x_estimate = x_whole_vectors[:estimate_number]
	    y_estimate = y[:estimate_number]
	    x_test_estimate = x_whole_vectors[estimate_number:len_train]
	    y_test_estimate_real = y[estimate_number:len_train]

	    clf_GNB.fit(x_estimate, y_estimate)
	    y_predict = clf_GNB.predict(x_test_estimate)

	''' # change the references to estimate '''
	# x = x_estimate
	# y = y_estimate
	# Module-level slot for the most recent predictions: estimate_accuracy() and
	# predict() rebind it through `global y_predict`; output_prediction() and
	# print_for_estimate_accuracy() read it.
	y_predict=[]
	def predict():
	    """Train on every labelled row and classify the rows of the test file.

	    Fits ``clf_GNB`` on ``x_train_vectors`` / ``y`` and stores the labels
	    predicted for ``x_test_vectors`` in the module-level ``y_predict``.
	    """
	    global y_predict
	    # fit() hands back the fitted estimator, so it can be bound and reused.
	    fitted_model = clf_GNB.fit(x_train_vectors, y)
	    y_predict = fitted_model.predict(x_test_vectors)


	# Header for the labels echoed by output_prediction(); it is printed here,
	# while the module runs, rather than inside that function.
	print("prediction:")
	def output_prediction():
	    """Write the current predictions to ``pred.txt`` and echo them to stdout.

	    Every entry of the module-level ``y_predict`` becomes one line in
	    ``<prefix>pred.txt``: ``REAL`` when the entry equals 1, ``FAKE``
	    otherwise. The same words are printed on a single space-separated line.
	    """
	    target_path = prefix + "pred.txt"
	    with open(target_path, "w") as sink:
	        for flag in y_predict:
	            word = "REAL" if flag == 1 else "FAKE"
	            sink.write(word + "\n")
	            print(word, end=" ")
	        # Terminate the space-separated console line.
	        print()

	# Evaluation mode: fit on part of the labelled data and predict the held-out
	# remainder, so pred.txt receives the hold-out predictions. To label the rows
	# of the test file instead, call predict() here in place of
	# estimate_accuracy().
	estimate_accuracy()
	# predict()
	output_prediction()

	def print_for_estimate_accuracy(y_true=None, y_pred=None):
	    """Print and return the share of predictions that match the true labels.

	    Args:
	        y_true: Sequence of true labels. Defaults to the module-level
	            ``y_test_estimate_real`` filled in by ``estimate_accuracy()``.
	        y_pred: Sequence of predicted labels, aligned with ``y_true``.
	            Defaults to the module-level ``y_predict``.

	    Returns:
	        The accuracy as a float in ``[0, 1]``, or ``None`` when it cannot be
	        computed (no predictions, or the two sequences differ in length).
	    """
	    # Compare against None explicitly: the predictions may be a NumPy array,
	    # whose truth value is ambiguous.
	    if y_true is None:
	        y_true = y_test_estimate_real
	    if y_pred is None:
	        y_pred = y_predict

	    predicts_numbers = len(y_pred)
	    if len(y_true) != predicts_numbers:
	        # zip() would silently truncate to the shorter sequence and report a
	        # misleading score (e.g. 100% when there are no true labels at all).
	        print(
	            f"cannot estimate accuracy: {len(y_true)} true labels "
	            f"for {predicts_numbers} predictions"
	        )
	        return None

	    print(predicts_numbers)
	    if predicts_numbers == 0:
	        # Nothing to score; avoid dividing by zero.
	        return None

	    mismatches = sum(1 for truth, guess in zip(y_true, y_pred) if truth != guess)
	    accuracy = (predicts_numbers - mismatches) / predicts_numbers
	    print(accuracy)
	    return accuracy
	# Compare the stored predictions with the stored true labels and print the
	# outcome.
	print_for_estimate_accuracy()