Pandas
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine
import traceback
import math

class MSSQL:
	host = "10.251.21.1XXX"
	port = 3306
	dbname = "AnyXXXX"
	user = "root"
	passwd = "AXXXXXX"

def connect_AS_DB():
	try:
		conn = create_engine ('mysql+pymysql://{user}:{password}@{host}:{port}/{dbname}'.format(user=MSSQL.user,
																					password=MSSQL.passwd,
																					host=MSSQL.host,
																					port=int(MSSQL.port),
																					dbname=MSSQL.dbname),echo = False)
		return conn

	except Exception as e:
		print("Connect to MSSQL exception:%s"% traceback.format_exc())
		return None


if __name__ == '__main__':
	#定义每次分割10行
	lines=10 
	try:
		conn =connect_AS_DB()
	except Exception as E:
		print(E)
	
	df= pd.read_csv(r'C:\LocalTemp\Python 拆分大文件\操作日志.2022-4-9~2022-4-19.csv',iterator=False, error_bad_lines=True,encoding='utf-8',names=['DateTime', 'UserName', 'Catelog', 'Action', 'IP', 'Unknown','Record', 'Path', 'Client', 'obj_id'] )
	# print(DataInfo_iterator["DateTime"])


	for i in range(0, math.ceil(df.shape[0]/lines)):
		DataInfo = df.iloc[i*lines:(i+1)*lines]
		if conn:
			try:
				print(DataInfo["DateTime"])
				print('*'*50+str(i)+'*'*50)
				# DataInfo.to_sql('ASLog', conn, if_exists="append", index=False)

			except Exception as e:
				print(e)
		else :
			print("connect to sql error")



版本做了优化 支持多个文件的导入,同时解决第九列文件内含有“,”的问题

点击查看代码
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import Engine
import traceback
import math

class MSSQL:
	host = "10.251.21.XXX"
	port = 3306
	dbname = "Anyshare"
	user = "root"
	passwd = "AlXXX"

#链接Anyshare 日志数据库
def connect_AS_DB():
	try:
		conn = create_engine ('mysql+pymysql://{user}:{password}@{host}:{port}/{dbname}'.format(user=MSSQL.user,
																					password=MSSQL.passwd,
																					host=MSSQL.host,
																					port=int(MSSQL.port),
																					dbname=MSSQL.dbname),echo = False)
		return conn
	except Exception as e:
		print("Connect to MSSQL exception:%s"% traceback.format_exc())
		return None




def Writeto_Sql(arr_Lists,columns_Num):
	for arr in arr_Lists:
		print(arr[0])
		df=pd.DataFrame(arr,columns=columns_Num )
		df.to_sql('ASLog', conn, if_exists="append", index=False)
		# print(df)



#将以迭代返回列表的形式返回数据
def get_CleanFileInfo(File_list):
	for file_Name in File_list:
		
		with open(r'C:\LocalTemp\Python 拆分大文件\{file}'.format(file=file_Name),"r", encoding="utf-8") as f:
			lines=f.readlines()
			i=0
		arr_List=[]
		for line in lines:
			i+=1
			
			vals=line.split(',')
			if len(vals)==10:
				arr_List.append(vals)
				# arr_List=vals
			elif len(vals)>10:
				arr_List.append(vals[:8]+[','.join(vals[8:-1])]+vals[-1:])
			if i % 100==0: #数据库commit 需要消耗时间,将这里调整从1 调整到100后明显发现CPU 、内存消耗更多,速度也更快
				yield(arr_List)
				arr_List=[]
		yield(arr_List)



if __name__ == '__main__':
	try:
		conn =connect_AS_DB()
	except Exception as E:
		print(E)
	#数据有效列数
	columns=['DateTime', 'UserName', 'Catelog', 'Action', 'IP', 'MAC','Record', 'Path', 'Client', 'obj_id']
	#待导入文件列表
	File_list=["操作日志.2022-1.csv","操作日志.2022-2.csv","操作日志.2022-3.csv","操作日志.2022-4-1~2022-4-3.csv","操作日志.2022-4-3~2022-4-9.csv","操作日志.2022-4-9~2022-4-19.csv"]


	#获取返回后的文件信息列表,返回迭代器
	infoline_iter=get_CleanFileInfo(File_list)
	#将获取的信息写入数据库
	Writeto_Sql(infoline_iter,columns)
posted on 2022-04-21 11:08  vmsky  阅读(260)  评论(0编辑  收藏  举报