postgresql 大数据批量插入数据库实战

# 应用场景:

  一般情况下插入数据库的记录也就几条到几十上百条而已,这种情况下可能就写一个脚本S秒搞定的事,但是给你一个文件里面有几十W行的数据让你插入数据库呢?甚至这样的文件不至一个,甚至要插入数据库的字段就有上百个你会怎么做?

那么问题来了就肯定会有解决的办法啊,只要思想不滑坡,办法总比困难多啊,这不小白经过苦苦搜索和不断的踩坑终于找到高效的方法让这些大数据不到1小时的时间就全都插入到数据库了,废话不多说,下面看代码吧....

import os
import psycopg2, psycopg2.extras

#
读取所有解压的原始数据文件到一个目录里,因为我有多个压缩的大数据文件 xdrpath = r"C:\Users\10907\Downloads\xdr_data" def walkFile(xdrpath): xdrlist = [] for root, dirs, files in os.walk(xdrpath): # 遍历文件 for f in files: xdrlist.append(os.path.join(root, f)) return xdrlist walkFile(xdrpath) # 这里我是将所有要插入数据库的字段名全都拿出来放在一个列表里,然后再将数据文件中的数据提取出来,每一行就是一条数据,然后再用zip(字段名列表,值列表)组成每条单独的字典,键值对足有200多个呢?够多吧 lst_key = ['interface', 'xdrid', 'imsi', 'imei', 'msisdn', 'user_ip', 'lac', 'rac', 'cid', 'sgsn_c_ip', 'sgsn_u_ip', 'ggsn_c_ip', 'ggsn_u_ip', 'rat', 'apn', 'xdr_type', 'procedure_id', 'start_time', 'end_time', 'prot_category', 'prot_type', 'l4_type', 'ms_port', 'server_ip', 'server_port', 'mcc', 'mnc', 'l4_ul_throughput', 'l4_dw_throughput', 'l4_ul_goodput', 'l4_dw_goodput', 'tcp_wtp_ul_outofsequ', 'tcp_wtp_dw_outofsequ', 'tcp_wtp_ul_retrans', 'tcp_wtp_dw_retrans', 'ul_ip_frag_packets', 'dl_ip_frag_packets', 'vedio_speed', 'success_play', 'first_play_time', 'stop_count', 'chargeid', 'protocoltype', 'sgw_ggsn_port', 'enb_sgsn_port', 'enb_sgsn_gtp_teid', 'sgw_ggsn_gtp_teid', 'app_content', 'app_status', 'tcp_rtt_step1', 'tcp_rtt', 'ms_ack_to_1stget_delay', 'fristtransreptorspdelay', 'tcp_win_size', 'tcp_mss', 'tcp_conn_times', 'tcp_conn_states', 'homemcc', 'homemnc', 'roam_direction', 'sgsn_id', 'ggsn_id', 'ran_ne_id', 'layer1id', 'layer2id', 'layer3id', 'layer4id', 'layer5id', 'layer6id', 'sai_cgi_ecgi', 'tac', 'enb_ue_s1ap_id', 'm_tmsi', 'other_tac', 'other_eci', 'updura', 'downdura', 'datadura', 'mme_ue_s1ap_id', 'user_ipv4', 'user_ipv6', 'server_ipv4', 'server_ipv6', 'machine_ip_add_type', 'homeprocode', 'homecitycode', 'roamprocode', 'roamcitycode', 'roam_type', 'service_end', 'tcp_states', 'tcp_conn_2_failed_times', 'tcp_conn_3_failed_times', 'tcp_syn_time', 'tcp_syn_time_msel', 'avg_ul_rtt', 'avg_dw_rtt', 'ul_rtt_stat_num', 'dw_rtt_stat_num', 'ul_rtt_long_num', 'dw_rtt_long_num', 'user_probe_dw_lost_pkt', 'user_probe_ul_lost_pkt', 'server_probe_ul_lost_pkt', 'server_probe_dw_lost_pkt', 'tcp_rtt_step2', 'syn_count', 'syn_ack_count', 'first_rst_time', 'first_fin_time', 'first_rst_dir', 'first_fin_dir', 'ul_rst_count', 'dl_rst_count', 'ul_fin_count', 'dl_fin_count', 'ul_packet_tcp_payload', 'dl_packet_tcp_payload', 'tcp_conn_slice_flag', 'probe_id', 'carrier_id', 'sub_app_id', 'probe_prot_id', 'first_sai_cgi_ecgi', 'first_rat', 'last_sai_cgi_ecgi', 'last_rat', 'first_longitude', 'first_latitude', 'first_altitude', 'first_rasterlongitude', 'first_rasterlatitude', 'first_rasteraltitude', 'first_frequencyspot', 'first_clutter', 'first_userbehavior', 'first_speed', 'first_credibility', 'last_longitude', 'last_latitude', 'last_altitude', 'last_rasterlongitude', 'last_rasterlatitude', 'last_rasteraltitude', 'last_frequencyspot', 'last_clutter', 'last_userbehavior', 'last_speed', 'last_credibility', 'sessionkey', 'first_ucellid', 'last_ucellid', 'request_time', 'resource_name', 'streaming_rate', 'play_success', 'initbuffer_duration', 'video_down_octets', 'video_down_time', 'file_type', 'video_frame_rate', 'video_width', 'video_height', 'video_codec_id', 'stall_duration', 'max_buffering_rate', 'max_playing_rate', 'streaming_duratioin', 'record_type', 'longitude', 'latitude', 'altitude', 'coordinate_system', 'pgw_ip', 'pgw_port', 'repetition', 'videoid', 'video_clarity', 'tcp_syn_ack_num', 'tcp_ack_num', 'tcp1_2hnadshake_status', 'tcp2_3hnadshake_status', 'device_brand', 'device_model', 'device_type', 'custom_protocol_id', 'play_duration', 'start_time_us', 'end_time_us', 'p_tmsi', 'bsc_rnc_signal_ip', 'bsc_rnc_user_ip', 'ul_probeid', 'ul_link_index', 'dl_probeid', 'dl_link_index', 'transactionid', 'flow_control', 'user_account', 'refer_xdr_id', 'rule_source'] # xdr数据解析,解析后的数据是一个一个的字典,每个字典是一条单独的记录 def XdrList(file_path): dic_lst = [] # with open(file_path, 'r', encoding='utf-8') as f: i = 0 with open(file_path, 'rb') as f: # 注意点: 原文件数据并不是字节流数据,而是正常的uft-8数据,那这儿为什么要用rb模式读取,因为它快啊,一个几十W行的数据文件,你用r模式读取试试,慢的让你抓狂,所有这儿用rb for line in f: new_line = line.decode().replace('|','',1).replace('|', ',').strip().replace('NULL','') data_line = new_line.startswith('11') if data_line == True: lst_val = new_line.split(',') dic = {k: v.replace(' ', '') for k, v in zip( lst_key,lst_val )} if (len(dic)) < 201: del dic else: dic_lst.append(dic) i += 1 return dic_lst # 以上代码就是处理好的所有字典列表的数据文件了,然后把结果传到下面的批量插入脚本里面执行就OK了

# 重点在这里: 批量插入数据库函数,其中使用的是第三方模块,请看标黄色的地方. def ins_db(rows,file_name): # prod conn = psycopg2.connect(database="qoe", user="postgres", password="cmcc1234", host="172.30.206.7", port="15434") cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor) # cursor = conn.cursor() # 批量入库sql语句 sql = 'insert into gx_videoxdr(interface,xdrid,imsi,imei,msisdn,user_ip,lac,rac,cid,sgsn_c_ip,sgsn_u_ip,ggsn_c_ip,ggsn_u_ip,rat,apn,xdr_type,procedure_id,start_time,end_time,prot_category,prot_type,l4_type,ms_port,server_ip,server_port,mcc,mnc,l4_ul_throughput,l4_dw_throughput,l4_ul_goodput,l4_dw_goodput,tcp_wtp_ul_outofsequ,tcp_wtp_dw_outofsequ,tcp_wtp_ul_retrans,tcp_wtp_dw_retrans,ul_ip_frag_packets,dl_ip_frag_packets,vedio_speed,success_play,first_play_time,stop_count,chargeid,protocoltype,sgw_ggsn_port,enb_sgsn_port,enb_sgsn_gtp_teid,sgw_ggsn_gtp_teid,app_content,app_status,tcp_rtt_step1,tcp_rtt,ms_ack_to_1stget_delay,fristtransreptorspdelay,tcp_win_size,tcp_mss,tcp_conn_times,tcp_conn_states,homemcc,homemnc,roam_direction,sgsn_id,ggsn_id,ran_ne_id,layer1id,layer2id,layer3id,layer4id,layer5id,layer6id,sai_cgi_ecgi,tac,enb_ue_s1ap_id,m_tmsi,other_tac,other_eci,updura,downdura,datadura,mme_ue_s1ap_id,user_ipv4,user_ipv6,server_ipv4,server_ipv6,machine_ip_add_type,homeprocode,homecitycode,roamprocode,roamcitycode,roam_type,service_end,tcp_states,tcp_conn_2_failed_times,tcp_conn_3_failed_times,tcp_syn_time,tcp_syn_time_msel,avg_ul_rtt,avg_dw_rtt,ul_rtt_stat_num,dw_rtt_stat_num,ul_rtt_long_num,dw_rtt_long_num,user_probe_dw_lost_pkt,user_probe_ul_lost_pkt,server_probe_ul_lost_pkt,server_probe_dw_lost_pkt,tcp_rtt_step2,syn_count,syn_ack_count,first_rst_time,first_fin_time,first_rst_dir,first_fin_dir,ul_rst_count,dl_rst_count,ul_fin_count,dl_fin_count,ul_packet_tcp_payload,dl_packet_tcp_payload,tcp_conn_slice_flag,probe_id,carrier_id,sub_app_id,probe_prot_id,first_sai_cgi_ecgi,first_rat,last_sai_cgi_ecgi,last_rat,first_longitude,first_latitude,first_altitude,first_rasterlongitude,first_rasterlatitude,first_rasteraltitude,first_frequencyspot,first_clutter,first_userbehavior,first_speed,first_credibility,last_longitude,last_latitude,last_altitude,last_rasterlongitude,last_rasterlatitude,last_rasteraltitude,last_frequencyspot,last_clutter,last_userbehavior,last_speed,last_credibility,sessionkey,first_ucellid,last_ucellid,request_time,resource_name,streaming_rate,play_success,initbuffer_duration,video_down_octets,video_down_time,file_type,video_frame_rate,video_width,video_height,video_codec_id,stall_duration,max_buffering_rate,max_playing_rate,streaming_duratioin,record_type,longitude,latitude,altitude,coordinate_system,pgw_ip,pgw_port,repetition,videoid,video_clarity,tcp_syn_ack_num,tcp_ack_num,tcp1_2hnadshake_status,tcp2_3hnadshake_status,device_brand,device_model,device_type,custom_protocol_id,play_duration,start_time_us,end_time_us,p_tmsi,bsc_rnc_signal_ip,bsc_rnc_user_ip,ul_probeid,ul_link_index,dl_probeid,dl_link_index,transactionid,flow_control,user_account,refer_xdr_id,rule_source) values %s on conflict (xdrid) do nothing' # psycopg2.extras.execute_batch(cursor, sql, rows, page_size=100) try: print(file_name,'rows: ',len(rows)) psycopg2.extras.execute_values(cursor, sql, rows, template='(%(interface)s,%(xdrid)s,%(imsi)s,%(imei)s,%(msisdn)s,%(user_ip)s,%(lac)s,%(rac)s,%(cid)s,%(sgsn_c_ip)s,%(sgsn_u_ip)s,%(ggsn_c_ip)s,%(ggsn_u_ip)s,%(rat)s,%(apn)s,%(xdr_type)s,%(procedure_id)s,%(start_time)s,%(end_time)s,%(prot_category)s,%(prot_type)s,%(l4_type)s,%(ms_port)s,%(server_ip)s,%(server_port)s,%(mcc)s,%(mnc)s,%(l4_ul_throughput)s,%(l4_dw_throughput)s,%(l4_ul_goodput)s,%(l4_dw_goodput)s,%(tcp_wtp_ul_outofsequ)s,%(tcp_wtp_dw_outofsequ)s,%(tcp_wtp_ul_retrans)s,%(tcp_wtp_dw_retrans)s,%(ul_ip_frag_packets)s,%(dl_ip_frag_packets)s,%(vedio_speed)s,%(success_play)s,%(first_play_time)s,%(stop_count)s,%(chargeid)s,%(protocoltype)s,%(sgw_ggsn_port)s,%(enb_sgsn_port)s,%(enb_sgsn_gtp_teid)s,%(sgw_ggsn_gtp_teid)s,%(app_content)s,%(app_status)s,%(tcp_rtt_step1)s,%(tcp_rtt)s,%(ms_ack_to_1stget_delay)s,%(fristtransreptorspdelay)s,%(tcp_win_size)s,%(tcp_mss)s,%(tcp_conn_times)s,%(tcp_conn_states)s,%(homemcc)s,%(homemnc)s,%(roam_direction)s,%(sgsn_id)s,%(ggsn_id)s,%(ran_ne_id)s,%(layer1id)s,%(layer2id)s,%(layer3id)s,%(layer4id)s,%(layer5id)s,%(layer6id)s,%(sai_cgi_ecgi)s,%(tac)s,%(enb_ue_s1ap_id)s,%(m_tmsi)s,%(other_tac)s,%(other_eci)s,%(updura)s,%(downdura)s,%(datadura)s,%(mme_ue_s1ap_id)s,%(user_ipv4)s,%(user_ipv6)s,%(server_ipv4)s,%(server_ipv6)s,%(machine_ip_add_type)s,%(homeprocode)s,%(homecitycode)s,%(roamprocode)s,%(roamcitycode)s,%(roam_type)s,%(service_end)s,%(tcp_states)s,%(tcp_conn_2_failed_times)s,%(tcp_conn_3_failed_times)s,%(tcp_syn_time)s,%(tcp_syn_time_msel)s,%(avg_ul_rtt)s,%(avg_dw_rtt)s,%(ul_rtt_stat_num)s,%(dw_rtt_stat_num)s,%(ul_rtt_long_num)s,%(dw_rtt_long_num)s,%(user_probe_dw_lost_pkt)s,%(user_probe_ul_lost_pkt)s,%(server_probe_ul_lost_pkt)s,%(server_probe_dw_lost_pkt)s,%(tcp_rtt_step2)s,%(syn_count)s,%(syn_ack_count)s,%(first_rst_time)s,%(first_fin_time)s,%(first_rst_dir)s,%(first_fin_dir)s,%(ul_rst_count)s,%(dl_rst_count)s,%(ul_fin_count)s,%(dl_fin_count)s,%(ul_packet_tcp_payload)s,%(dl_packet_tcp_payload)s,%(tcp_conn_slice_flag)s,%(probe_id)s,%(carrier_id)s,%(sub_app_id)s,%(probe_prot_id)s,%(first_sai_cgi_ecgi)s,%(first_rat)s,%(last_sai_cgi_ecgi)s,%(last_rat)s,%(first_longitude)s,%(first_latitude)s,%(first_altitude)s,%(first_rasterlongitude)s,%(first_rasterlatitude)s,%(first_rasteraltitude)s,%(first_frequencyspot)s,%(first_clutter)s,%(first_userbehavior)s,%(first_speed)s,%(first_credibility)s,%(last_longitude)s,%(last_latitude)s,%(last_altitude)s,%(last_rasterlongitude)s,%(last_rasterlatitude)s,%(last_rasteraltitude)s,%(last_frequencyspot)s,%(last_clutter)s,%(last_userbehavior)s,%(last_speed)s,%(last_credibility)s,%(sessionkey)s,%(first_ucellid)s,%(last_ucellid)s,%(request_time)s,%(resource_name)s,%(streaming_rate)s,%(play_success)s,%(initbuffer_duration)s,%(video_down_octets)s,%(video_down_time)s,%(file_type)s,%(video_frame_rate)s,%(video_width)s,%(video_height)s,%(video_codec_id)s,%(stall_duration)s,%(max_buffering_rate)s,%(max_playing_rate)s,%(streaming_duratioin)s,%(record_type)s,%(longitude)s,%(latitude)s,%(altitude)s,%(coordinate_system)s,%(pgw_ip)s,%(pgw_port)s,%(repetition)s,%(videoid)s,%(video_clarity)s,%(tcp_syn_ack_num)s,%(tcp_ack_num)s,%(tcp1_2hnadshake_status)s,%(tcp2_3hnadshake_status)s,%(device_brand)s,%(device_model)s,%(device_type)s,%(custom_protocol_id)s,%(play_duration)s,%(start_time_us)s,%(end_time_us)s,%(p_tmsi)s,%(bsc_rnc_signal_ip)s,%(bsc_rnc_user_ip)s,%(ul_probeid)s,%(ul_link_index)s,%(dl_probeid)s,%(dl_link_index)s,%(transactionid)s,%(flow_control)s,%(user_account)s,%(refer_xdr_id)s,%(rule_source)s)', page_size=5000) conn.commit() print('数据插入成功。。。OK') return True except Exception as e: print(e) print('插入失败。。。False') return False # 将xdr目录下的所有xdr文件传给xdrlist函数进行处理 for file_path in walkFile(xdrpath): file_name = file_path file_list = XdrList(file_path) ins_db(file_list,file_name)

运行结果:
'''

C:\Users\10907\Downloads\xdr_data\putty-0819 rows: 475738
数据插入成功。。。OK
C:\Users\10907\Downloads\xdr_data\putty-0820 rows: 166692
数据插入成功。。。OK
C:\Users\10907\Downloads\xdr_data\putty-0820-1 rows: 280644
数据插入成功。。。OK

...

'''

 

posted @ 2021-11-12 15:25  urls  阅读(1850)  评论(0编辑  收藏  举报