[DJANGO] excel十几万行数据快速导入数据库研究
先贴原来的导入数据代码:
#coding:utf-8
"""Import the first sheet of an Excel workbook into the D072Qf table.

The header row (row 0) is skipped.  A data row is inserted unless it is
blank or its (acct_month, serv_id) pair was already seen in this run or
is already stored in the database.  Rows are written with bulk_create in
batches of BATCH_SIZE, and the last partial batch is written as well.
"""
import os
import random
import time
from datetime import datetime

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "www.settings")

XLS_PATH = '11.xlsx'

# Number of pending model instances that triggers a bulk insert.
BATCH_SIZE = 9999

# Columns that hold integer codes; the script converts float cell values
# in these columns to int before they are stored.
INTEGER_COLUMNS = (0, 1, 2, 4, 28, 30, 32)

# D072Qf field names, in spreadsheet column order.
FIELD_NAMES = (
    'acct_month', 'serv_id', 'acc_nbr', 'user_name', 'acct_code',
    'acct_name', 'product_name', 'current_charge', 'one_charge',
    'two_charge', 'three_charge', 'four_charge', 'five_charge',
    'six_charge', 'seven_charge', 'eight_charge', 'nine_charge',
    'ten_charge', 'eleven_charge', 'twelve_charge', 'oneyear_charge',
    'threeyear_charge', 'upthreeyear_charge', 'all_qf', 'morethree_qf',
    'aging', 'serv_state_name', 'mkt_chnl_name', 'mkt_chnl_id',
    'mkt_region_name', 'mkt_region_id', 'mkt_grid_name', 'mkt_grid_id',
    'prod_addr',
)


def is_blank_row(row):
    """Return True when *row* has no cells or every cell is empty.

    A plain truthiness test on the row is not enough: a list of empty
    strings is still a non-empty list.
    """
    return all(cell is None or cell == '' for cell in row)


def normalize_row(row):
    """Return a copy of *row* with float values in INTEGER_COLUMNS as int.

    Raises IndexError when the row is too short to contain those columns.
    """
    values = list(row)
    for index in INTEGER_COLUMNS:
        if isinstance(values[index], float):
            values[index] = int(values[index])
    return values


def row_to_fields(values):
    """Map the leading cells of *values* onto FIELD_NAMES.

    Extra trailing cells are ignored.  Raises ValueError when there are
    fewer cells than fields, so a truncated row is never stored silently.
    """
    if len(values) < len(FIELD_NAMES):
        raise ValueError(
            'expected at least %d columns, got %d'
            % (len(FIELD_NAMES), len(values)))
    return dict(zip(FIELD_NAMES, values))


def main():
    """Run the import and print progress, counters and timings."""
    # On Django >= 1.7, django.setup() has to run before models are
    # imported; otherwise django.core.exceptions.AppRegistryNotReady
    # ("Models aren't loaded yet.") is raised.
    import django
    if django.VERSION >= (1, 7):
        django.setup()
    import xlrd
    from xlrd import xldate_as_tuple  # unused; kept from the original script
    from arrears.models import D072Qf

    time1 = time.time()
    with xlrd.open_workbook(XLS_PATH) as data:
        print(u"读取文件结束,开始导入!")
        time2 = time.time()
        table = data.sheet_by_index(0)
        time3 = time.time()

        duplicates = imported = blank = 0
        pending = []
        # Keys queued or inserted during this run.  The database query
        # alone cannot see rows that are still waiting in `pending`.
        seen_keys = set()

        for line in range(1, table.nrows):  # row 0 is the header
            row = table.row_values(line)
            if is_blank_row(row):
                blank += 1
                continue

            values = normalize_row(row)
            key = (values[0], values[1])
            if key in seen_keys or D072Qf.objects.filter(
                    acct_month=values[0], serv_id=values[1]).exists():
                duplicates += 1
                continue

            seen_keys.add(key)
            pending.append(D072Qf(**row_to_fields(values)))
            imported += 1

            if len(pending) >= BATCH_SIZE:
                D072Qf.objects.bulk_create(pending)
                pending = []
                # Random pause of 0 <= t < 1.0 s between batches.
                time.sleep(random.random())
                print("导入成功一次!")

        # Write the last partial batch; without this the rows after the
        # final full batch would never reach the database.
        if pending:
            D072Qf.objects.bulk_create(pending)
            print("导入成功一次!")

    print('数据导入成功,导入'+str(imported)+'条,重复'+str(duplicates)+'条,有'+str(blank)+'行为空!')
    time4 = time.time()
    print("读取文件耗时"+str(time2-time1)+"秒,导入数据耗时"+str(time4-time3)+"秒!")


if __name__ == "__main__":
    main()
这段代码目前未能把十几万行数据全部导入数据库:花了1个小时只导入了5万行,之后越来越慢,主要是excel表读到7万行左右后,读取数据变得很慢。总体来说,影响导入速度的原因有几个:
1、一直以来采用xlrd导入xls格式文件,如果文件有十几万行,只是读取文件就会花200秒,若换成csv则几乎不花时间
2、代码中这行语句也会影响速度,特别当数据库中数据很大时:if D072Qf.objects.filter(acct_month = row[0],serv_id=row[1]).exists():#判断该行值是否在数据库中重复
3、若一次性向列表中添加十几万行数据,就windows的cpu而言是承受不住的!所以建议1万条数据导入一次后,清空列表
改善后的代码:
优化部分:采用csv格式;取消掉检查重复数据语句;每5万导入一次数据
#coding:utf-8
"""Bulk-import a semicolon-delimited text export into the D072Qf table.

The first line of 11.csv is skipped as a header.  Each following line is
stripped of double quotes, split on ';' and stored as one D072Qf row;
rows are written with bulk_create in batches.
"""
import os
import random
import time

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "www.settings")

CSV_PATH = '11.csv'

# A batch is written whenever the line counter reaches a multiple of this.
BATCH_SIZE = 50000

# D072Qf field names, in file column order.
FIELD_NAMES = (
    'acct_month', 'serv_id', 'acc_nbr', 'user_name', 'acct_code',
    'acct_name', 'product_name', 'current_charge', 'one_charge',
    'two_charge', 'three_charge', 'four_charge', 'five_charge',
    'six_charge', 'seven_charge', 'eight_charge', 'nine_charge',
    'ten_charge', 'eleven_charge', 'twelve_charge', 'oneyear_charge',
    'threeyear_charge', 'upthreeyear_charge', 'all_qf', 'morethree_qf',
    'aging', 'serv_state_name', 'mkt_chnl_name', 'mkt_chnl_id',
    'mkt_region_name', 'mkt_region_id', 'mkt_grid_name', 'mkt_grid_id',
    'prod_addr',
)


def split_row(line):
    """Split one line of the export into its field values.

    The line terminator is removed first so that the last field does not
    carry a trailing newline; then every double quote is dropped and the
    text is split on ';'.
    """
    return line.rstrip('\r\n').replace('"', '').split(';')


def row_to_fields(values, line_no=None):
    """Map the leading items of *values* onto FIELD_NAMES.

    Extra trailing items are ignored.  Raises ValueError when there are
    fewer items than fields; *line_no*, if given, is included in the
    message.
    """
    if len(values) < len(FIELD_NAMES):
        where = '' if line_no is None else ' on line %d' % line_no
        raise ValueError(
            'expected at least %d columns%s, got %d'
            % (len(FIELD_NAMES), where, len(values)))
    return dict(zip(FIELD_NAMES, values))


def main():
    """Run the import and print progress and timings."""
    # On Django >= 1.7, django.setup() has to run before models are
    # imported; otherwise django.core.exceptions.AppRegistryNotReady
    # ("Models aren't loaded yet.") is raised.
    import django
    if django.VERSION >= (1, 7):
        django.setup()
    from arrears.models import D072Qf

    time1 = time.time()
    # `with` closes the file even if an insert raises part-way through.
    with open(CSV_PATH) as f:
        print(u"读取文件结束,开始导入!")
        time2 = time.time()

        pending = []
        imported = 0
        # Counts the header plus every data row read so far, which is why
        # it starts at 1 and ends one above the number of imported rows.
        n = 1
        next(f, None)  # skip the header; tolerate an empty file

        for line_no, line in enumerate(f, 2):
            if not line.strip():
                # A blank line (typically at the end of the file) has no
                # fields to store.
                continue
            pending.append(D072Qf(**row_to_fields(split_row(line), line_no)))
            imported += 1
            n += 1
            if n % BATCH_SIZE == 0:
                print(n)
                D072Qf.objects.bulk_create(pending)
                pending = []
                time3 = time.time()
                print("读取文件耗时"+str(time2-time1)+"秒,导入数据耗时"+str(time3-time2)+"秒!")

        print(n)
        D072Qf.objects.bulk_create(pending)
        # Take the timestamp after the final insert so the reported
        # duration includes the last batch.
        time3 = time.time()
        print("读取文件耗时"+str(time2-time1)+"秒,导入数据耗时"+str(time3-time2)+"秒!")

    print("成功导入数据"+str(imported)+"条")


if __name__ == "__main__":
    main()
结果让人大吃一惊!!!,只耗时73秒
Python 2.7.10 (default, May 23 2015, 09:40:32) [MSC v.1500 32 bit (Intel)] on win32 Type "copyright", "credits" or "license()" for more information. >>> ================================ RESTART ================================ >>> 读取文件结束,开始导入! 50000 读取文件耗时0.0秒,导入数据耗时34.3279998302秒! 100000 读取文件耗时0.0秒,导入数据耗时67.3599998951秒! 138400 读取文件耗时0.0秒,导入数据耗时73.4379999638秒! 成功导入数据138399条 >>>
study just for life!