python写入数据到excel,写入百万数据&读取数据,并取出重复数据

现实遇到的问题

在测试导入excel数据时,需要自己造数据,如果写入大批量数据不可能自己一个个造,所以写了一个写入数据到excel的demo

下面为写入excel的demo

import os
import random
from faker import Faker

file_path = os.getcwd()
print(file_path)
f = Faker(locale='en_US')


class GenExcelUser(object):

    def __init__(self, num=10, type=1, scenario=0):
        """
        :param num: 生成手机号数量
        :type 1为导入用户,excel中有三个字段:买家账号,买家手机号,用户标记;2为黑名单用户,只有手机号一个字段
        :scenario 只针对导入用户生效,默认为正常数据,即买家账号,手机号,标记均存在
        1:最后一条无手机号;2:最后一条无昵称:3:全无手机号;4:全无昵称;5:昵称手机号均为空
        """
        self.dirname = '\导入测试数据.xlsx'
        self.buername = 'autotest'
        self.num = num
        self.type = type
        self.lable = '50w_mark'  # 标签名
        self.scenario = scenario

    def gen_phone(self):
        a = random.sample(range(0, 10), 8)
        ll = [str(i) for i in a]
        res = ('179' + ''.join(ll))
        return res

    def generate_random_str(self):
        """
        :return: 随机字符串 返回3位字符串
        """
        import string
        # a-z小写列表
        lower_str = list(string.ascii_lowercase)
        # a-z大写列表
        up_str = list(string.ascii_uppercase)
        # 0-9的列表,列表对应的为字符串
        int_str = [str(i) for i in range(10)]

        # 两个列表合并,在小写列表中做拓展
        lower_str.extend(up_str)
        lower_str.extend(int_str)

        # 在列表中随机选取7个元素
        ff = random.sample(lower_str, 7)
        return ''.join(ff)

    def gene_data(self):
        """
        导入用户
        """
        buername = self.buername + '_' + str(self.generate_random_str())
        phone = self.gen_phone()
        lable = self.lable  # 这边定义写死后,后面不管这么取,都为默认值
        return locals()

    # 弃用 该方法,该方法写入十万级别数据没问题,百万级别就会报错
    # def main(self, num=100):
    #     import pandas as pd
    #     from pandas import DataFrame
    #
    #     list_name = [self.gene_data()['buername'] for i in range(num)]
    #     list_phone = [self.gene_data()['phone'] for i in range(num)]
    #     list_lable = [self.gene_data()['lable'] for i in range(num)]
    #     # 写
    #     dic1 = {'账号': list_name,
    #             '手机号': list_phone,
    #             '标记': list_lable
    #             }
    #     df = pd.DataFrame(dic1)
    #     df.to_excel(self.dirname, index=False)
    #     print('写入完成')

    def xw_toExcel(self, num=100):  # xlsxwriter库储存数据到excel
        import xlsxwriter as xw
        fileName = file_path+self.dirname

        workbook = xw.Workbook(fileName)  # 创建工作簿
        worksheet1 = workbook.add_worksheet("sheet1")  # 创建子表
        worksheet1.activate()  # 激活表
        title = ['账号', '手机号', '标记']  # 设置表头
        worksheet1.write_row('A1', title)  # 从A1单元格开始写入表头
        i = 2  # 从第二行开始写入数据
        data = [self.gene_data() for i in range(num)]
        for j in range(len(data)):
            insertData = [data[j]["buername"], data[j]["phone"], data[j]["lable"]]
            row = 'A' + str(i)
            worksheet1.write_row(row, insertData)
            i += 1
        workbook.close()  # 关闭表


if __name__ == '__main__':
    import time
    num = input('请输入需要导入数量: ')
    start_time = time.time()
    cl = GenExcelUser()
    cl.xw_toExcel(num=int(num))
    end_time = time.time()
    print('总计耗时%s' % (end_time - start_time))

由于写入数据是随机的,无法保证数据是否有重复值,所以就写了一个解析excel文件,查看是否有重复数据

代码如下

import pandas as pd
import time

file_path = r'./导入测试数据.xlsx'
file_path1 = r'./测试.xlsx'


def excel_one_line_to_list():
    """
    读取excel文件,读取第一列数据
    """
    df = pd.read_excel(file_path, usecols=[0],
                       names=None)  # 读取项目名称列,不要列名,读取两列则usecols=[0,1],对应列
    df_li = df.values.tolist()
    result = []
    for s_li in df_li:
        result.append(s_li[0])

    print('列表数量为%d' % (len(result)))
    return result


def compar_data(data: list):
    """
    判断是否有重复数据
    """
    num = 1
    dic1 = {}
    train_data = [i for i in data]
    for i in train_data:
        if i not in dic1.keys():
            dic1[i] = num
        else:
            dic1[i] += 1
    ll = []
    for k, v in dic1.items():
        if v == 2 or v > 2:
            ll.append(k)
    return ll


if __name__ == '__main__':
    start_time = time.time()
    train_data = excel_one_line_to_list()
    ss = compar_data(train_data)
    print('重复数量为%s,重复值列表为%s' % (str(len(ss)),str(ss)))
    end_time = time.time()
    print('总计耗时%s' % (end_time - start_time))
posted @ 2021-08-24 19:38  happy-winds  阅读(1306)  评论(0编辑  收藏  举报