使用python插入5000w数据到mysql
应用场景,数据库数据达到千万以后,做数据分析执行脚本开始变慢,所以用到clickhouse,先插入5000w数据测试
先安装pip支持mysql.connector,我安装的python3.11
pip3 install mysql-connector-python
再使用pip3 list
查看是否安装成功
以下是mysql创建表语句
CREATE TABLE `test_table` (
`id` int(11) unsigned NOT NULL AUTO_INCREMENT,
`user_id` int(11) DEFAULT NULL,
`recharge_amount` int(11) DEFAULT NULL,
`bonus` decimal(12,2) DEFAULT NULL,
`message` varchar(255) DEFAULT NULL,
`content` varchar(255) DEFAULT NULL,
`remark` varchar(255) DEFAULT NULL,
`order_id` varchar(32) DEFAULT NULL,
`create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
`update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=50000001 DEFAULT CHARSET=utf8;
python代码如下:
import mysql.connector
import random
import string
import datetime
# MySQL 连接信息
config = {
'user': 'root',
'password': 'root',
'host': '127.0.0.1',
'database': 'db1',
}
# 生成模拟数据
def generate_data(batch_size=10000):
for i in range(batch_size):
# 生成模拟数据
yield [random_string(32),random_string(16),random_string(64),getTimeStr(),random.randint(1000000, 9999999)]
#生成随机字符串
def random_string(length):
letters = string.ascii_lowercase + string.digits
return ''.join(random.choice(letters) for i in range(length))
def getTimeStr():
# 获取当前日期和时间
now = datetime.datetime.now()
# 格式化为YmdHis格式
formatted_time = now.strftime("%Y%m%d%H%M%S")
# 将日期和时间转换为整数
timestamp = int(now.timestamp())
return formatted_time+str(timestamp)
def insertData():
connection = mysql.connector.connect(**config)
cursor = connection.cursor()
# 数据总量
total = 50000000
# 每批次插入的数据量
batch_size = 100000
# 循环插入数据
for i in range(0, total, batch_size):
# 准备SQL语句,插入一批数据到表格中
sql = "INSERT INTO test_table (message, content, remark, order_id,user_id,recharge_amount,bonus) VALUES "
values = []
for j in range(batch_size):
values.append((random_string(32),random_string(16),random_string(64),getTimeStr(),random.randint(1000000, 9999999),random.randint(1, 10000),random.uniform(1.0, 100.0)))
sql += ", ".join(["(%s, %s,%s,%s,%s,%s,%s)"] * len(values))
val = [item for sublist in values for item in sublist]
cursor.execute(sql, val)
# 提交事务
connection.commit()
# 输出当前进度信息
print("{:.2f}% 完成".format((i + batch_size) / total * 100))
print(cursor.rowcount, "记录插入成功。")
if __name__ == '__main__':
insertData()